From 421b9277b0af3ea2583323e2ceac9e840442a7b0 Mon Sep 17 00:00:00 2001 From: Jared Roesch Date: Wed, 20 Feb 2019 14:20:00 -0800 Subject: [PATCH 01/93] Fix fusion bug when call symbol that is not an operator. (#2630) --- src/relay/pass/fuse_ops.cc | 13 ++++++++- tests/python/relay/test_pass_fuse_ops.py | 37 ++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 3227a70f3e7c..99a5421e2ff9 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -208,11 +208,22 @@ class IndexedForwardGraph::Creator : private ExprVisitor { Node* node = graph_.node_map.at(call); static auto fpattern = Op::GetAttr("TOpPattern"); - // setup pattern. + // Now we set the pattern of this call. + // + // If we see a call mentioning an operator we should mark it with its + // annotated pattern. + // + // If the pattern is not annotated we will default to opaque. + // + // Finally if the operator position is not a call node we will + // need to call Update, as it may be an arbitrary expression. OpPatternKind op_pattern = kOpaque; if (const OpNode* opnode = call->op.as()) { op_pattern = static_cast(fpattern[GetRef(opnode)]); + } else { + this->Update(call->op, node, kOpaque); } + node->pattern = op_pattern; const auto* rtype = call->checked_type().as(); // pass the message back to all the children it references. 
diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py index 1d926a325b1a..634d69bae823 100644 --- a/tests/python/relay/test_pass_fuse_ops.py +++ b/tests/python/relay/test_pass_fuse_ops.py @@ -251,6 +251,42 @@ def expected(dshape): assert relay.ir_pass.alpha_equal(z, after) +def test_fuse_myia_regression(): + def before(dshape, dtype): + x = relay.var('x', shape=dshape, dtype=dtype) + y = relay.var('y', shape=dshape, dtype=dtype) + sb = relay.ScopeBuilder() + with sb.if_scope(relay.op.greater(x, y)): + sb.ret(relay.Function([], x)) + with sb.else_scope(): + sb.ret(relay.Function([], y)) + return relay.Function([x, y], + relay.Call(sb.get(), [])) + + def expected(dshape, dtype): + x = relay.var('x', shape=dshape, dtype=dtype) + y = relay.var('y', shape=dshape, dtype=dtype) + sb = relay.ScopeBuilder() + p1 = relay.var('p1', shape=dshape, dtype=dtype) + p2 = relay.var('p2', shape=dshape, dtype=dtype) + fused_gt = relay.Function([p1, p2], + relay.op.greater(p1, p2)) + with sb.if_scope(fused_gt(x, y)): + sb.ret(relay.Function([], x)) + with sb.else_scope(): + sb.ret(relay.Function([], y)) + return relay.Function([x, y], + relay.Call(sb.get(), [])) + + dshape = () + dtype = 'int64' + f = before(dshape, dtype) + f = relay.ir_pass.infer_type(f) + f = relay.ir_pass.fuse_ops(f) + after = relay.ir_pass.infer_type(expected(dshape, dtype)) + assert relay.ir_pass.alpha_equal(f, after) + + if __name__ == "__main__": test_fuse_simple() test_conv2d_fuse() @@ -258,3 +294,4 @@ def expected(dshape): test_tuple_root() test_tuple_strided_slice() test_stop_fusion() + test_fuse_myia_regression() From f409b691891e163036eaa6bd72063d14c5423ba6 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 21 Feb 2019 08:57:13 -0800 Subject: [PATCH 02/93] [RUNTIME][NDArray] Allowing External Libraries to Subclass NDArrays (#2613) --- apps/extension/Makefile | 2 +- apps/extension/python/tvm_ext/__init__.py | 29 ++++++- apps/extension/src/tvm_ext.cc | 85 
+++++++++++++++++++- apps/extension/tests/test_ext.py | 16 ++++ include/tvm/runtime/ndarray.h | 29 ++++++- include/tvm/runtime/packed_func.h | 48 ++++++++--- include/tvm/runtime/registry.h | 2 +- nnvm/include/nnvm/compiler/packed_func_ext.h | 6 +- nnvm/src/compiler/packed_func_ext.cc | 4 +- python/tvm/_ffi/_ctypes/function.py | 6 +- python/tvm/_ffi/_ctypes/ndarray.py | 19 ++++- python/tvm/_ffi/_cython/base.pxi | 10 ++- python/tvm/_ffi/_cython/function.pxi | 4 +- python/tvm/_ffi/_cython/ndarray.pxi | 22 +++-- python/tvm/_ffi/ndarray.py | 39 ++++++--- python/tvm/_ffi/runtime_ctypes.py | 9 +++ python/tvm/ndarray.py | 2 +- tests/cpp/packed_func_test.cc | 2 +- 18 files changed, 280 insertions(+), 54 deletions(-) diff --git a/apps/extension/Makefile b/apps/extension/Makefile index 3a1f8a2160ee..41e9bf621cb6 100644 --- a/apps/extension/Makefile +++ b/apps/extension/Makefile @@ -6,7 +6,7 @@ PKG_CFLAGS = -std=c++11 -O2 -fPIC\ -I${TVM_ROOT}/3rdparty/dlpack/include\ -I${TVM_ROOT}/3rdparty/HalideIR/src -PKG_LDFLAGS =-L${TVM_ROOT}/lib +PKG_LDFLAGS =-L${TVM_ROOT}/build UNAME_S := $(shell uname -s) ifeq ($(UNAME_S), Darwin) diff --git a/apps/extension/python/tvm_ext/__init__.py b/apps/extension/python/tvm_ext/__init__.py index 25286f67b4f5..78b407ae9aa1 100644 --- a/apps/extension/python/tvm_ext/__init__.py +++ b/apps/extension/python/tvm_ext/__init__.py @@ -31,7 +31,7 @@ def __init__(self, handle): def __del__(self): # You can also call your own customized # deleter if you can free it via your own FFI. - tvm.nd.free_extension_handle(self.handle, 17) + tvm.nd.free_extension_handle(self.handle, self.__class__._tvm_tcode) @property def _tvm_handle(self): @@ -42,3 +42,30 @@ def __getitem__(self, idx): # Register IntVec extension on python side. 
tvm.register_extension(IntVec, IntVec) + + +nd_create = tvm.get_global_func("tvm_ext.nd_create") +nd_add_two = tvm.get_global_func("tvm_ext.nd_add_two") +nd_get_addtional_info = tvm.get_global_func("tvm_ext.nd_get_addtional_info") + +class NDSubClass(tvm.nd.NDArrayBase): + """Example for subclassing TVM's NDArray infrastructure. + + By inheriting TMV's NDArray, external libraries could + leverage TVM's FFI without any modification. + """ + # Should be consistent with the type-trait set in the backend + _array_type_code = 1 + + @staticmethod + def create(addtional_info): + return nd_create(addtional_info) + + @property + def addtional_info(self): + return nd_get_addtional_info(self) + + def __add__(self, other): + return nd_add_two(self, other) + +tvm.register_extension(NDSubClass, NDSubClass) diff --git a/apps/extension/src/tvm_ext.cc b/apps/extension/src/tvm_ext.cc index 362ac62dea3d..97e0ada25a2e 100644 --- a/apps/extension/src/tvm_ext.cc +++ b/apps/extension/src/tvm_ext.cc @@ -7,24 +7,87 @@ #include #include #include +#include #include +#include namespace tvm_ext { using IntVector = std::vector; +class NDSubClass; } // namespace tvm_ext namespace tvm { namespace runtime { template<> -struct extension_class_info { +struct extension_type_info { static const int code = 17; }; +template<> +struct array_type_info { + static const int code = 1; +}; } // namespace tvm } // namespace runtime using namespace tvm; using namespace tvm::runtime; +namespace tvm_ext { +/*! + * \brief A subclass of TVM's NDArray. + * + * To use this extension, an external library should + * + * 1) Inherit TVM's NDArray and NDArray container, + * and define the trait `array_type_info` for this class. + * + * 2) Define a constructor in the inherited class that accepts + * a pointer to TVM's Container, which is nullable. 
+ * + * 3) On Python frontend, inherit `tvm.nd.NDArrayBase`, + * define the class attribute `_array_type_code` consistent to + * the C++ type trait, and register the subclass using `tvm.register_extension`. + */ +class NDSubClass : public tvm::runtime::NDArray { + public: + class SubContainer : public NDArray::Container { + public: + SubContainer(int addtional_info) : + addtional_info_(addtional_info) { + array_type_code_ = array_type_info::code; + } + static bool Is(NDArray::Container *container) { + SubContainer *c = static_cast(container); + return c->array_type_code_ == array_type_info::code; + } + int addtional_info_{0}; + }; + NDSubClass(NDArray::Container *container) { + if (container == nullptr) { + data_ = nullptr; + return; + } + CHECK(SubContainer::Is(container)); + container->IncRef(); + data_ = container; + } + ~NDSubClass() { + this->reset(); + } + NDSubClass AddWith(const NDSubClass &other) const { + SubContainer *a = static_cast(data_); + SubContainer *b = static_cast(other.data_); + CHECK(a != nullptr && b != nullptr); + return NDSubClass(new SubContainer(a->addtional_info_ + b->addtional_info_)); + } + int get_additional_info() const { + SubContainer *self = static_cast(data_); + CHECK(self != nullptr); + return self->addtional_info_; + } +}; +} // namespace tvm_ext + namespace tvm_ext { TVM_REGISTER_EXT_TYPE(IntVector); @@ -64,6 +127,26 @@ TVM_REGISTER_GLOBAL("device_api.ext_dev") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = (*tvm::runtime::Registry::Get("device_api.cpu"))(); }); + +TVM_REGISTER_GLOBAL("tvm_ext.nd_create") +.set_body([](TVMArgs args, TVMRetValue *rv) { + int addtional_info = args[0]; + *rv = NDSubClass(new NDSubClass::SubContainer(addtional_info)); +}); + +TVM_REGISTER_GLOBAL("tvm_ext.nd_add_two") +.set_body([](TVMArgs args, TVMRetValue *rv) { + NDSubClass a = args[0]; + NDSubClass b = args[1]; + *rv = a.AddWith(b); +}); + +TVM_REGISTER_GLOBAL("tvm_ext.nd_get_addtional_info") +.set_body([](TVMArgs args, TVMRetValue *rv) { 
+ NDSubClass a = args[0]; + *rv = a.get_additional_info(); +}); + } // namespace tvm_ext // External function exposed to runtime. diff --git a/apps/extension/tests/test_ext.py b/apps/extension/tests/test_ext.py index def30803135e..a6246d6be2e1 100644 --- a/apps/extension/tests/test_ext.py +++ b/apps/extension/tests/test_ext.py @@ -32,6 +32,7 @@ def test_sym_add(): c = tvm_ext.sym_add(a, b) assert c.a == a and c.b == b + def test_ext_vec(): ivec = tvm_ext.ivec_create(1, 2, 3) assert(isinstance(ivec, tvm_ext.IntVec)) @@ -44,6 +45,7 @@ def ivec_cb(v2): tvm.convert(ivec_cb)(ivec) + def test_extract_ext(): fdict = tvm.extract_ext_funcs(tvm_ext._LIB.TVMExtDeclare) assert fdict["mul"](3, 4) == 12 @@ -68,7 +70,21 @@ def check_llvm(): check_llvm() +def test_nd_subclass(): + a = tvm_ext.NDSubClass.create(addtional_info=3) + b = tvm_ext.NDSubClass.create(addtional_info=5) + c = a + b + d = a + a + e = b + b + assert(a.addtional_info == 3) + assert(b.addtional_info == 5) + assert(c.addtional_info == 8) + assert(d.addtional_info == 6) + assert(e.addtional_info == 10) + + if __name__ == "__main__": + test_nd_subclass() test_extern_call() test_ext_dev() test_ext_vec() diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index e2a447e4235c..2b9674301607 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -178,10 +178,30 @@ class NDArray { Container* data_{nullptr}; // enable internal functions friend struct Internal; + friend class TVMPODValue_; + friend class TVMArgValue; friend class TVMRetValue; friend class TVMArgsSetter; }; +/*! + * \brief The type trait indicates subclass of TVM's NDArray. + * For irrelavant classes, code = -1. + * For TVM NDArray itself, code = 0. + * All subclasses of NDArray should override code > 0. + */ +template +struct array_type_info { + /*! \brief the value of the traits */ + static const int code = -1; +}; + +// Overrides the type trait for tvm's NDArray. 
+template<> +struct array_type_info { + static const int code = 0; +}; + /*! * \brief Save a DLTensor to stream * \param strm The outpu stream @@ -196,7 +216,7 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor); * the pointer to the NDArrayContainer can be directly * interpreted as a DLTensor* * - * \note: do not use this function directly, use NDArray. + * \note do not use this function directly, use NDArray. */ class NDArray::Container { public: @@ -228,16 +248,19 @@ class NDArray::Container { protected: friend class NDArray; + friend class TVMPODValue_; + friend class TVMArgValue; + friend class TVMRetValue; friend class RPCWrappedFunc; /*! * \brief Type flag used to indicate subclass. * Default value 0 means normal NDArray::Conatainer. * * We can extend a more specialized NDArray::Container - * and use the array_type_index_ to indicate + * and use the array_type_code_ to indicate * the specific array subclass. */ - uint32_t array_type_index_{0}; + int32_t array_type_code_{0}; /*! \brief The internal reference counter */ std::atomic ref_counter_{0}; /*! 
diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index a3b4a1696bf0..1398da0d748b 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -362,7 +362,7 @@ inline std::string TVMType2String(TVMType t); * \tparam T the typename */ template -struct extension_class_info { +struct extension_type_info { static const int code = 0; }; @@ -455,6 +455,15 @@ class TVMPODValue_ { TVM_CHECK_TYPE_CODE(type_code_, kTVMContext); return value_.v_ctx; } + template::value>::type> + TNDArray AsNDArray() const { + if (type_code_ == kNull) return TNDArray(nullptr); + auto *container = static_cast(value_.v_handle); + CHECK_EQ(container->array_type_code_, array_type_info::code); + return TNDArray(container); + } template const TExtension& AsExtension() const { CHECK_LT(type_code_, kExtEnd); @@ -561,7 +570,7 @@ class TVMArgValue : public TVMPODValue_ { inline TNodeRef AsNodeRef() const; template::value>::type> + std::is_class::value>::type> inline operator T() const; template::code != 0>::type> + extension_type_info::code != 0>::type> TVMRetValue& operator=(const T& other) { this->SwitchToClass( - extension_class_info::code, other); + extension_type_info::code, other); return *this; } /*! @@ -1094,7 +1103,7 @@ class TVMArgsSetter { // extension template::code != 0>::type> + extension_type_info::code != 0>::type> inline void operator()(size_t i, const T& value) const; // NodeRef related extenstions: in tvm/packed_func_ext.h inline void operator()(size_t i, const NodeRef& other) const; // NOLINT(*) @@ -1212,40 +1221,53 @@ inline R TypedPackedFunc::operator()(Args... 
args) const { // extension and node type handling namespace detail { -template +template struct TVMValueCast { static T Apply(const TSrc* self) { + static_assert(!is_ext && !is_nd, "The default case accepts only non-extensions"); return self->template AsNodeRef(); } }; template -struct TVMValueCast { +struct TVMValueCast { static T Apply(const TSrc* self) { return self->template AsExtension(); } }; + +template +struct TVMValueCast { + static T Apply(const TSrc* self) { + return self->template AsNDArray(); + } +}; + } // namespace detail template inline TVMArgValue::operator T() const { return detail:: - TVMValueCast::code != 0> + TVMValueCast::code != 0), + (array_type_info::code > 0)> ::Apply(this); } template inline TVMRetValue::operator T() const { return detail:: - TVMValueCast::code != 0> + TVMValueCast::code != 0), + (array_type_info::code > 0)> ::Apply(this); } template inline void TVMArgsSetter::operator()(size_t i, const T& value) const { - static_assert(extension_class_info::code != 0, + static_assert(extension_type_info::code != 0, "Need to have extesion code"); - type_codes_[i] = extension_class_info::code; + type_codes_[i] = extension_type_info::code; values_[i].v_handle = const_cast(&value); } @@ -1262,9 +1284,9 @@ struct ExtTypeInfo { template inline ExtTypeVTable* ExtTypeVTable::Register_() { - const int code = extension_class_info::code; + const int code = extension_type_info::code; static_assert(code != 0, - "require extension_class_info traits to be declared with non-zero code"); + "require extension_type_info traits to be declared with non-zero code"); ExtTypeVTable vt; vt.clone = ExtTypeInfo::clone; vt.destroy = ExtTypeInfo::destroy; diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h index 9466056a1282..a53a76f4df2e 100644 --- a/include/tvm/runtime/registry.h +++ b/include/tvm/runtime/registry.h @@ -133,7 +133,7 @@ class Registry { /*! * \brief Macro to register extension type. 
* This must be registered in a cc file - * after the trait extension_class_info is defined. + * after the trait extension_type_info is defined. */ #define TVM_REGISTER_EXT_TYPE(T) \ TVM_STR_CONCAT(TVM_TYPE_REG_VAR_DEF, __COUNTER__) = \ diff --git a/nnvm/include/nnvm/compiler/packed_func_ext.h b/nnvm/include/nnvm/compiler/packed_func_ext.h index e289fd4efa59..a79574fa0879 100644 --- a/nnvm/include/nnvm/compiler/packed_func_ext.h +++ b/nnvm/include/nnvm/compiler/packed_func_ext.h @@ -40,17 +40,17 @@ namespace tvm { namespace runtime { template<> -struct extension_class_info { +struct extension_type_info { static const int code = 16; }; template<> -struct extension_class_info { +struct extension_type_info { static const int code = 17; }; template<> -struct extension_class_info { +struct extension_type_info { static const int code = 18; }; diff --git a/nnvm/src/compiler/packed_func_ext.cc b/nnvm/src/compiler/packed_func_ext.cc index 1a19feabfe8a..8530a5556b64 100644 --- a/nnvm/src/compiler/packed_func_ext.cc +++ b/nnvm/src/compiler/packed_func_ext.cc @@ -76,8 +76,8 @@ TVM_REGISTER_GLOBAL("nnvm.compiler._register_alter_op_layout") if (ret.type_code() == TVMTypeCode::kNull) { return false; } - CHECK_EQ(ret.type_code(), tvm::runtime::extension_class_info::code) - << " expected " << "Symbol (code = " << tvm::runtime::extension_class_info::code + CHECK_EQ(ret.type_code(), tvm::runtime::extension_type_info::code) + << " expected " << "Symbol (code = " << tvm::runtime::extension_type_info::code << ") but get code = " << ret.type_code(); *ret_symbol = *(static_cast(ret.value().v_handle)); return true; diff --git a/python/tvm/_ffi/_ctypes/function.py b/python/tvm/_ffi/_ctypes/function.py index 3c2a7a5f8c9b..5c176f819105 100644 --- a/python/tvm/_ffi/_ctypes/function.py +++ b/python/tvm/_ffi/_ctypes/function.py @@ -223,13 +223,13 @@ def _handle_return_func(x): _node.__init_by_constructor__ = __init_handle_by_constructor__ RETURN_SWITCH[TypeCode.FUNC_HANDLE] = _handle_return_func 
RETURN_SWITCH[TypeCode.MODULE_HANDLE] = _return_module -RETURN_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False) +RETURN_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False, True) C_TO_PY_ARG_SWITCH[TypeCode.FUNC_HANDLE] = _wrap_arg_func( _handle_return_func, TypeCode.FUNC_HANDLE) C_TO_PY_ARG_SWITCH[TypeCode.MODULE_HANDLE] = _wrap_arg_func( _return_module, TypeCode.MODULE_HANDLE) -C_TO_PY_ARG_SWITCH[TypeCode.ARRAY_HANDLE] = lambda x: _make_array(x.v_handle, True) -C_TO_PY_ARG_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False) +C_TO_PY_ARG_SWITCH[TypeCode.ARRAY_HANDLE] = lambda x: _make_array(x.v_handle, True, False) +C_TO_PY_ARG_SWITCH[TypeCode.NDARRAY_CONTAINER] = lambda x: _make_array(x.v_handle, False, True) _CLASS_MODULE = None _CLASS_FUNCTION = None diff --git a/python/tvm/_ffi/_ctypes/ndarray.py b/python/tvm/_ffi/_ctypes/ndarray.py index 8b88e7dc98ea..37a18cbe4051 100644 --- a/python/tvm/_ffi/_ctypes/ndarray.py +++ b/python/tvm/_ffi/_ctypes/ndarray.py @@ -4,7 +4,7 @@ import ctypes from ..base import _LIB, check_call, c_str -from ..runtime_ctypes import TVMArrayHandle +from ..runtime_ctypes import TVMArrayHandle, TVMNDArrayContainerHandle from .types import RETURN_SWITCH, C_TO_PY_ARG_SWITCH, _wrap_arg_func, _return_handle @@ -28,7 +28,7 @@ def _from_dlpack(dltensor): check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle))) ctypes.pythonapi.PyCapsule_SetName(dltensor, _c_str_used_dltensor) ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) - return _make_array(handle, False) + return _make_array(handle, False, False) raise ValueError("Expect a dltensor field, PyCapsule can only be consumed once") @@ -77,9 +77,15 @@ def to_dlpack(self): return ctypes.pythonapi.PyCapsule_New(handle, _c_str_dltensor, _c_dlpack_deleter) -def _make_array(handle, is_view): +def _make_array(handle, is_view, is_container): + global _TVM_ND_CLS handle = ctypes.cast(handle, 
TVMArrayHandle) - return _CLASS_NDARRAY(handle, is_view) + fcreate = _CLASS_NDARRAY + if is_container and _TVM_ND_CLS: + array_type_info = ctypes.cast(handle, TVMNDArrayContainerHandle).array_type_info.value + if array_type_info > 0: + fcreate = _TVM_ND_CLS[array_type_info] + return fcreate(handle, is_view) _TVM_COMPATS = () @@ -91,6 +97,11 @@ def _reg_extension(cls, fcreate): RETURN_SWITCH[cls._tvm_tcode] = fret C_TO_PY_ARG_SWITCH[cls._tvm_tcode] = _wrap_arg_func(fret, cls._tvm_tcode) +_TVM_ND_CLS = {} + +def _reg_ndarray(cls, fcreate): + global _TVM_ND_CLS + _TVM_ND_CLS[cls._array_type_code] = fcreate _CLASS_NDARRAY = None diff --git a/python/tvm/_ffi/_cython/base.pxi b/python/tvm/_ffi/_cython/base.pxi index ac5532835c47..feb2fffebd23 100644 --- a/python/tvm/_ffi/_cython/base.pxi +++ b/python/tvm/_ffi/_cython/base.pxi @@ -2,7 +2,7 @@ from ..base import TVMError from libcpp.vector cimport vector from cpython.version cimport PY_MAJOR_VERSION from cpython cimport pycapsule -from libc.stdint cimport int64_t, uint64_t, uint8_t, uint16_t +from libc.stdint cimport int32_t, int64_t, uint64_t, uint8_t, uint16_t import ctypes cdef enum TVMTypeCode: @@ -61,6 +61,14 @@ ctypedef void* TVMRetValueHandle ctypedef void* TVMFunctionHandle ctypedef void* NodeHandle +ctypedef struct TVMNDArrayContainer: + DLTensor dl_tensor + void* manager_ctx + void (*deleter)(DLManagedTensor* self) + int32_t array_type_info + +ctypedef TVMNDArrayContainer* TVMNDArrayContainerHandle + ctypedef int (*TVMPackedCFunc)( TVMValue* args, int* type_codes, diff --git a/python/tvm/_ffi/_cython/function.pxi b/python/tvm/_ffi/_cython/function.pxi index dcbf4c665e66..9995aea6357a 100644 --- a/python/tvm/_ffi/_cython/function.pxi +++ b/python/tvm/_ffi/_cython/function.pxi @@ -33,7 +33,7 @@ cdef int tvm_callback(TVMValue* args, if tcode != kArrayHandle: pyargs.append(make_ret(value, tcode)) else: - pyargs.append(c_make_array(value.v_handle, True)) + pyargs.append(c_make_array(value.v_handle, True, False)) try: 
rv = local_pyfunc(*pyargs) except Exception: @@ -175,7 +175,7 @@ cdef inline object make_ret(TVMValue value, int tcode): elif tcode == kFloat: return value.v_float64 elif tcode == kNDArrayContainer: - return c_make_array(value.v_handle, False) + return c_make_array(value.v_handle, False, True) elif tcode == kStr: return py_str(value.v_str) elif tcode == kBytes: diff --git a/python/tvm/_ffi/_cython/ndarray.pxi b/python/tvm/_ffi/_cython/ndarray.pxi index 0a507affec1c..4cd6709a0118 100644 --- a/python/tvm/_ffi/_cython/ndarray.pxi +++ b/python/tvm/_ffi/_cython/ndarray.pxi @@ -20,7 +20,7 @@ def _from_dlpack(object dltensor): # set name and destructor to be empty pycapsule.PyCapsule_SetDestructor(dltensor, NULL) pycapsule.PyCapsule_SetName(dltensor, _c_str_used_dltensor) - return c_make_array(chandle, 0) + return c_make_array(chandle, False, False) raise ValueError("Expect a dltensor field, pycapsule.PyCapsule can only be consumed once") @@ -73,8 +73,15 @@ cdef class NDArrayBase: return pycapsule.PyCapsule_New(dltensor, _c_str_dltensor, _c_dlpack_deleter) -cdef c_make_array(void* chandle, is_view): - ret = _CLASS_NDARRAY(None, is_view) +cdef c_make_array(void* chandle, is_view, is_container): + global _TVM_ND_CLS + cdef int32_t array_type_info + fcreate = _CLASS_NDARRAY + if is_container and len(_TVM_ND_CLS) > 0: + array_type_info = (chandle).array_type_info + if array_type_info > 0: + fcreate = _TVM_ND_CLS[array_type_info] + ret = fcreate(None, is_view) (ret).chandle = chandle return ret @@ -89,11 +96,16 @@ def _reg_extension(cls, fcreate): if fcreate: _TVM_EXT_RET[cls._tvm_tcode] = fcreate +cdef _TVM_ND_CLS = {} -def _make_array(handle, is_view): +def _reg_ndarray(cls, fcreate): + global _TVM_ND_CLS + _TVM_ND_CLS[cls._array_type_code] = fcreate + +def _make_array(handle, is_view, is_container): cdef unsigned long long ptr ptr = ctypes.cast(handle, ctypes.c_void_p).value - return c_make_array(ptr, is_view) + return c_make_array(ptr, is_view, is_container) cdef object 
_CLASS_NDARRAY = None diff --git a/python/tvm/_ffi/ndarray.py b/python/tvm/_ffi/ndarray.py index e49c3b62f473..3c5b170bdca7 100644 --- a/python/tvm/_ffi/ndarray.py +++ b/python/tvm/_ffi/ndarray.py @@ -17,15 +17,18 @@ if _FFI_MODE == "ctypes": raise ImportError() if sys.version_info >= (3, 0): - from ._cy3.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack + from ._cy3.core import _set_class_ndarray, _make_array, _from_dlpack from ._cy3.core import NDArrayBase as _NDArrayBase + from ._cy3.core import _reg_extension, _reg_ndarray else: - from ._cy2.core import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack + from ._cy2.core import _set_class_ndarray, _make_array, _from_dlpack from ._cy2.core import NDArrayBase as _NDArrayBase + from ._cy2.core import _reg_extension, _reg_ndarray except IMPORT_EXCEPT: # pylint: disable=wrong-import-position - from ._ctypes.ndarray import _set_class_ndarray, _reg_extension, _make_array, _from_dlpack + from ._ctypes.ndarray import _set_class_ndarray, _make_array, _from_dlpack from ._ctypes.ndarray import NDArrayBase as _NDArrayBase + from ._ctypes.ndarray import _reg_extension, _reg_ndarray def context(dev_type, dev_id=0): @@ -111,7 +114,7 @@ def empty(shape, dtype="float32", ctx=context(1, 0)): ctx.device_type, ctx.device_id, ctypes.byref(handle))) - return _make_array(handle, False) + return _make_array(handle, False, False) def from_dlpack(dltensor): @@ -295,6 +298,7 @@ def free_extension_handle(handle, type_code): """ check_call(_LIB.TVMExtTypeFree(handle, ctypes.c_int(type_code))) + def register_extension(cls, fcreate=None): """Register a extension class to TVM. @@ -306,21 +310,26 @@ def register_extension(cls, fcreate=None): cls : class The class object to be registered as extension. + fcreate : function, optional + The creation function to create a class object given handle value. + Note ---- - The registered class is requires one property: _tvm_handle and a class attribute _tvm_tcode. 
+ The registered class is requires one property: _tvm_handle. + + If the registered class is a subclass of NDArray, + it is required to have a class attribute _array_type_code. + Otherwise, it is required to have a class attribute _tvm_tcode. - ```_tvm_handle``` returns integer represents the address of the handle. - - ```_tvm_tcode``` gives integer represents type code of the class. + - ```_tvm_tcode``` or ```_array_type_code``` gives integer represents type + code of the class. Returns ------- cls : class The class being registered. - fcreate : function, optional - The creation function to create a class object given handle value. - Example ------- The following code registers user defined class @@ -339,7 +348,13 @@ def __init__(self): def _tvm_handle(self): return self.handle.value """ - if fcreate and cls._tvm_tcode < TypeCode.EXT_BEGIN: - raise ValueError("Cannot register create when extension tcode is same as buildin") - _reg_extension(cls, fcreate) + if issubclass(cls, _NDArrayBase): + assert fcreate is not None + assert hasattr(cls, "_array_type_code") + _reg_ndarray(cls, fcreate) + else: + assert hasattr(cls, "_tvm_tcode") + if fcreate and cls._tvm_tcode < TypeCode.EXT_BEGIN: + raise ValueError("Cannot register create when extension tcode is same as buildin") + _reg_extension(cls, fcreate) return cls diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index ef5316b5e267..e1b78735a97d 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -240,3 +240,12 @@ class TVMArray(ctypes.Structure): ("byte_offset", ctypes.c_uint64)] TVMArrayHandle = ctypes.POINTER(TVMArray) + +class TVMNDArrayContainer(ctypes.Structure): + """TVM NDArray::Container""" + _fields_ = [("dl_tensor", TVMArray), + ("manager_ctx", ctypes.c_void_p), + ("deleter", ctypes.c_void_p), + ("array_type_info", ctypes.c_int32)] + +TVMNDArrayContainerHandle = ctypes.POINTER(TVMNDArrayContainer) diff --git a/python/tvm/ndarray.py 
b/python/tvm/ndarray.py index b35c3de63918..e2750369b6af 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -16,7 +16,7 @@ class NDArray(NDArrayBase): """Lightweight NDArray class of TVM runtime. - Strictly this is only an Array Container(a buffer object) + Strictly this is only an Array Container (a buffer object) No arthimetic operations are defined. All operations are performed by TVM functions. diff --git a/tests/cpp/packed_func_test.cc b/tests/cpp/packed_func_test.cc index abe26fabe9ea..83c0ba602927 100644 --- a/tests/cpp/packed_func_test.cc +++ b/tests/cpp/packed_func_test.cc @@ -168,7 +168,7 @@ namespace tvm { namespace runtime { template<> -struct extension_class_info { +struct extension_type_info { static const int code = kExtBegin + 1; }; } // runtime From a1c2c4398a6ae98222164ef05f0f7382b77100a4 Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Thu, 21 Feb 2019 18:43:07 +0000 Subject: [PATCH 03/93] Fix pylint 2.2.2 gripes. (#2642) --- nnvm/python/nnvm/_base.py | 2 +- nnvm/python/nnvm/attribute.py | 3 +- nnvm/python/nnvm/compiler/compile_engine.py | 2 - nnvm/python/nnvm/frontend/caffe2.py | 12 ++- nnvm/python/nnvm/frontend/coreml.py | 37 ++++----- nnvm/python/nnvm/frontend/darknet.py | 2 - nnvm/python/nnvm/frontend/keras.py | 78 +++++++++--------- nnvm/python/nnvm/frontend/mxnet.py | 2 +- .../python/nnvm/frontend/onnx_caffe2_utils.py | 3 +- nnvm/python/nnvm/frontend/tensorflow.py | 8 +- .../nnvm/frontend/util/tensorflow_parser.py | 2 + nnvm/python/nnvm/symbol.py | 39 ++++----- nnvm/python/nnvm/testing/inception_v3.py | 5 +- nnvm/python/nnvm/testing/yolo_detection.py | 2 - nnvm/python/nnvm/top/attr_dict.py | 9 +-- python/tvm/_ffi/base.py | 1 - python/tvm/_ffi/function.py | 11 ++- python/tvm/_ffi/node_generic.py | 18 ++--- python/tvm/arith.py | 4 +- python/tvm/autotvm/measure/executor.py | 5 +- python/tvm/autotvm/record.py | 12 +-- python/tvm/autotvm/task/space.py | 25 +++--- python/tvm/autotvm/task/task.py | 24 +++--- 
python/tvm/autotvm/tuner/tuner.py | 2 +- python/tvm/container.py | 2 +- python/tvm/contrib/nvcc.py | 4 +- python/tvm/contrib/verilog.py | 3 +- python/tvm/hybrid/parser.py | 20 ++--- python/tvm/hybrid/util.py | 2 +- python/tvm/intrin.py | 2 +- python/tvm/make.py | 2 +- python/tvm/ndarray.py | 1 - python/tvm/relay/_parser.py | 22 +++-- python/tvm/relay/adt.py | 2 +- python/tvm/relay/backend/compile_engine.py | 2 - python/tvm/relay/backend/interpreter.py | 1 - python/tvm/relay/build_module.py | 5 +- python/tvm/relay/frontend/caffe2.py | 15 ++-- python/tvm/relay/frontend/common.py | 1 - python/tvm/relay/frontend/coreml.py | 35 ++++---- python/tvm/relay/frontend/keras.py | 80 +++++++++---------- python/tvm/relay/frontend/mxnet.py | 16 ++-- python/tvm/relay/frontend/onnx.py | 3 +- python/tvm/relay/frontend/tensorflow.py | 8 +- python/tvm/relay/frontend/tflite.py | 39 +++++---- python/tvm/relay/op/nn/_nn.py | 12 +-- python/tvm/relay/op/op_attrs.py | 7 +- python/tvm/relay/testing/inception_v3.py | 5 +- python/tvm/relay/ty.py | 1 - python/tvm/rpc/proxy.py | 2 +- python/tvm/rpc/tornado_util.py | 5 +- python/tvm/rpc/tracker.py | 2 +- python/tvm/schedule.py | 3 - python/tvm/stmt.py | 2 +- python/tvm/tensor.py | 5 +- topi/python/topi/arm_cpu/bitserial_conv2d.py | 2 +- topi/python/topi/arm_cpu/conv2d.py | 2 +- topi/python/topi/cuda/conv2d.py | 5 +- topi/python/topi/cuda/conv2d_winograd.py | 4 +- topi/python/topi/cuda/reduction.py | 2 +- topi/python/topi/nn/bitserial_conv2d.py | 4 +- topi/python/topi/nn/conv2d.py | 7 +- topi/python/topi/testing/upsampling_python.py | 5 +- topi/python/topi/x86/bitserial_conv2d.py | 2 +- topi/python/topi/x86/conv2d.py | 7 +- vta/python/vta/environment.py | 5 +- vta/python/vta/graph.py | 2 +- vta/python/vta/intrin.py | 2 +- vta/python/vta/ir_pass.py | 29 +++---- vta/python/vta/top/vta_conv2d.py | 5 +- 70 files changed, 311 insertions(+), 389 deletions(-) diff --git a/nnvm/python/nnvm/_base.py b/nnvm/python/nnvm/_base.py index 
29390a2201bf..dd797ba4489f 100644 --- a/nnvm/python/nnvm/_base.py +++ b/nnvm/python/nnvm/_base.py @@ -31,7 +31,7 @@ class NNVMError(Exception): """Error that will be throwed by all nnvm functions""" - pass + def _load_lib(): """Load libary by searching possible path.""" diff --git a/nnvm/python/nnvm/attribute.py b/nnvm/python/nnvm/attribute.py index a023b9cd88df..4a08bb622ed5 100644 --- a/nnvm/python/nnvm/attribute.py +++ b/nnvm/python/nnvm/attribute.py @@ -42,8 +42,7 @@ def get(self, attr): if attr: ret.update(attr) return ret - else: - return attr + return attr def __enter__(self): # pylint: disable=protected-access diff --git a/nnvm/python/nnvm/compiler/compile_engine.py b/nnvm/python/nnvm/compiler/compile_engine.py index 289f09deb280..e6158fb611fe 100644 --- a/nnvm/python/nnvm/compiler/compile_engine.py +++ b/nnvm/python/nnvm/compiler/compile_engine.py @@ -23,13 +23,11 @@ def graph(self): @tvm.register_node class GraphCacheEntry(tvm.node.NodeBase): """CacheEntry of compilation into a TVM Function""" - pass @tvm.register_node class GraphFunc(tvm.node.NodeBase): """Compiled result of a graph into a TVM Function""" - pass class Engine(object): diff --git a/nnvm/python/nnvm/frontend/caffe2.py b/nnvm/python/nnvm/frontend/caffe2.py index 2450af628a90..8211971a8c3c 100755 --- a/nnvm/python/nnvm/frontend/caffe2.py +++ b/nnvm/python/nnvm/frontend/caffe2.py @@ -73,9 +73,8 @@ def get_converter(cls): if hasattr(cls, '_impl'): return getattr(cls, '_impl') - else: - raise NotImplementedError('{} not implemented'.format( - cls.__name__)) + raise NotImplementedError('{} not implemented'.format( + cls.__name__)) _caffe2_internal_args = { @@ -175,11 +174,10 @@ def _get_axis_from_order_str(order): order = order if isinstance(order, str) else order.decode('UTF-8') if order == 'NCHW': return 1 - elif order == 'NHWC': + if order == 'NHWC': return 3 - else: - raise RuntimeError( - "Unsupported storage order: {} in caffe2".format(order)) + raise RuntimeError( + "Unsupported storage 
order: {} in caffe2".format(order)) return AttrCvt( op_name='concatenate', diff --git a/nnvm/python/nnvm/frontend/coreml.py b/nnvm/python/nnvm/frontend/coreml.py index bc544243bd92..77285efe7a76 100644 --- a/nnvm/python/nnvm/frontend/coreml.py +++ b/nnvm/python/nnvm/frontend/coreml.py @@ -98,33 +98,33 @@ def ActivationParams(op, insym, symtab): par = getattr(op, whichActivation) if whichActivation == 'linear': return _sym.__add_scalar__(_sym.__mul_scalar__(insym, scalar=par.alpha), scalar=par.beta) - elif whichActivation == 'ReLU': + if whichActivation == 'ReLU': return _sym.relu(insym) - elif whichActivation == 'leakyReLU': + if whichActivation == 'leakyReLU': return _sym.leaky_relu(insym, alpha=par.alpha) - elif whichActivation == 'thresholdedReLU': + if whichActivation == 'thresholdedReLU': alpha_tensor = _sym.full_like(insym, fill_value=float(par.alpha)) return _sym.elemwise_mul(insym, _sym.greater(insym, alpha_tensor)) - elif whichActivation == 'PReLU': + if whichActivation == 'PReLU': return _sym.prelu(insym, alpha=par.alpha) - elif whichActivation == 'tanh': + if whichActivation == 'tanh': return _sym.tanh(insym) - elif whichActivation == 'scaledTanh': + if whichActivation == 'scaledTanh': return _sym.__mul_scalar__(_sym.tanh(_sym.__mul_scalar__( insym, scalar=par.beta)), scalar=par.alpha) - elif whichActivation == 'sigmoid': + if whichActivation == 'sigmoid': return _sym.sigmoid(insym) - elif whichActivation == 'sigmoidHard': + if whichActivation == 'sigmoidHard': transformX = (par.alpha * insym) + par.beta return _sym.clip(transformX, a_min=0, a_max=1) - elif whichActivation == 'ELU': + if whichActivation == 'ELU': return _sym.__mul_scalar__(_sym.__add_scalar__( _sym.exp(insym), scalar=-1), scalar=par.alpha) - elif whichActivation == 'softsign': + if whichActivation == 'softsign': return insym / (1 + (_sym.relu(insym) + _sym.relu(_sym.negative(insym)))) - elif whichActivation == 'softplus': + if whichActivation == 'softplus': return 
_sym.log(_sym.__add_scalar__(_sym.exp(insym), scalar=1)) - elif whichActivation == 'parametricSoftplus': + if whichActivation == 'parametricSoftplus': alpha = list(par.alpha.floatValue) beta = list(par.alpha.floatValue) if len(alpha) == 1: @@ -136,8 +136,7 @@ def ActivationParams(op, insym, symtab): betasym = symtab.new_const(beta) return _sym.broadcast_mul(_sym.log(_sym.broadcast_add( _sym.exp(insym), betasym)), alphasym) - else: - raise NotImplementedError('%s not implemented' % whichActivation) + raise NotImplementedError('%s not implemented' % whichActivation) def ScaleLayerParams(op, insym, symtab): """Scale layer params.""" @@ -157,10 +156,9 @@ def PoolingLayerParams(op, insym, symtab): if op.globalPooling: if op.type == 0: return _sym.global_max_pool2d(insym) - elif op.type == 1: + if op.type == 1: return _sym.global_avg_pool2d(insym) - else: - raise NotImplementedError("Only max and average pooling implemented") + raise NotImplementedError("Only max and average pooling implemented") else: params = {'pool_size':list(op.kernelSize), @@ -190,10 +188,9 @@ def PoolingLayerParams(op, insym, symtab): if op.type == 0: return _sym.max_pool2d(insym, **params) - elif op.type == 1: + if op.type == 1: return _sym.avg_pool2d(insym, **params) - else: - raise NotImplementedError("Only max and average pooling implemented") + raise NotImplementedError("Only max and average pooling implemented") def SoftmaxLayerParams(op, insym, symtab): return _sym.softmax(_sym.flatten(insym)) diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py index 18d07d07ac6b..39470bfb02ec 100644 --- a/nnvm/python/nnvm/frontend/darknet.py +++ b/nnvm/python/nnvm/frontend/darknet.py @@ -921,8 +921,6 @@ def _make_outlist(self, sym, op_name, layer, layer_num): if layer_num != self.net.n-1: self._outs.insert(0, sym) - return - def from_darknet(self): """To convert the darknet symbol to nnvm symbols.""" for i in range(self.net.n): diff --git 
a/nnvm/python/nnvm/frontend/keras.py b/nnvm/python/nnvm/frontend/keras.py index 9dabebc14b90..56758ada5f46 100644 --- a/nnvm/python/nnvm/frontend/keras.py +++ b/nnvm/python/nnvm/frontend/keras.py @@ -47,35 +47,34 @@ def _convert_activation(insym, keras_layer, _): beta = keras_layer.beta if hasattr(keras_layer, "beta") else 0 return _sym.__add_scalar__(_sym.__mul_scalar__(insym, \ scalar=alpha), scalar=beta) - elif act_type == 'softmax': + if act_type == 'softmax': return _sym.softmax(insym, axis=1) - elif act_type == 'sigmoid': + if act_type == 'sigmoid': return _sym.sigmoid(insym) - elif act_type == 'tanh': + if act_type == 'tanh': return _sym.tanh(insym) - elif act_type == 'relu': + if act_type == 'relu': return _sym.relu(insym) - elif act_type == 'softplus': + if act_type == 'softplus': return _sym.log(_sym.__add_scalar__(_sym.exp(insym), scalar=1)) - elif act_type == 'elu': + if act_type == 'elu': alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1 return _get_elu(insym, alpha) - elif act_type == 'selu': + if act_type == 'selu': # Alpha, Gamma values, obtained from https://arxiv.org/abs/1706.02515 alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") \ else 1.6732632423543772848170429916717 gamma = keras_layer.gamma if hasattr(keras_layer, "gamma") \ else 1.0507009873554804934193349852946 return gamma * _get_elu(insym, alpha) - elif act_type == 'relu6': + if act_type == 'relu6': return _sym.clip(insym, a_min=0, a_max=6) - elif act_type == 'softsign': + if act_type == 'softsign': return insym / (1 + (_sym.relu(insym) + _sym.relu(_sym.negative(insym)))) - elif act_type == 'hard_sigmoid': + if act_type == 'hard_sigmoid': transformX = (0.2 * insym) + 0.5 return _sym.clip(transformX, a_min=0, a_max=1) - else: - raise TypeError("Unsupported activation type : {}".format(act_type)) + raise TypeError("Unsupported activation type : {}".format(act_type)) def _convert_advanced_activation(insym, keras_layer, symtab): @@ -84,12 +83,12 @@ def 
_convert_advanced_activation(insym, keras_layer, symtab): if keras_layer.max_value: return _sym.clip(insym, a_min=0, a_max=keras_layer.max_value) return _sym.relu(insym) - elif act_type == 'LeakyReLU': + if act_type == 'LeakyReLU': return _sym.leaky_relu(insym, alpha=keras_layer.alpha) - elif act_type == 'ELU': + if act_type == 'ELU': alpha = keras_layer.alpha if hasattr(keras_layer, "alpha") else 1 return _get_elu(insym, alpha) - elif act_type == 'PReLU': + if act_type == 'PReLU': assert hasattr(keras_layer, "alpha"), \ "alpha required for PReLU." _check_data_format(keras_layer) @@ -97,12 +96,11 @@ def _convert_advanced_activation(insym, keras_layer, symtab): return -symtab.new_const(keras_layer.get_weights()[0] \ .transpose(np.roll(range(size), 1))) \ * _sym.relu(-insym) + _sym.relu(insym) - elif act_type == 'ThresholdedReLU': + if act_type == 'ThresholdedReLU': theta = keras_layer.theta if hasattr(keras_layer, "theta") else 1.0 theta_tensor = _sym.full_like(insym[0], fill_value=float(theta)) return _sym.elemwise_mul(insym[0], _sym.greater(insym[0], theta_tensor, out_type="float32")) - else: - raise TypeError("Unsupported advanced activation type : {}".format(act_type)) + raise TypeError("Unsupported advanced activation type : {}".format(act_type)) def _convert_merge(insym, keras_layer, _): @@ -280,31 +278,29 @@ def _convert_pooling(insym, keras_layer, symtab): # global pool in keras = global pool + flatten in nnvm if pool_type == 'GlobalMaxPooling2D': return _convert_flatten(_sym.global_max_pool2d(insym), keras_layer, symtab) - elif pool_type == 'GlobalAveragePooling2D': + if pool_type == 'GlobalAveragePooling2D': return _convert_flatten(_sym.global_avg_pool2d(insym), keras_layer, symtab) + pool_h, pool_w = keras_layer.pool_size + stride_h, stride_w = keras_layer.strides + params = {'pool_size': [pool_h, pool_w], + 'strides': [stride_h, stride_w], + 'padding': [0, 0]} + if keras_layer.padding == 'valid': + pass + elif keras_layer.padding == 'same': + in_h = 
keras_layer.input_shape[1] + in_w = keras_layer.input_shape[2] + pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h) + pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) + params['padding'] = [pad_t, pad_l, pad_b, pad_r] else: - pool_h, pool_w = keras_layer.pool_size - stride_h, stride_w = keras_layer.strides - params = {'pool_size': [pool_h, pool_w], - 'strides': [stride_h, stride_w], - 'padding': [0, 0]} - if keras_layer.padding == 'valid': - pass - elif keras_layer.padding == 'same': - in_h = keras_layer.input_shape[1] - in_w = keras_layer.input_shape[2] - pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h) - pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) - params['padding'] = [pad_t, pad_l, pad_b, pad_r] - else: - raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) - if pool_type == 'MaxPooling2D': - return _sym.max_pool2d(insym, **params) - elif pool_type == 'AveragePooling2D': - # TODO: in keras, padded zeros are not calculated - return _sym.avg_pool2d(insym, **params) - else: - raise TypeError("Unsupported pooling type : {}".format(keras_layer)) + raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) + if pool_type == 'MaxPooling2D': + return _sym.max_pool2d(insym, **params) + if pool_type == 'AveragePooling2D': + # TODO: in keras, padded zeros are not calculated + return _sym.avg_pool2d(insym, **params) + raise TypeError("Unsupported pooling type : {}".format(keras_layer)) def _convert_upsample(insym, keras_layer, _): diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 2cf701ea9040..8c92cb99f37c 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -438,7 +438,7 @@ def _topo_sort(symbol): if childs is None: dep_cnts[name] = 0 else: - dep_cnts[name] = len(set([c.attr('name') for c in childs])) + dep_cnts[name] = len({c.attr('name') for c in childs}) for child in childs: child_name = child.attr('name') if child_name not in 
deps: diff --git a/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py b/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py index 4dfc366d0b6f..ff74016cde06 100644 --- a/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py +++ b/nnvm/python/nnvm/frontend/onnx_caffe2_utils.py @@ -9,8 +9,7 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - else: - raise NotImplementedError("Only 2d kernel supported.") + raise NotImplementedError("Only 2d kernel supported.") return _impl diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index 3099911b86d0..980e60414595 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -68,8 +68,7 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - else: - raise NotImplementedError("Only 2d kernel supported.") + raise NotImplementedError("Only 2d kernel supported.") return _impl def _dimension_constraint(): @@ -433,8 +432,7 @@ def _impl(inputs, attr, params): op_name="reshape", extras={'shape':tuple(params_new[0].asnumpy().flatten())}, ignores=['Tshape'])(inputs, attr) - else: - raise RuntimeError("Reshape with dynamic shape input not supported yet.") + raise RuntimeError("Reshape with dynamic shape input not supported yet.") return _impl def _bias_add(): @@ -1411,7 +1409,7 @@ def _parse_param(self, key, value, name): self._nodes[name] = _sym.Variable(name=name, shape=self._params[name].shape) else: - if key != 'dtype' and key != '_output_shapes' and key != '_class': + if key not in ('dtype', '_output_shapes', '_class'): raise NotImplementedError \ ("Other attributes for a Const(param) Node {} ? 
.".format(key)) diff --git a/nnvm/python/nnvm/frontend/util/tensorflow_parser.py b/nnvm/python/nnvm/frontend/util/tensorflow_parser.py index 9b745c9d02c9..ce51f7c2315b 100644 --- a/nnvm/python/nnvm/frontend/util/tensorflow_parser.py +++ b/nnvm/python/nnvm/frontend/util/tensorflow_parser.py @@ -115,6 +115,8 @@ def _load_ckpt(self): """TODO: Load checkpoint model.""" raise RuntimeError("InputConfiguration: Loading tf checkpoint model is " "not supported yet.") + # pylint: disable=unreachable + return 0 def parse(self): """Parse tensorflow models: checkpoints, saved models, and single pb diff --git a/nnvm/python/nnvm/symbol.py b/nnvm/python/nnvm/symbol.py index 0acacb247a2c..ec8853c3d118 100644 --- a/nnvm/python/nnvm/symbol.py +++ b/nnvm/python/nnvm/symbol.py @@ -50,10 +50,9 @@ def __add__(self, other): """x.__add__(y) <=> x+y""" if isinstance(other, Symbol): return __add_symbol__(self, other) - elif isinstance(other, _Number): + if isinstance(other, _Number): return __add_scalar__(self, scalar=other) - else: - raise TypeError("type %s not supported" % str(type(other))) + raise TypeError("type %s not supported" % str(type(other))) def __radd__(self, other): return self.__add__(other) @@ -64,14 +63,12 @@ def __sub__(self, other): return __sub_symbol__(self, other) if isinstance(other, _Number): return __sub_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __rsub__(self, other): if isinstance(other, _Number): return __rsub_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __mul__(self, other): """x.__mul__(y) <=> x*y""" @@ -79,8 +76,7 @@ def __mul__(self, other): return __mul_symbol__(self, other) if isinstance(other, _Number): return __mul_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + 
raise TypeError('type %s not supported' % str(type(other))) def __rmul__(self, other): return self.__mul__(other) @@ -91,28 +87,24 @@ def __div__(self, other): return __div_symbol__(self, other) if isinstance(other, _Number): return __div_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __rdiv__(self, other): if isinstance(other, _Number): return __rdiv_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __lshift__(self, other): """x.__lshift__(y) <=> x << y""" if isinstance(other, _Number): return __lshift_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __rshift__(self, other): """x.__rshift__(y) <=> x >> y""" if isinstance(other, _Number): return __rshift_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __truediv__(self, other): return self.__div__(other) @@ -126,14 +118,12 @@ def __pow__(self, other): return __pow_symbol__(self, other) if isinstance(other, _Number): return __pow_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __rpow__(self, other): if isinstance(other, _Number): return __rpow_scalar__(self, scalar=other) - else: - raise TypeError('type %s not supported' % str(type(other))) + raise TypeError('type %s not supported' % str(type(other))) def __neg__(self): """x.__neg__() <=> -x""" @@ -238,12 +228,11 @@ def _get_list_copt(self, option): """internal function to get list option""" if option == 'all': return _ctypes.c_int(0) - elif option == 'read_only': + if option == 'read_only': 
return _ctypes.c_int(1) - elif option == 'aux_state': + if option == 'aux_state': return _ctypes.c_int(2) - else: - raise ValueError("option need to be in {'all', 'read_only, 'aux_state'}") + raise ValueError("option need to be in {'all', 'read_only, 'aux_state'}") def list_input_variables(self, option='all'): """List all the input variables in the symbol. diff --git a/nnvm/python/nnvm/testing/inception_v3.py b/nnvm/python/nnvm/testing/inception_v3.py index f14daa1ae656..3faded3b2ece 100644 --- a/nnvm/python/nnvm/testing/inception_v3.py +++ b/nnvm/python/nnvm/testing/inception_v3.py @@ -23,11 +23,10 @@ def Conv(data, num_filter, kernel=(1, 1), stride=(1, 1), pad=(0, 0), name=None, def Pooling(data, kernel, stride, pad, pool_type, name): if pool_type == 'max': return sym.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name) - elif pool_type == 'avg': + if pool_type == 'avg': return sym.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, name=name, count_include_pad=True) - else: - raise ValueError("Invalid pooling type: " + pool_type) + raise ValueError("Invalid pooling type: " + pool_type) def Inception7A(data, num_1x1, diff --git a/nnvm/python/nnvm/testing/yolo_detection.py b/nnvm/python/nnvm/testing/yolo_detection.py index 7c600d38db62..3d9f2cacd482 100644 --- a/nnvm/python/nnvm/testing/yolo_detection.py +++ b/nnvm/python/nnvm/testing/yolo_detection.py @@ -88,7 +88,6 @@ def _get_yolo_detections(l, im_shape, net_shape, thresh, relative, dets): before_correct_dets.append(detection) dets.extend(_correct_boxes(before_correct_dets, im_shape[0], im_shape[1], net_shape[0], net_shape[1], relative)) - return def _get_region_detections(l, im_shape, net_shape, thresh, relative, dets): data = l['output'] @@ -114,7 +113,6 @@ def _get_region_detections(l, im_shape, net_shape, thresh, relative, dets): _correct_boxes(before_correct_dets, im_shape[0], im_shape[1], net_shape[0], net_shape[1], relative) dets.extend(before_correct_dets) - 
return def fill_network_boxes(net_shape, im_shape, thresh, relative, tvm_out): diff --git a/nnvm/python/nnvm/top/attr_dict.py b/nnvm/python/nnvm/top/attr_dict.py index 834fffdd01c2..58561e7d5111 100644 --- a/nnvm/python/nnvm/top/attr_dict.py +++ b/nnvm/python/nnvm/top/attr_dict.py @@ -129,14 +129,13 @@ def get_bool(self, key): lowercase = self[key].lower() if lowercase == "1": return True - elif lowercase == "0": + if lowercase == "0": return False - elif lowercase == "true": + if lowercase == "true": return True - elif lowercase == "false": + if lowercase == "false": return False - else: - raise ValueError("Wrong bool format for key %s" % key) + raise ValueError("Wrong bool format for key %s" % key) def get_str(self, key): """Get string from attr dict diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py index 2579f22e44af..98229c092792 100644 --- a/python/tvm/_ffi/base.py +++ b/python/tvm/_ffi/base.py @@ -32,7 +32,6 @@ class TVMError(Exception): """Error thrown by TVM function""" - pass def _load_lib(): diff --git a/python/tvm/_ffi/function.py b/python/tvm/_ffi/function.py index ca1812d4109a..33013a4df5ef 100644 --- a/python/tvm/_ffi/function.py +++ b/python/tvm/_ffi/function.py @@ -51,7 +51,6 @@ class Function(_FunctionBase): tvm.register_func: How to register global function. tvm.get_global_func: How to get global function. 
""" - pass class ModuleBase(object): @@ -207,11 +206,11 @@ def get_global_func(name, allow_missing=False): check_call(_LIB.TVMFuncGetGlobal(c_str(name), ctypes.byref(handle))) if handle.value: return Function(handle, False) - else: - if allow_missing: - return None - else: - raise ValueError("Cannot find global function %s" % name) + + if allow_missing: + return None + + raise ValueError("Cannot find global function %s" % name) diff --git a/python/tvm/_ffi/node_generic.py b/python/tvm/_ffi/node_generic.py index e86453499faa..9f9c5383e3ba 100644 --- a/python/tvm/_ffi/node_generic.py +++ b/python/tvm/_ffi/node_generic.py @@ -36,16 +36,16 @@ def convert_to_node(value): """ if isinstance(value, _CLASS_NODE_BASE): return value - elif isinstance(value, bool): + if isinstance(value, bool): return const(value, 'uint1x1') - elif isinstance(value, Number): + if isinstance(value, Number): return const(value) - elif isinstance(value, string_types): + if isinstance(value, string_types): return _api_internal._str(value) - elif isinstance(value, (list, tuple)): + if isinstance(value, (list, tuple)): value = [convert_to_node(x) for x in value] return _api_internal._Array(*value) - elif isinstance(value, dict): + if isinstance(value, dict): vlist = [] for item in value.items(): if (not isinstance(item[0], _CLASS_NODE_BASE) and @@ -54,12 +54,12 @@ def convert_to_node(value): vlist.append(item[0]) vlist.append(convert_to_node(item[1])) return _api_internal._Map(*vlist) - elif isinstance(value, NodeGeneric): + if isinstance(value, NodeGeneric): return value.asnode() - elif value is None: + if value is None: return None - else: - raise ValueError("don't know how to convert type %s to node" % type(value)) + + raise ValueError("don't know how to convert type %s to node" % type(value)) def const(value, dtype=None): diff --git a/python/tvm/arith.py b/python/tvm/arith.py index 980c87d90316..778d761c659e 100644 --- a/python/tvm/arith.py +++ b/python/tvm/arith.py @@ -31,11 +31,11 @@ def 
max(self): @register_node class StrideSet(IntSet): """Represent set of strided integers""" - pass + @register_node class ModularSet(IntSet): """Represent range of (coeff * x + base) for x in Z """ - pass + _init_api("tvm.arith") diff --git a/python/tvm/autotvm/measure/executor.py b/python/tvm/autotvm/measure/executor.py index f3ba4236ce63..ae48b9ba4c37 100644 --- a/python/tvm/autotvm/measure/executor.py +++ b/python/tvm/autotvm/measure/executor.py @@ -69,15 +69,14 @@ def get(self, timeout=None): class FutureError(RuntimeError): """Base error class of all future events""" - pass + # pylint:disable=redefined-builtin class TimeoutError(FutureError): """Error raised when a task is timeout.""" - pass + class ExecutionError(FutureError): """ Error raised when future execution crashes or failed. """ - pass diff --git a/python/tvm/autotvm/record.py b/python/tvm/autotvm/record.py index 5adfae465ce3..c09fc82fb72c 100644 --- a/python/tvm/autotvm/record.py +++ b/python/tvm/autotvm/record.py @@ -83,7 +83,7 @@ def encode(inp, result, protocol='json'): "v": AUTOTVM_LOG_VERSION } return json.dumps(json_dict) - elif protocol == 'pickle': + if protocol == 'pickle': row = (str(inp.target), str(base64.b64encode(pickle.dumps([inp.task.name, inp.task.args, @@ -92,8 +92,8 @@ def encode(inp, result, protocol='json'): str(base64.b64encode(pickle.dumps(inp.config)).decode()), str(base64.b64encode(pickle.dumps(tuple(result))).decode())) return '\t'.join(row) - else: - raise RuntimeError("Invalid log protocol: " + protocol) + + raise RuntimeError("Invalid log protocol: " + protocol) def decode(row, protocol='json'): @@ -136,7 +136,7 @@ def clean_json_to_python(x): result = MeasureResult(*[tuple(x) if isinstance(x, list) else x for x in row["r"]]) return inp, result - elif protocol == 'pickle': + if protocol == 'pickle': items = row.split("\t") tgt = _target.create(items[0]) task_tuple = pickle.loads(base64.b64decode(items[1].encode())) @@ -146,8 +146,8 @@ def clean_json_to_python(x): tsk = 
task.Task(task_tuple[0], task_tuple[1]) tsk.workload = task_tuple[3] return MeasureInput(tgt, tsk, config), MeasureResult(*result) - else: - raise RuntimeError("Invalid log protocol: " + protocol) + + raise RuntimeError("Invalid log protocol: " + protocol) def load_from_file(filename): diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index 3fb02c6190cf..09f2dd0576a5 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -32,7 +32,6 @@ class InstantiationError(ValueError): raised by cfg.raise_error e.g. too many unrolling, too many threads in a block """ - pass class TransformSpace(object): @@ -321,17 +320,17 @@ def _merge_dfs(self, chains, size, tmp_pt, tmp_stack, merged): if np.sum(tmp_pt) == size: merged.append(list(tmp_stack)) return - else: - for i in range(len(chains)): - # use i == np.argmax(....) here to take spatial order into consideration - # if we don't want to consider spatial order, we can use tmp_pt[i] == np.max(....) - if (tmp_pt[i] < len(chains[i]) and - (i == np.argmax([len(chains[x]) - tmp_pt[x] for x in range(len(chains))]))): - tmp_stack.append(chains[i][tmp_pt[i]]) - tmp_pt[i] += 1 - self._merge_dfs(chains, size, tmp_pt, tmp_stack, merged) - tmp_pt[i] -= 1 - tmp_stack.pop() + + for i in range(len(chains)): + # use i == np.argmax(....) here to take spatial order into consideration + # if we don't want to consider spatial order, we can use tmp_pt[i] == np.max(....) 
+ if (tmp_pt[i] < len(chains[i]) and + (i == np.argmax([len(chains[x]) - tmp_pt[x] for x in range(len(chains))]))): + tmp_stack.append(chains[i][tmp_pt[i]]) + tmp_pt[i] += 1 + self._merge_dfs(chains, size, tmp_pt, tmp_stack, merged) + tmp_pt[i] -= 1 + tmp_stack.pop() class ReorderEntity(object): @@ -441,7 +440,7 @@ def _generate_space(self, now, tmp_stack): if now == self.num_axis: # only vectorize inner most dimension vec_ct = tmp_stack.count('vec') - if vec_ct == 0 or vec_ct == 1: + if vec_ct in (0, 1): self.entities.append(AnnotateEntity(list(tmp_stack))) else: for ann in self.anns[now]: diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 22a15143b96e..7c587fe39783 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -294,7 +294,7 @@ def get_config(): class FlopCalculationError(RuntimeError): """Error happens when estimating FLOP for a compute op""" - pass + def compute_flop(sch): """Calculate number of FLOP (floating number operations) of the compute ops in a schedule @@ -328,29 +328,29 @@ def _count_flop(exp): if len(source) != 1: raise FlopCalculationError("Found multiple output in the source of reduce op") return num_iter * (_count_flop(combiner[0]) + _count_flop(source[0])) - elif isinstance(exp, (expr.FloatImm, expr.IntImm, expr.UIntImm)): + if isinstance(exp, (expr.FloatImm, expr.IntImm, expr.UIntImm)): return 0 - elif isinstance(exp, expr.Cast): + if isinstance(exp, expr.Cast): return _count_flop(exp.value) - elif isinstance(exp, expr.Var): + if isinstance(exp, expr.Var): return 0 - elif isinstance(exp, (expr.Add, expr.Sub, expr.Mul, expr.Div, expr.Mod, - expr.Max, expr.Min, - expr.EQ, expr.NE, expr.LT, expr.LE, expr.GT, expr.GE, - expr.And, expr.Or, expr.Not)): + if isinstance(exp, (expr.Add, expr.Sub, expr.Mul, expr.Div, expr.Mod, + expr.Max, expr.Min, + expr.EQ, expr.NE, expr.LT, expr.LE, expr.GT, expr.GE, + expr.And, expr.Or, expr.Not)): base = 1 if "float" in exp.a.dtype else 0 if 
isinstance(exp, expr.Not): # unary return base + _count_flop(exp.a) return base + _count_flop(exp.a) + _count_flop(exp.b) - elif isinstance(exp, expr.Select): + if isinstance(exp, expr.Select): return _count_flop(exp.condition) + max(_count_flop(exp.true_value), _count_flop(exp.false_value)) - elif isinstance(exp, expr.Call): + if isinstance(exp, expr.Call): return sum([_count_flop(x) for x in exp.args]) - else: - raise FlopCalculationError("Found unsupported operator in the compute expr") + + raise FlopCalculationError("Found unsupported operator in the compute expr") def traverse(ops): """accumulate flops""" diff --git a/python/tvm/autotvm/tuner/tuner.py b/python/tvm/autotvm/tuner/tuner.py index abd7ec4fad0b..120c97c2c003 100644 --- a/python/tvm/autotvm/tuner/tuner.py +++ b/python/tvm/autotvm/tuner/tuner.py @@ -69,7 +69,7 @@ def update(self, inputs, results): results: Array of autotvm.measure.MeasureResult result for measurement """ - pass + def tune(self, n_trial, measure_option, early_stopping=None, callbacks=()): """Begin tuning diff --git a/python/tvm/container.py b/python/tvm/container.py index ba30255f650a..e384a742c36f 100644 --- a/python/tvm/container.py +++ b/python/tvm/container.py @@ -90,7 +90,7 @@ class Range(NodeBase): You do not need to create Range explicitly. Python list and tuple will be converted automatically to Range in api functions. 
""" - pass + @register_node class LoweredFunc(NodeBase): diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py index 21cc4844087c..99cea18d1487 100644 --- a/python/tvm/contrib/nvcc.py +++ b/python/tvm/contrib/nvcc.py @@ -151,14 +151,14 @@ def find_libdevice_path(arch): selected_ver = 0 selected_path = None cuda_ver = get_cuda_version(cuda_path) - if cuda_ver == 9.0 or cuda_ver == 9.1: + if cuda_ver in (9.0, 9.1): path = os.path.join(lib_path, "libdevice.10.bc") else: for fn in os.listdir(lib_path): if not fn.startswith("libdevice"): continue ver = int(fn.split(".")[-3].split("_")[-1]) - if ver > selected_ver and ver <= arch: + if selected_ver < ver <= arch: selected_ver = ver selected_path = fn if selected_path is None: diff --git a/python/tvm/contrib/verilog.py b/python/tvm/contrib/verilog.py index 358366684fa4..f904a0cb01bf 100644 --- a/python/tvm/contrib/verilog.py +++ b/python/tvm/contrib/verilog.py @@ -118,8 +118,7 @@ def _find_vpi_path(): vpi_found = [p for p in vpi_path if os.path.exists(p) and os.path.isfile(p)] if vpi_found: return os.path.dirname(vpi_found[0]) - else: - raise ValueError("Cannot find tvm_vpi.vpi, make sure you did `make verilog`") + raise ValueError("Cannot find tvm_vpi.vpi, make sure you did `make verilog`") def search_path(): """Get the search directory.""" diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py index b9d64866b305..147d164b61e1 100644 --- a/python/tvm/hybrid/parser.py +++ b/python/tvm/hybrid/parser.py @@ -189,9 +189,9 @@ def visit_Name(self, node): _internal_assert(name in self.symbols, "Unknown symbol %s!" 
% name) if ty in [Symbol.LoopVar, Symbol.Input, Symbol.ConstLoopVar]: return entry - elif ty is Symbol.ConstVar: + if ty is Symbol.ConstVar: return entry if isinstance(node.ctx, ast.Load) else None - elif ty is Symbol.BufferVar: + if ty is Symbol.BufferVar: if isinstance(node.ctx, ast.Load): return _make.Call(entry.dtype, entry.name, [_api.const(0, 'int32')], \ _expr.Call.Halide, entry.op, entry.value_index) @@ -274,12 +274,12 @@ def visit_Assign(self, node): buf, args = lhs return _make.Provide(buf.op, 0, rhs, args) return util.make_nop() - else: - lhs, args = self.visit(lhs) - _internal_assert(isinstance(lhs, Tensor), \ - "An array access's LHS is expected to be a expr.Call!") - res = _make.Provide(lhs.op, lhs.value_index, rhs, args) - return res + + lhs, args = self.visit(lhs) + _internal_assert(isinstance(lhs, Tensor), \ + "An array access's LHS is expected to be a expr.Call!") + res = _make.Provide(lhs.op, lhs.value_index, rhs, args) + return res def visit_Index(self, node): @@ -347,7 +347,7 @@ def visit_If(self, node): if isinstance(cond, _expr.UIntImm): if cond.value: return visit_list_to_block(self.visit, node.body) - elif node.orelse: + if node.orelse: return visit_list_to_block(self.visit, node.orelse) return util.make_nop() @@ -451,7 +451,7 @@ def visit_For(self, node): bodies.append(body) return concat_list_to_block(bodies) - elif iter_var is None: + if iter_var is None: _internal_assert(for_type is not None, "The loop bind function parse error!") offset = iter_var = _api.var(_name) if not _ir_pass.Equal(low, _api.const(0, 'int32')): diff --git a/python/tvm/hybrid/util.py b/python/tvm/hybrid/util.py index 56190a82765e..dcccaa465883 100644 --- a/python/tvm/hybrid/util.py +++ b/python/tvm/hybrid/util.py @@ -60,7 +60,7 @@ def replace(op): if isinstance(op, _stmt.Provide) and op.func in rmap.keys(): buf = rmap[op.func] return _make.Provide(buf.op, op.value_index, op.value, op.args) - elif isinstance(op, _expr.Call) and op.func in rmap.keys(): + if 
isinstance(op, _expr.Call) and op.func in rmap.keys(): buf = rmap[op.func] return _make.Call(buf.dtype, buf.name, op.args, \ _expr.Call.Halide, buf.op, buf.value_index) diff --git a/python/tvm/intrin.py b/python/tvm/intrin.py index bb15c314ff23..a0dabfc8a6e2 100644 --- a/python/tvm/intrin.py +++ b/python/tvm/intrin.py @@ -495,7 +495,7 @@ def _rule_float_suffix(op): """ if op.dtype == "float32": return call_pure_extern(op.dtype, "%sf" % op.name, *op.args) - elif op.dtype == "float64": + if op.dtype == "float64": return call_pure_extern(op.dtype, op.name, *op.args) return op diff --git a/python/tvm/make.py b/python/tvm/make.py index 6238fd7f1789..780bdc246508 100644 --- a/python/tvm/make.py +++ b/python/tvm/make.py @@ -56,7 +56,7 @@ def static_cast(dtype, expr): if target_type.type_code == src_type.type_code and src_type.bits == target_type.bits: if src_type.lanes == target_type.lanes: return expr - elif src_type.lanes == 1 and target_type.lanes > 1: + if src_type.lanes == 1 and target_type.lanes > 1: return Broadcast(expr, target_type.lanes) return Cast(dtype, expr) diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py index e2750369b6af..567aff6fba9c 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -24,7 +24,6 @@ class NDArray(NDArrayBase): Instead, this is a minimal data structure to demonstrate how can we use TVM in existing project which might have their own array containers. """ - pass def cpu(dev_id=0): diff --git a/python/tvm/relay/_parser.py b/python/tvm/relay/_parser.py index c0455a3361e9..9fdffab4e62e 100644 --- a/python/tvm/relay/_parser.py +++ b/python/tvm/relay/_parser.py @@ -43,8 +43,8 @@ def __init__(self, message): from antlr4.tree.Tree import TerminalNode except ImportError: raise ParseError("Couldn't find ANTLR runtime." + - "Try running `pip{} install antlr4-python{}-runtime`." - .format(PYTHON_VERSION, PYTHON_VERSION)) + "Try running `pip{version} install antlr4-python{version}-runtime`." 
+ .format(version=PYTHON_VERSION)) BINARY_OPS = { RelayParser.MUL: op.multiply, @@ -179,33 +179,31 @@ def visitTerminal(self, node): # variables if node_type == RelayLexer.GLOBAL_VAR: return lookup(deque([self.global_var_scope]), node_text[1:]) - elif node_type == RelayLexer.LOCAL_VAR: + if node_type == RelayLexer.LOCAL_VAR: # Remove the leading '%' and lookup the name. var = lookup(self.var_scopes, name) if var is None: raise ParseError("Couldn't resolve `{}`.".format(name)) return var - elif node_type == RelayLexer.GRAPH_VAR: + if node_type == RelayLexer.GRAPH_VAR: try: return self.graph_expr[int(name)] except IndexError: raise ParseError("Couldn't resolve `{}`".format(name)) # data types - elif node_type == RelayLexer.NAT: + if node_type == RelayLexer.NAT: return int(node_text) - elif node_type == RelayLexer.FLOAT: + if node_type == RelayLexer.FLOAT: return float(node_text) - elif node_type == RelayLexer.BOOL_LIT: + if node_type == RelayLexer.BOOL_LIT: if node_text == "True": return True - elif node_text == "False": + if node_text == "False": return False - else: - raise ParseError("Unrecognized BOOL_LIT: `{}`".format(node_text)) + raise ParseError("Unrecognized BOOL_LIT: `{}`".format(node_text)) - else: - raise ParseError("todo: {}".format(node_text)) + raise ParseError("todo: {}".format(node_text)) def visit_list(self, ctx_list): # type: (List[ParserRuleContext]) -> List[Any] diff --git a/python/tvm/relay/adt.py b/python/tvm/relay/adt.py index bc516a8f3ddb..abf78d565d62 100644 --- a/python/tvm/relay/adt.py +++ b/python/tvm/relay/adt.py @@ -8,7 +8,7 @@ class Pattern(RelayNode): """Base type for pattern matching constructs.""" - pass + @register_relay_node class PatternWildcard(Pattern): diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index 1f7ab18677c4..c101ed43469e 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -10,7 +10,6 @@ class 
CachedFunc(NodeBase): """Low-level tensor function to back a relay primitive function. """ - pass @register_relay_node @@ -34,7 +33,6 @@ def __init__(self, source_func, target): class CCacheValue(NodeBase): """Value in the CompileEngine, including usage statistics. """ - pass def _get_cache_key(source_func, target): diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index 1d50a571a460..46bb82d1a725 100644 --- a/python/tvm/relay/backend/interpreter.py +++ b/python/tvm/relay/backend/interpreter.py @@ -49,7 +49,6 @@ def __iter__(self): @register_relay_node class Closure(Value): """A closure produced by the interpreter.""" - pass @register_relay_node diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 9641e0fd6fef..9ca986907567 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -444,7 +444,6 @@ def create_executor(kind="debug", target = _target.create(target) if kind == "debug": return _interpreter.Interpreter(mod, ctx, target) - elif kind == "graph": + if kind == "graph": return GraphExecutor(mod, ctx, target) - else: - raise RuntimeError("unknown mode {0}".format(mode)) + raise RuntimeError("unknown mode {0}".format(mode)) diff --git a/python/tvm/relay/frontend/caffe2.py b/python/tvm/relay/frontend/caffe2.py index 69d3c3642cfe..5533eec2134b 100755 --- a/python/tvm/relay/frontend/caffe2.py +++ b/python/tvm/relay/frontend/caffe2.py @@ -15,8 +15,7 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - else: - raise NotImplementedError("Only 2d kernel supported.") + raise NotImplementedError("Only 2d kernel supported.") return _impl @@ -104,9 +103,8 @@ def get_converter(cls): if hasattr(cls, '_impl'): return getattr(cls, '_impl') - else: - raise NotImplementedError('{} not implemented'.format( - cls.__name__)) + raise NotImplementedError('{} not implemented'.format( + cls.__name__)) _caffe2_internal_args = [ 
@@ -234,11 +232,10 @@ def _get_axis_from_order_str(order): order = order if isinstance(order, str) else order.decode('UTF-8') if order == 'NCHW': return 1 - elif order == 'NHWC': + if order == 'NHWC': return 3 - else: - raise RuntimeError( - "Unsupported storage order: {} in caffe2".format(order)) + raise RuntimeError( + "Unsupported storage order: {} in caffe2".format(order)) return AttrCvt( op_name='concatenate', diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 4011e29c761f..be23f2b50273 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -10,7 +10,6 @@ class RequiredAttr(object): """Dummpy class to represent required attr""" - pass class StrAttrsDict(object): diff --git a/python/tvm/relay/frontend/coreml.py b/python/tvm/relay/frontend/coreml.py index ba2c6dead71e..a4f9b39b70e2 100644 --- a/python/tvm/relay/frontend/coreml.py +++ b/python/tvm/relay/frontend/coreml.py @@ -100,37 +100,37 @@ def _ActivationParams(op, inexpr, etab): alpha = _expr.const(par.alpha, dtype='float32') beta = _expr.const(par.beta, dtype='float32') return _op.add(_op.multiply(inexpr, alpha), beta) - elif whichActivation == 'ReLU': + if whichActivation == 'ReLU': return _op.nn.relu(inexpr) - elif whichActivation == 'leakyReLU': + if whichActivation == 'leakyReLU': _op.nn.leaky_relu(inexpr, alpha=_expr.const(par.alpha, dtype='float32')) elif whichActivation == 'thresholdedReLU': alpha_tensor = _op.full_like(inexpr, fill_value=_expr.const(par.alpha, dtype='float32')) return _op.multiply(inexpr, _op.greater(inexpr, alpha_tensor).as_type('float32')) - elif whichActivation == 'PReLU': + if whichActivation == 'PReLU': return _op.nn.prelu(inexpr, alpha=_expr.const(par.alpha, dtype='float32')) - elif whichActivation == 'tanh': + if whichActivation == 'tanh': return _op.tanh(inexpr) - elif whichActivation == 'scaledTanh': + if whichActivation == 'scaledTanh': alpha = _expr.const(par.alpha, dtype='float32') beta = 
_expr.const(par.beta, dtype='float32') return _op.multiply(_op.tanh(_op.multiply(inexpr, beta)), alpha) - elif whichActivation == 'sigmoid': + if whichActivation == 'sigmoid': return _op.sigmoid(inexpr) - elif whichActivation == 'sigmoidHard': + if whichActivation == 'sigmoidHard': alpha = _expr.const(par.alpha, dtype='float32') beta = _expr.const(par.beta, dtype='float32') transformX = (alpha * inexpr) + beta return _op.clip(transformX, a_min=0., a_max=1.) - elif whichActivation == 'ELU': + if whichActivation == 'ELU': return _op.multiply(_op.add(_op.exp(inexpr), _expr.const(-1, dtype='float32')), _expr.const(par.alpha, dtype='float32')) - elif whichActivation == 'softsign': + if whichActivation == 'softsign': return inexpr / (_expr.const(1, dtype='float32') + ( op.nn.relu(inexpr) + _op.nn.relu(_op.negative(inexpr)))) - elif whichActivation == 'softplus': + if whichActivation == 'softplus': return _op.log(_op.add(_op.exp(inexpr), _expr.const(1, dtype='float32'))) - elif whichActivation == 'parametricSoftplus': + if whichActivation == 'parametricSoftplus': alpha = list(par.alpha.floatValue) beta = list(par.alpha.floatValue) if len(alpha) == 1: @@ -142,8 +142,7 @@ def _ActivationParams(op, inexpr, etab): alpha_expr = etab.new_const(alpha) beta_expr = etab.new_const(beta) return _op.multiply(_op.log(_op.add(_op.exp(inexpr), beta_expr)), alpha_expr) - else: - raise NotImplementedError('%s not implemented' % whichActivation) + raise NotImplementedError('%s not implemented' % whichActivation) def _ScaleLayerParams(op, inexpr, etab): @@ -163,10 +162,9 @@ def _PoolingLayerParams(op, inexpr, etab): if op.globalPooling: if op.type == 0: return _op.nn.global_max_pool2d(inexpr) - elif op.type == 1: + if op.type == 1: return _op.nn.global_avg_pool2d(inexpr) - else: - raise NotImplementedError("Only max and average pooling implemented") + raise NotImplementedError("Only max and average pooling implemented") else: params = {'pool_size':list(op.kernelSize), @@ -196,10 +194,9 @@ 
def _PoolingLayerParams(op, inexpr, etab): if op.type == 0: return _op.nn.max_pool2d(inexpr, **params) - elif op.type == 1: + if op.type == 1: return _op.nn.avg_pool2d(inexpr, **params) - else: - raise NotImplementedError("Only max and average pooling implemented") + raise NotImplementedError("Only max and average pooling implemented") def _SoftmaxLayerParams(op, inexpr, etab): diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index 8a2d3d58d01c..2be03c80c20b 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -60,21 +60,21 @@ def _convert_activation(inexpr, keras_layer, _): alpha = _expr.const(alpha, dtype='float32') beta = _expr.const(beta, dtype='float32') return _op.add(_op.multiply(inexpr, alpha), beta) - elif act_type == 'softmax': + if act_type == 'softmax': return _op.nn.softmax(inexpr, axis=1) - elif act_type == 'sigmoid': + if act_type == 'sigmoid': return _op.sigmoid(inexpr) - elif act_type == 'tanh': + if act_type == 'tanh': return _op.tanh(inexpr) - elif act_type == 'relu': + if act_type == 'relu': return _op.nn.relu(inexpr) - elif act_type == 'softplus': + if act_type == 'softplus': return _op.log(_op.add(_op.exp(inexpr), _expr.const(1., dtype='float32'))) - elif act_type == 'elu': + if act_type == 'elu': alpha = keras_layer.alpha if hasattr(keras_layer, 'alpha') else 1. alpha = _expr.const(alpha, dtype='float32') return _get_elu(inexpr, alpha) - elif act_type == 'selu': + if act_type == 'selu': # Alpha, Gamma values obtained from https://arxiv.org/abs/1706.02515 alpha = keras_layer.alpha if hasattr(keras_layer, 'alpha') \ else 1.6732632423543772848170429916717 @@ -83,15 +83,15 @@ def _convert_activation(inexpr, keras_layer, _): alpha = _expr.const(alpha, dtype='float32') gamma = _expr.const(gamma, dtype='float32') return gamma * _get_elu(inexpr, alpha) - elif act_type == 'relu6': + if act_type == 'relu6': return _op.clip(inexpr, a_min=0., a_max=6.) 
- elif act_type == 'softsign': + if act_type == 'softsign': return inexpr / (_expr.const(1., dtype='float32') + _op.abs(inexpr)) - elif act_type == 'hard_sigmoid': + if act_type == 'hard_sigmoid': x = (_expr.const(0.2, dtype='float32') * inexpr) + _expr.const(0.5, dtype='float32') return _op.clip(x, a_min=0., a_max=1.) - else: - raise TypeError("Unsupported activation type : {}".format(act_type)) + + raise TypeError("Unsupported activation type : {}".format(act_type)) def _convert_advanced_activation(inexpr, keras_layer, etab): @@ -100,25 +100,25 @@ def _convert_advanced_activation(inexpr, keras_layer, etab): if keras_layer.max_value: return _op.clip(inexpr, a_min=0., a_max=float(keras_layer.max_value)) return _op.nn.relu(inexpr) - elif act_type == 'LeakyReLU': + if act_type == 'LeakyReLU': return _op.nn.leaky_relu(inexpr, alpha=float(keras_layer.alpha)) - elif act_type == 'ELU': + if act_type == 'ELU': alpha = keras_layer.alpha if hasattr(keras_layer, 'alpha') else 1. alpha = _expr.const(alpha, dtype='float32') return _get_elu(inexpr, alpha) - elif act_type == 'PReLU': + if act_type == 'PReLU': assert hasattr(keras_layer, 'alpha'), "alpha required for PReLU." _check_data_format(keras_layer) size = len(keras_layer.alpha.shape) alpha = etab.new_const(keras_layer.get_weights()[0] \ .transpose(np.roll(range(size), 1))) return _op.negative(alpha) * _op.nn.relu(_op.negative(inexpr)) + _op.nn.relu(inexpr) - elif act_type == 'ThresholdedReLU': + if act_type == 'ThresholdedReLU': theta = keras_layer.theta if hasattr(keras_layer, 'theta') else 1. 
return _op.multiply(inexpr, _op.greater(inexpr, \ _expr.const(theta, dtype='float32')).astype('float32')) - else: - raise TypeError("Unsupported advanced activation type : {}".format(act_type)) + + raise TypeError("Unsupported advanced activation type : {}".format(act_type)) def _convert_merge(inexpr, keras_layer, _): @@ -297,31 +297,29 @@ def _convert_pooling(inexpr, keras_layer, etab): # global pool in keras = global pool + flatten in nnvm/relay if pool_type == 'GlobalMaxPooling2D': return _convert_flatten(_op.nn.global_max_pool2d(inexpr), keras_layer, etab) - elif pool_type == 'GlobalAveragePooling2D': + if pool_type == 'GlobalAveragePooling2D': return _convert_flatten(_op.nn.global_avg_pool2d(inexpr), keras_layer, etab) + pool_h, pool_w = keras_layer.pool_size + stride_h, stride_w = keras_layer.strides + params = {'pool_size': [pool_h, pool_w], + 'strides': [stride_h, stride_w], + 'padding': [0, 0]} + if keras_layer.padding == 'valid': + pass + elif keras_layer.padding == 'same': + in_h = keras_layer.input_shape[1] + in_w = keras_layer.input_shape[2] + pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h) + pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) + params['padding'] = [pad_t, pad_l, pad_b, pad_r] else: - pool_h, pool_w = keras_layer.pool_size - stride_h, stride_w = keras_layer.strides - params = {'pool_size': [pool_h, pool_w], - 'strides': [stride_h, stride_w], - 'padding': [0, 0]} - if keras_layer.padding == 'valid': - pass - elif keras_layer.padding == 'same': - in_h = keras_layer.input_shape[1] - in_w = keras_layer.input_shape[2] - pad_t, pad_b = _get_pad_pair(in_h, pool_h, stride_h) - pad_l, pad_r = _get_pad_pair(in_w, pool_w, stride_w) - params['padding'] = [pad_t, pad_l, pad_b, pad_r] - else: - raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) - if pool_type == 'MaxPooling2D': - return _op.nn.max_pool2d(inexpr, **params) - elif pool_type == 'AveragePooling2D': - params['count_include_pad'] = False - return 
_op.nn.avg_pool2d(inexpr, **params) - else: - raise TypeError("Unsupported pooling type : {}".format(keras_layer)) + raise TypeError("Unsupported padding type : {}".format(keras_layer.padding)) + if pool_type == 'MaxPooling2D': + return _op.nn.max_pool2d(inexpr, **params) + if pool_type == 'AveragePooling2D': + params['count_include_pad'] = False + return _op.nn.avg_pool2d(inexpr, **params) + raise TypeError("Unsupported pooling type : {}".format(keras_layer)) def _convert_upsample(inexpr, keras_layer, _): diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index f8b51c413193..540e139ff495 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -39,7 +39,7 @@ def _mx_fully_connected(inputs, attrs): def _get_channel_axis(layout, op_name): if layout == "NCHW": return 1 - elif layout == "NHWC": + if layout == "NHWC": return 3 raise RuntimeError("layout: {} is not supported in {}".format(layout, op_name)) @@ -49,11 +49,11 @@ def _mx_activations(inputs, attrs): assert len(inputs) == 1 if act_type == "sigmoid": return _op.sigmoid(inputs[0]) - elif act_type == "tanh": + if act_type == "tanh": return _op.tanh(inputs[0]) - elif act_type == "relu": + if act_type == "relu": return _op.nn.relu(inputs[0]) - elif act_type == "softrelu": + if act_type == "softrelu": def _stable_softrelu(x): # log(1 + exp(-abs(x))) + relu(x) one = _expr.const(1, dtype="float32") @@ -147,7 +147,7 @@ def _pool2d(new_op, is_avg): if global_pool: return _op.nn.global_max_pool2d(inputs[0]) return _pool2d(_op.nn.max_pool2d, False) - elif pool_type == "avg": + if pool_type == "avg": if global_pool: return _op.nn.global_avg_pool2d(inputs[0]) return _pool2d(_op.nn.avg_pool2d, True) @@ -209,10 +209,10 @@ def _mx_leaky_relu(inputs, attrs): act_type = attrs.get_str("act_type") if act_type == "leaky": return _op.nn.leaky_relu(inputs[0], alpha=attrs.get_float("slope", 0.25)) - elif act_type == "prelu": + if act_type == "prelu": assert 
len(inputs) == 2 return _op.nn.prelu(*inputs) - elif act_type == "elu": + if act_type == "elu": # -slope * relu(1-exp(x)) + relu(x) slope = attrs.get_float("slope", 0.25) one = _expr.const(1, dtype="float32") @@ -220,7 +220,7 @@ def _mx_leaky_relu(inputs, attrs): mslope = _op.nn.relu(_op.subtract(one, _op.exp(x))) mslope = _op.multiply(mslope, _expr.const(-slope, dtype="float32")) return _op.add(mslope, _op.nn.relu(x)) - elif act_type == "rrelu": + if act_type == "rrelu": # NOTE this is only converted for inference. lower_bound = attrs.get_float("lower_bound") upper_bound = attrs.get_float("upper_bound") diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index effe50e06981..d322da31fc19 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -18,8 +18,7 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - else: - raise NotImplementedError("Only 2d kernel supported.") + raise NotImplementedError("Only 2d kernel supported.") return _impl diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 82b4c5b9ca37..d583053dc5a6 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -175,8 +175,7 @@ def _impl(attr): kernel = attr['kernel_shape'] if len(kernel) == 2: return prefix + '2d' + surfix - else: - raise NotImplementedError("Only 2d kernel supported.") + raise NotImplementedError("Only 2d kernel supported.") return _impl def _dimension_constraint(): @@ -522,8 +521,7 @@ def _impl(inputs, attr, params): op_name="reshape", extras={'newshape':tuple(params_new.asnumpy().flatten())}, ignores=['Tshape'])(inputs, attr) - else: - raise RuntimeError("Reshape with dynamic shape input not supported yet.") + raise RuntimeError("Reshape with dynamic shape input not supported yet.") return _impl def _bias_add(): @@ -1385,7 +1383,7 @@ def _parse_param(self, key, value, name, 
shape): shape=self._params[name].shape, dtype=self._params[name].dtype)] else: - if key != 'dtype' and key != '_output_shapes' and key != '_class': + if key not in ('dtype', '_output_shapes', '_class'): raise NotImplementedError \ ("Other attributes for a Const(param) Node {} ? .".format(key)) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index d52c941e50f7..d63b470d48ab 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -126,15 +126,14 @@ def get_tensor_value(self, tensor_wrapper): if tensor_wrapper.tensor.Type() == TensorType.UINT8: return np.frombuffer(tensor_wrapper.buffer.DataAsNumpy(), dtype=np.uint8).reshape( tensor_wrapper.tensor.ShapeAsNumpy()) - elif tensor_wrapper.tensor.Type() == TensorType.FLOAT32: + if tensor_wrapper.tensor.Type() == TensorType.FLOAT32: return np.frombuffer(tensor_wrapper.buffer.DataAsNumpy(), dtype=np.float32).reshape( tensor_wrapper.tensor.ShapeAsNumpy()) - elif tensor_wrapper.tensor.Type() == TensorType.INT32: + if tensor_wrapper.tensor.Type() == TensorType.INT32: return np.frombuffer(tensor_wrapper.buffer.DataAsNumpy(), dtype=np.int32).reshape( tensor_wrapper.tensor.ShapeAsNumpy()) - else: - raise NotImplementedError("Not support tensor type {}" - .format(str(tensor_wrapper.tensor.Type()))) + raise NotImplementedError("Not support tensor type {}" + .format(str(tensor_wrapper.tensor.Type()))) def get_tensor_type_str(self, tensor_type): """Get tensor type string representation when given TFLite tensor type""" @@ -145,12 +144,11 @@ def get_tensor_type_str(self, tensor_type): if tensor_type == TensorType.UINT8: return "uint8" - elif tensor_type == TensorType.FLOAT32: + if tensor_type == TensorType.FLOAT32: return "float32" - elif tensor_type == TensorType.INT32: + if tensor_type == TensorType.INT32: return "int32" - else: - raise NotImplementedError("Not support tensor type {}".format(str(tensor_type))) + raise NotImplementedError("Not support tensor 
type {}".format(str(tensor_type))) def convert_conv2d(self, op): """Convert TFLite conv2d""" @@ -192,7 +190,7 @@ def convert_reshape(self, op): in_expr = self.get_expr(input_tensor_idx) - if input_shape_length == 1 or input_shape_length == 2: + if input_shape_length in (1, 2): # The rule is channel first (after N but before H, W). # length of 1 means N*H*W*C, do nothing. # length of 2 means N*H*W, C, do nothing. @@ -275,7 +273,7 @@ def convert_squeeze(self, op): in_expr = self.get_expr(input_tensor_idx) # TFLite is N H W C, our layout is N C H W - if input_shape_length == 1 or input_shape_length == 2: + if input_shape_length in (1, 2): # The rule is channel first (after N but before H, W). # length of 1 means N*H*W*C, do nothing. # length of 2 means N*H*W, C, do nothing. @@ -299,7 +297,7 @@ def convert_squeeze(self, op): # 3: N H W C, reshape to N H*W C, transpose to N C H*W # 4: N H W C, transpose to N C H W # add more if we need target shapes in future - if output_shape_length == 1 or output_shape_length == 2: + if output_shape_length in (1, 2): pass elif output_shape_length == 3: out = _op.transpose(out, axes=(0, 2, 1)) @@ -320,16 +318,15 @@ def convert_fused_activation_function(self, in_expr, fused_activation_fn): assert fused_activation_fn != ActivationFunctionType.NONE if fused_activation_fn == ActivationFunctionType.RELU6: return _op.clip(in_expr, a_min=0, a_max=6) - elif fused_activation_fn == ActivationFunctionType.RELU: + if fused_activation_fn == ActivationFunctionType.RELU: return _op.nn.relu(in_expr) - elif fused_activation_fn == ActivationFunctionType.RELU_N1_TO_1: + if fused_activation_fn == ActivationFunctionType.RELU_N1_TO_1: return _op.clip(in_expr, a_min=-1, a_max=1) - elif fused_activation_fn == ActivationFunctionType.TANH: + if fused_activation_fn == ActivationFunctionType.TANH: return _op.tanh(in_expr) - else: - fused_activation_fn_str = self.activation_fn_type[fused_activation_fn] - raise NotImplementedError("Unsupported fused activation fn 
{}" - .format(fused_activation_fn_str)) + fused_activation_fn_str = self.activation_fn_type[fused_activation_fn] + raise NotImplementedError("Unsupported fused activation fn {}" + .format(fused_activation_fn_str)) def convert_conv(self, op, conv_type): """convolution implementation.""" @@ -401,7 +398,7 @@ def convert_conv(self, op, conv_type): # weight tensor type should be UINT8 (quantization) or FLOAT32 weight_tensor_type = weight_tensor.tensor.Type() - assert weight_tensor_type == TensorType.UINT8 or weight_tensor_type == TensorType.FLOAT32 + assert weight_tensor_type in (TensorType.UINT8, TensorType.FLOAT32) weight_tensor_type_str = self.get_tensor_type_str(weight_tensor_type) in_expr = self.get_expr(input_tensor_idx) @@ -434,7 +431,7 @@ def convert_conv(self, op, conv_type): bias_tensor = input_tensors[2] bias_tensor_type = bias_tensor.tensor.Type() # bias tensor type should be INT32 (quantization) or FLOAT32 - assert bias_tensor_type == TensorType.INT32 or bias_tensor_type == TensorType.FLOAT32 + assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32) bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type) bias_expr = self.exp_tab.new_const(self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str) diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index 5d4cda162ee3..a4b41d92371e 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -57,7 +57,7 @@ def compute_conv2d(attrs, inputs, out_type, target): layout = attrs.data_layout kernel_layout = attrs.kernel_layout out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if (out_dtype == "same" or out_dtype == "") + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") else out_dtype) assert layout in ["NCHW", "NHWC", "NCHW4c"] @@ -95,15 +95,15 @@ def schedule_conv2d(attrs, outs, target): with target: if groups == 1 and layout == "NCHW": return topi.generic.schedule_conv2d_nchw(outs) - elif groups == 1 and layout == "NCHW4c": + if 
groups == 1 and layout == "NCHW4c": return topi.generic.schedule_conv2d_nchw(outs) - elif groups == 1 and layout == "NHWC": + if groups == 1 and layout == "NHWC": return topi.generic.schedule_conv2d_nhwc(outs) - elif groups != 1: + if groups != 1: if layout == "NCHW": # TODO(leyuan, merrymercy, Huyuwei): fold depthwise topi into conv2d. return topi.generic.schedule_depthwise_conv2d_nchw(outs) - elif layout == "NHWC" and kernel_layout == "HWOI": + if layout == "NHWC" and kernel_layout == "HWOI": return topi.generic.schedule_depthwise_conv2d_nhwc(outs) raise ValueError("No compatible schedule") @@ -127,7 +127,7 @@ def compute_conv2d_transpose(attrs, inputs, out_dtype, target): groups = attrs.groups layout = attrs.data_layout out_dtype = attrs.out_dtype - out_dtype = (inputs[0].dtype if (out_dtype == "same" or out_dtype == "") + out_dtype = (inputs[0].dtype if out_dtype in ("same", "") else out_dtype) assert layout == "NCHW", "only support nchw for now" assert dilation == (1, 1), "not support dilate now" diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index d6d73242bb96..5fa83bd96f30 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -6,19 +6,18 @@ @register_relay_attr_node class Conv2DAttrs(Attrs): """Attribute of nn.conv2d""" - pass + @register_relay_attr_node class Conv2DWinogradAttrs(Attrs): """Attribute of nn.contrib_conv2d_winograd_without_weight_transform""" - pass + @register_relay_attr_node class Conv2DWinogradWeightTransformAttrs(Attrs): """Attribute of nn.contrib_conv2d_winograd_weight_transform""" - pass + @register_relay_attr_node class GlobalPool2DAttrs(Attrs): """Attribute of nn.global_pool""" - pass diff --git a/python/tvm/relay/testing/inception_v3.py b/python/tvm/relay/testing/inception_v3.py index 491b221fbe0a..7ac3ca35a0bd 100644 --- a/python/tvm/relay/testing/inception_v3.py +++ b/python/tvm/relay/testing/inception_v3.py @@ -29,11 +29,10 @@ def Conv(data, num_filter, kernel=(1, 1), 
stride=(1, 1), pad=(0, 0), name=None, def Pooling(data, kernel, stride, pad, pool_type, name): if pool_type == 'max': return relay.nn.max_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad) - elif pool_type == 'avg': + if pool_type == 'avg': return relay.nn.avg_pool2d(data=data, pool_size=kernel, strides=stride, padding=pad, count_include_pad=True) - else: - raise ValueError("Invalid pooling type: " + pool_type) + raise ValueError("Invalid pooling type: " + pool_type) def Inception7A(data, num_1x1, diff --git a/python/tvm/relay/ty.py b/python/tvm/relay/ty.py index 1cfa96aa7213..96ade4124a00 100644 --- a/python/tvm/relay/ty.py +++ b/python/tvm/relay/ty.py @@ -172,7 +172,6 @@ def __init__(self, func, args): @register_relay_node class TypeConstraint(Type): """Abstract class representing a type constraint.""" - pass @register_relay_node diff --git a/python/tvm/rpc/proxy.py b/python/tvm/rpc/proxy.py index cefffbfa9668..7f01fb1b7b02 100644 --- a/python/tvm/rpc/proxy.py +++ b/python/tvm/rpc/proxy.py @@ -389,7 +389,7 @@ def _handler_ready_proxy_mode(self, handler): if key in pool_src: self._pair_up(pool_src.pop(key), handler) return - elif key not in pool_dst: + if key not in pool_dst: pool_dst[key] = handler def cleanup(): """Cleanup client connection if timeout""" diff --git a/python/tvm/rpc/tornado_util.py b/python/tvm/rpc/tornado_util.py index eafea2e85394..cc0398182a0e 100644 --- a/python/tvm/rpc/tornado_util.py +++ b/python/tvm/rpc/tornado_util.py @@ -95,9 +95,8 @@ def _update_read(self): if msg: self.on_message(msg) return True - else: - # normal close, remote is closed - self.close() + # normal close, remote is closed + self.close() except socket.error as err: if err.args[0] in (errno.EAGAIN, errno.EWOULDBLOCK): pass diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py index 1a06ed81ae4f..5644775ca416 100644 --- a/python/tvm/rpc/tracker.py +++ b/python/tvm/rpc/tracker.py @@ -86,7 +86,7 @@ def remove(self, value): value: object The 
resource to remove """ - pass + def summary(self): """Get summary information of the scheduler.""" diff --git a/python/tvm/schedule.py b/python/tvm/schedule.py index 6c261a453457..e772735b5bfb 100644 --- a/python/tvm/schedule.py +++ b/python/tvm/schedule.py @@ -143,19 +143,16 @@ def vstore(self, begin, value): @register_node class Split(NodeBase): """Split operation on axis.""" - pass @register_node class Fuse(NodeBase): """Fuse operation on axis.""" - pass @register_node class Singleton(NodeBase): """Singleton axis.""" - pass @register_node diff --git a/python/tvm/stmt.py b/python/tvm/stmt.py index 48d91dfa8044..f06958ab78ee 100644 --- a/python/tvm/stmt.py +++ b/python/tvm/stmt.py @@ -381,7 +381,7 @@ def stmt_list(stmt): """ if isinstance(stmt, Block): return stmt_list(stmt.first) + stmt_list(stmt.rest) - elif isinstance(stmt, ProducerConsumer): + if isinstance(stmt, ProducerConsumer): return stmt_list(stmt.body) return [stmt] diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py index e1345ad373bf..6e7a2b357a96 100644 --- a/python/tvm/tensor.py +++ b/python/tvm/tensor.py @@ -33,7 +33,6 @@ def dtype(self): @register_node class TensorIntrinCall(NodeBase): """Intermediate structure for calling a tensor intrinsic.""" - pass itervar_cls = None @@ -144,7 +143,6 @@ def input_tensors(self): @register_node class PlaceholderOp(Operation): """Placeholder operation.""" - pass @register_node @@ -164,7 +162,6 @@ def reduce_axis(self): @register_node class TensorComputeOp(Operation): """Tensor operation.""" - pass @register_node @@ -179,7 +176,7 @@ def scan_axis(self): @register_node class ExternOp(Operation): """Extern operation.""" - pass + @register_node class HybridOp(Operation): diff --git a/topi/python/topi/arm_cpu/bitserial_conv2d.py b/topi/python/topi/arm_cpu/bitserial_conv2d.py index cd6810af8177..ffef3ce81b98 100644 --- a/topi/python/topi/arm_cpu/bitserial_conv2d.py +++ b/topi/python/topi/arm_cpu/bitserial_conv2d.py @@ -61,7 +61,7 @@ def 
_declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits if out_dtype is None: out_dtype = data.dtype assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" - assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + assert layout in ("NCHW", "NHWC"), "only support layouts NCHW and NHWC" if dorefa: assert layout == "NCHW", "Cannot support dorea with NHWC layout yet" wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py index e402d808096a..fe77762b3ce9 100644 --- a/topi/python/topi/arm_cpu/conv2d.py +++ b/topi/python/topi/arm_cpu/conv2d.py @@ -562,7 +562,7 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F): data_layout_key = "data_layout" if "data_layout" in new_attrs else "layout" layout = attrs[data_layout_key] out_dtype = attrs["out_dtype"] - if out_dtype == "" or out_dtype == "same": + if out_dtype in ("same", ""): out_dtype = tinfos[0].dtype if layout != 'NCHW': diff --git a/topi/python/topi/cuda/conv2d.py b/topi/python/topi/cuda/conv2d.py index 2b0f59ab8510..ca456134a6ce 100644 --- a/topi/python/topi/cuda/conv2d.py +++ b/topi/python/topi/cuda/conv2d.py @@ -92,10 +92,9 @@ def conv2d_cuda(cfg, data, kernel, strides, padding, dilation, layout='NCHW', ou if layout == 'NCHW': return nn.conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) - elif layout == 'HWCN': + if layout == 'HWCN': return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) - else: - raise ValueError("not support this layout {} yet".format(layout)) + raise ValueError("not support this layout {} yet".format(layout)) @autotvm.register_topi_schedule(generic.schedule_conv2d_nchw, ["cuda", "gpu"], diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py index 2f2d0deab69d..a8d961ae062e 100644 --- a/topi/python/topi/cuda/conv2d_winograd.py +++ 
b/topi/python/topi/cuda/conv2d_winograd.py @@ -370,7 +370,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): data_layout_key = "data_layout" if "data_layout" in new_attrs else "layout" layout = attrs[data_layout_key] out_dtype = attrs["out_dtype"] - if out_dtype == "" or out_dtype == "same": + if out_dtype in ("", "same"): out_dtype = tinfos[0].dtype data, kernel = tinfos[0:2] @@ -436,7 +436,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): ) dispatch_ctx.update(target, new_workload, cfg) return F.nn.contrib_conv2d_winograd_without_weight_transform(*copy_inputs, **new_attrs) - elif groups != CI: + if groups != CI: workload = autotvm.task.args_to_workload( [tinfos[0], tinfos[1], strides, padding, dilation, groups, out_dtype], group_conv2d_nchw) diff --git a/topi/python/topi/cuda/reduction.py b/topi/python/topi/cuda/reduction.py index 4c5d1a507660..e8c029d4a871 100644 --- a/topi/python/topi/cuda/reduction.py +++ b/topi/python/topi/cuda/reduction.py @@ -96,7 +96,7 @@ def traverse_before_reduce(operator): """Internal travserse function""" if isinstance(operator, tvm.tensor.PlaceholderOp): return - elif tag.is_injective(operator.tag): + if tag.is_injective(operator.tag): sch[operator].compute_inline() for tensor in operator.input_tensors: if tensor.op not in scheduled_ops: diff --git a/topi/python/topi/nn/bitserial_conv2d.py b/topi/python/topi/nn/bitserial_conv2d.py index 545ad2f38ae5..d41a99a04a9d 100644 --- a/topi/python/topi/nn/bitserial_conv2d.py +++ b/topi/python/topi/nn/bitserial_conv2d.py @@ -92,14 +92,14 @@ def bitserial_conv2d(data, kernel, stride, padding, activation_bits, weight_bits if layout == 'NCHW': return spatial_pack_nchw(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) - elif layout == 'NHWC': + if layout == 'NHWC': return spatial_pack_nhwc(data, kernel, stride, padding, activation_bits, weight_bits, pack_dtype=pack_dtype, out_dtype=out_dtype, dorefa=dorefa) raise 
ValueError("not support this layout {} yet".format(layout)) def _get_workload(data, kernel, stride, padding, out_dtype, layout): """ Get the workload structure. """ - assert layout == "NCHW" or layout == "NHWC", \ + assert layout in ("NCHW", "NHWC"), \ "Only support layouts NCHW and NHWC" if layout == "NCHW": _, CI, IH, IW = [x.value for x in data.shape] diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 977b80678524..559f132f19c2 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -48,12 +48,11 @@ def conv2d(input, filter, strides, padding, dilation, layout='NCHW', out_dtype=N # default declaration if layout == 'NCHW': return conv2d_nchw(input, filter, strides, padding, dilation, out_dtype) - elif layout == 'HWCN': + if layout == 'HWCN': return conv2d_hwcn(input, filter, strides, padding, dilation, out_dtype) - elif layout == 'NHWC': + if layout == 'NHWC': return conv2d_nhwc(input, filter, strides, padding, dilation, out_dtype) - else: - raise ValueError("not support this layout {} yet".format(layout)) + raise ValueError("not support this layout {} yet".format(layout)) @tvm.target.generic_func diff --git a/topi/python/topi/testing/upsampling_python.py b/topi/python/topi/testing/upsampling_python.py index fc4ad652f900..341dd8f6ceb0 100644 --- a/topi/python/topi/testing/upsampling_python.py +++ b/topi/python/topi/testing/upsampling_python.py @@ -17,12 +17,11 @@ def upsampling_python(data, scale, layout='NCHW'): for c in range(oshape[1]): output_np[b, c, :, :] = upsample_nearest(data[b, c, :, :], scale) return output_np - elif layout == 'NHWC': + if layout == 'NHWC': oshape = (ishape[0], ishape[1]*scale, ishape[1]*scale, ishape[3]) output_np = np.zeros(oshape, dtype=data.dtype) for b in range(oshape[0]): for c in range(oshape[3]): output_np[b, :, :, c] = upsample_nearest(data[b, :, :, c], scale) return output_np - else: - raise ValueError("not support this layout {} yet".format(layout)) + raise 
ValueError("not support this layout {} yet".format(layout)) diff --git a/topi/python/topi/x86/bitserial_conv2d.py b/topi/python/topi/x86/bitserial_conv2d.py index 0b864c383ca4..327f15a49e07 100644 --- a/topi/python/topi/x86/bitserial_conv2d.py +++ b/topi/python/topi/x86/bitserial_conv2d.py @@ -59,7 +59,7 @@ def _declaration_bitserial_conv2d(data, kernel, stride, padding, activation_bits if out_dtype is None: out_dtype = data.dtype assert data.shape[0].value == 1, "only support batch size=1 convolution on rasp" - assert layout == "NCHW" or layout == "NHWC", "only support layouts NCHW and NHWC" + assert layout in ("NCHW", "NHWC"), "only support layouts NCHW and NHWC" wkl = _get_workload(data, kernel, stride, padding, out_dtype, layout) sch = _get_schedule(wkl, layout) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index eefa5fec80df..e8ccee8bd818 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -71,12 +71,11 @@ def _declaration_conv(cfg, data, kernel, strides, padding, dilation, layout, out _get_default_config(cfg, data, kernel, strides, padding, out_dtype) return _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout, out_dtype) - elif layout == 'HWCN': + if layout == 'HWCN': return nn.conv2d_hwcn(data, kernel, strides, padding, dilation, out_dtype) - elif layout == 'NHWC': + if layout == 'NHWC': return nn.conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) - else: - raise ValueError("not support this layout {} yet".format(layout)) + raise ValueError("not support this layout {} yet".format(layout)) def _declaration_conv_impl(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): diff --git a/vta/python/vta/environment.py b/vta/python/vta/environment.py index a77e29ac3a52..3a145be590d5 100644 --- a/vta/python/vta/environment.py +++ b/vta/python/vta/environment.py @@ -223,10 +223,9 @@ def target_host(self): """The target host""" if self.TARGET == "pynq": 
return "llvm -target=armv7-none-linux-gnueabihf" - elif self.TARGET == "sim": + if self.TARGET == "sim": return "llvm" - else: - raise ValueError("Unknown target %s" % self.TARGET) + raise ValueError("Unknown target %s" % self.TARGET) def get_env(): diff --git a/vta/python/vta/graph.py b/vta/python/vta/graph.py index 7f2a26fdc4bf..0b746e0458af 100644 --- a/vta/python/vta/graph.py +++ b/vta/python/vta/graph.py @@ -169,7 +169,7 @@ def _clean_cast(node, target_type): op_name = node.attr("op_name") if op_name == "cast": return _clean_cast(node.get_children(), target_type) - elif op_name == "relu": + if op_name == "relu": data, has_clip = _clean_cast( node.get_children(), target_type) data = nnvm.sym.relu(data) diff --git a/vta/python/vta/intrin.py b/vta/python/vta/intrin.py index b366287568e7..8255b8b7df2e 100644 --- a/vta/python/vta/intrin.py +++ b/vta/python/vta/intrin.py @@ -64,7 +64,7 @@ def instr(index): dev.get_task_qid(dev.QID_COMPUTE)) irb.scope_attr(dev.vta_axis, "coproc_uop_scope", dev.vta_push_uop) - if index == 0 or index == 2: + if index in (0, 2): irb.emit(tvm.call_extern( "int32", "VTAUopPush", 0, 0, diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py index c21ca6ed5bf4..379e1e24fa24 100644 --- a/vta/python/vta/ir_pass.py +++ b/vta/python/vta/ir_pass.py @@ -77,10 +77,9 @@ def _post_order(op): args.append(m[1]) args += op.args[base_args+3:] return tvm.call_extern("int32", "VTAUopPush", *args) - else: - if op.name not in ("VTATLSCommandHandle", "tvm_thread_context"): - raise RuntimeError("unexpected op %s" % op) - return op + if op.name not in ("VTATLSCommandHandle", "tvm_thread_context"): + raise RuntimeError("unexpected op %s" % op) + return op ret = tvm.ir_pass.IRTransform( stmt.body, None, _post_order, ["Call"]) @@ -165,22 +164,21 @@ def _post_order(op): op.condition, let_stmt) del rw_info[buffer_var] return alloc - elif isinstance(op, tvm.expr.Load): + if isinstance(op, tvm.expr.Load): buffer_var = op.buffer_var if not buffer_var in 
rw_info: rw_info[buffer_var] = tvm.var( buffer_var.name + "_ptr", "handle") new_var = rw_info[buffer_var] return tvm.make.Load(op.dtype, new_var, op.index) - elif isinstance(op, tvm.stmt.Store): + if isinstance(op, tvm.stmt.Store): buffer_var = op.buffer_var if not buffer_var in rw_info: rw_info[buffer_var] = tvm.var( buffer_var.name + "_ptr", "handle") new_var = rw_info[buffer_var] return tvm.make.Store(new_var, op.value, op.index) - else: - raise RuntimeError("not reached") + raise RuntimeError("not reached") stmt = tvm.ir_pass.IRTransform( stmt_in, None, _post_order, ["Allocate", "Load", "Store"]) for buffer_var, new_var in rw_info.items(): @@ -233,23 +231,20 @@ def _pre_order(op): if op.attr_key == "virtual_thread": lift_stmt.append([]) - return None - def _post_order(op): if isinstance(op, tvm.stmt.Allocate): lift_stmt[-1].append(op) return op.body - elif isinstance(op, tvm.stmt.AttrStmt): + if isinstance(op, tvm.stmt.AttrStmt): if op.attr_key == "storage_scope": lift_stmt[-1].append(op) return op.body - elif op.attr_key == "virtual_thread": + if op.attr_key == "virtual_thread": return _merge_block(lift_stmt.pop() + [op], op.body) return op - elif isinstance(op, tvm.stmt.For): + if isinstance(op, tvm.stmt.For): return _merge_block(lift_stmt.pop() + [op], op.body) - else: - raise RuntimeError("not reached") + raise RuntimeError("not reached") stmt = tvm.ir_pass.IRTransform( stmt_in, _pre_order, _post_order, ["Allocate", "AttrStmt", "For"]) assert len(lift_stmt) == 1 @@ -297,7 +292,7 @@ def _do_fold(stmt): sync = tvm.make.Call( "int32", "vta.coproc_sync", [], tvm.expr.Call.Intrinsic, None, 0) return tvm.make.Block(stmt.body, tvm.make.Evaluate(sync)) - elif _match_pragma(stmt, "trim_loop"): + if _match_pragma(stmt, "trim_loop"): op = stmt.body assert isinstance(op, tvm.stmt.For) return tvm.make.For( @@ -584,7 +579,7 @@ def _do_fold(stmt): tvm.make.StringImm("VTAPushALUOp")) irb.emit(stmt) return irb.get() - elif _match_pragma(stmt, "skip_alu"): + if 
_match_pragma(stmt, "skip_alu"): return tvm.make.Evaluate(0) return stmt diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py index ab06cadf8247..2fd11a887da0 100644 --- a/vta/python/vta/top/vta_conv2d.py +++ b/vta/python/vta/top/vta_conv2d.py @@ -293,10 +293,9 @@ def schedule_conv2d(attrs, outs, target): target = tvm.target.create(target) if target.device_name == "vta": return schedule_packed_conv2d(outs) - elif str(target).startswith("llvm"): + if str(target).startswith("llvm"): return tvm.create_schedule([x.op for x in outs]) - else: - raise RuntimeError("not support target %s" % target) + raise RuntimeError("not support target %s" % target) return _nn.schedule_conv2d(attrs, outs, target) From 9284d6eaf4f48239c2b74ecf52a955157e6da1d8 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Thu, 21 Feb 2019 22:14:08 -0800 Subject: [PATCH 04/93] add MXNet converter for where operator for both NNVM and Relay (#2647) --- nnvm/python/nnvm/frontend/mxnet.py | 2 +- .../python/frontend/mxnet/test_forward.py | 42 +++++++++++++++++-- python/tvm/relay/frontend/mxnet.py | 1 + tests/python/frontend/mxnet/test_forward.py | 39 +++++++++++++++++ 4 files changed, 80 insertions(+), 4 deletions(-) diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 8c92cb99f37c..bdea6bb10fbc 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -317,7 +317,7 @@ def _argmin(inputs, attrs): 'flatten', 'log', 'log_softmax', 'max', 'min', 'negative', 'ones_like', 'relu', 'sigmoid', 'slice_like', 'softmax', 'sum', 'tanh', 'transpose', 'zeros_like', 'gather_nd', - 'reshape_like'] + 'reshape_like', 'where'] _convert_map = { '_copy' : _rename('copy'), diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index 66ae9d6e9de4..e9225a4c7c50 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ 
-158,7 +158,7 @@ def test_forward_ones(): ones = mx.sym.ones(shape=(2, 3, 4), dtype='float32') mx_sym = mx.sym.elemwise_add(data, ones) verify_mxnet_frontend_impl(mx_sym, (2, 3, 4), (2, 3, 4)) - + def test_forward_zeros(): data = mx.sym.var('data') zeros = mx.sym.zeros(shape=(2, 3, 4), dtype='float32') @@ -184,7 +184,42 @@ def test_forward_argmin(): data = mx.sym.var('data') mx_sym = mx.sym.argmin(data, axis=0) verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,)) - + +def test_forward_where(): + cond = mx.sym.var('cond') + x = mx.sym.var('x') + y = mx.sym.var('y') + dshape = (2, 2) + dtype = 'float32' + mx_sym = mx.sym.where(cond, x, y) + np_cond = np.array([[0, 1], [-1, 0]]).astype(dtype) + np_x = np.random.uniform(size=dshape).astype(dtype) + np_y = np.random.uniform(size=dshape).astype(dtype) + mx_cond = mx.nd.array(np_cond) + mx_x = mx.nd.array(np_x) + mx_y = mx.nd.array(np_y) + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['cond', 'x', 'y']) + mod.bind(data_shapes=[('cond', dshape), ('x', dshape), ('y', dshape)], for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy() + out_shape = dshape + new_sym, params = frontend.from_mxnet(mx_sym, args, auxs) + shape_dict = {'cond': dshape, 'x': dshape, 'y': dshape} + for target, ctx in ctx_list(): + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("cond", tvm.nd.array(np_cond)) + m.set_input("x", tvm.nd.array(np_x)) + m.set_input("y", tvm.nd.array(np_y)) + m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + if __name__ == '__main__': test_forward_mlp() test_forward_vgg() @@ -206,4 +241,5 @@ def test_forward_argmin(): test_forward_zeros_like() 
test_forward_argmax() test_forward_argmin() - + test_forward_where() + diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 540e139ff495..3a0885a3fcdf 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -290,6 +290,7 @@ def _mx_roi_align(inputs, attrs): "slice_like", "zeros_like", "ones_like", + "where", ] _convert_map = { diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 81a12b041ed7..e1f7e5509230 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -190,6 +190,44 @@ def test_forward_argmin(): mx_sym = mx.sym.argmin(data, axis=0) verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,)) +def test_forward_where(): + cond = mx.sym.var('cond') + x = mx.sym.var('x') + y = mx.sym.var('y') + dshape = (2, 2) + dtype = 'float32' + mx_sym = mx.sym.where(cond, x, y) + np_cond = np.array([[0, 1], [-1, 0]]).astype(dtype) + np_x = np.random.uniform(size=dshape).astype(dtype) + np_y = np.random.uniform(size=dshape).astype(dtype) + mx_cond = mx.nd.array(np_cond) + mx_x = mx.nd.array(np_x) + mx_y = mx.nd.array(np_y) + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['cond', 'x', 'y']) + mod.bind(data_shapes=[('cond', dshape), ('x', dshape), ('y', dshape)], for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy() + out_shape = dshape + shape_dict = {'cond': dshape, 'x': dshape, 'y': dshape} + new_sym, params = relay.frontend.from_mxnet(mx_sym, + shape_dict, + arg_params=args, + aux_params=auxs) + for target, ctx in ctx_list(): + with relay.build_config(opt_level=3): + graph, lib, params = relay.build(new_sym, target, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("cond", tvm.nd.array(np_cond)) + m.set_input("x", tvm.nd.array(np_x)) + m.set_input("y", tvm.nd.array(np_y)) + 
m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + if __name__ == '__main__': test_forward_mlp() @@ -212,3 +250,4 @@ def test_forward_argmin(): test_forward_zeros_like() test_forward_argmax() test_forward_argmin() + test_forward_where() From b84379ad3ae424390afd0f4aa34269e15e61f9cd Mon Sep 17 00:00:00 2001 From: eqy Date: Thu, 21 Feb 2019 22:29:52 -0800 Subject: [PATCH 05/93] [Quantization][RELAY] Add check against NCHWc ops in the quantization pass (#2646) * check in * fix typo * fix typo * change message * change message * typo * lint --- python/tvm/relay/quantize/_annotate.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 7eb8af57a70b..912aa9a0a23c 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -1,6 +1,7 @@ #pylint: disable=unused-argument """Internal module for registering attribute for annotation.""" from __future__ import absolute_import +import warnings import topi from . import _quantize @@ -118,6 +119,14 @@ def attach_simulated_quantize(data, kind, sign=True, rounding="round"): data, dom_scale, clip_min, clip_max, kind, sign, rounding) +@register_annotate_function("nn.contrib_conv2d_NCHWc") +def conv2d_nchwc_rewrite(ref_call, new_args, ctx): + warnings.warn("NCHWc layout Conv2D detected, please use a lower " + "optimization level before applying the quantization " + "pass as quantization will have no effect here...") + return None + + @register_annotate_function("nn.conv2d") def conv2d_rewrite(ref_call, new_args, ctx): """Rewrite function for conv2d. 
Lhs of conv will be quantized to From 8bb160a89dc88c95b034f796f1373feca1f76000 Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Fri, 22 Feb 2019 18:03:18 +0000 Subject: [PATCH 06/93] Stop pylint complaining about useless import alias. (#2655) Recent pylint warngs about import renames with no effect. Remove them. --- python/tvm/rpc/client.py | 2 +- vta/python/vta/ir_pass.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/rpc/client.py b/python/tvm/rpc/client.py index c975ec64aa76..cf2e118f06c4 100644 --- a/python/tvm/rpc/client.py +++ b/python/tvm/rpc/client.py @@ -9,7 +9,7 @@ from . import base from ..contrib import util from .._ffi.base import TVMError -from .._ffi import function as function +from .._ffi import function from .._ffi import ndarray as nd from ..module import load as _load_module diff --git a/vta/python/vta/ir_pass.py b/vta/python/vta/ir_pass.py index 379e1e24fa24..9800cc6472b3 100644 --- a/vta/python/vta/ir_pass.py +++ b/vta/python/vta/ir_pass.py @@ -3,7 +3,7 @@ from __future__ import absolute_import as _abs import tvm -from topi import util as util +from topi import util from .environment import get_env From 3adb276f007d7a328059d88c019d89e3f7779bd7 Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Fri, 22 Feb 2019 18:04:36 +0000 Subject: [PATCH 07/93] Explicitly disable pylint warning subprocess-popen-preexec-fn (#2656) --- python/tvm/rpc/server.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py index f0cdd14abce6..2bc1ae7fde92 100644 --- a/python/tvm/rpc/server.py +++ b/python/tvm/rpc/server.py @@ -331,6 +331,15 @@ def __init__(self, if silent: cmd += ["--silent"] + # prexec_fn is not thread safe and may result in deadlock. + # python 3.2 introduced the start_new_session parameter as + # an alternative to the common use case of + # prexec_fn=os.setsid. 
Once the minimum version of python + # supported by TVM reaches python 3.2 this code can be + # rewritten in favour of start_new_session. In the + # interim, stop the pylint diagnostic. + # + # pylint: disable=subprocess-popen-preexec-fn self.proc = subprocess.Popen(cmd, preexec_fn=os.setsid) time.sleep(0.5) elif not is_proxy: From 5876fc990060d99c2588c1d20acff24b43d0029d Mon Sep 17 00:00:00 2001 From: Yida Wang Date: Fri, 22 Feb 2019 10:05:23 -0800 Subject: [PATCH 08/93] [RELAY][PASS]use attribute registration style in the mac count pass (#2645) --- src/relay/pass/mac_count.cc | 172 +++++++++++----------- tests/python/relay/test_pass_mac_count.py | 1 - 2 files changed, 82 insertions(+), 91 deletions(-) diff --git a/src/relay/pass/mac_count.cc b/src/relay/pass/mac_count.cc index 5709d7d0ea31..500312117c5b 100644 --- a/src/relay/pass/mac_count.cc +++ b/src/relay/pass/mac_count.cc @@ -16,19 +16,88 @@ namespace tvm { namespace relay { -namespace { +namespace mac_count { -bool IsConv2DNode(const ExprNode* node) { - const auto* call_node = dynamic_cast(node); - return call_node != nullptr && call_node->attrs.as(); +inline int64_t GetCartesianProd(Array arr) { + int64_t ret = 1; + for (size_t i = 0; i < arr.size(); i++) { + const auto* intImm = arr[i].as(); + ret *= static_cast(intImm->value); + } + return ret; +} + +/* + * \brief Preparation function for MAC count. + * \param call_node The call node. + * \return The number of MACs. 
+ */ +using FMacCount = runtime::TypedPackedFunc< + int64_t(const Call& call_node)>; + +//---------------------------------------------- +// Per operator defs for MAC count +//---------------------------------------------- + +int64_t ConvMacCount(const Call& call_node) { + if (!call_node->checked_type_.defined()) { + LOG(WARNING) << "The infer type pass should be called before the mac count pass"; + return 0; + } + Array args = call_node->args; + CHECK(args.size() == 2) + << "The number of input arguments of a CONV 2D node should be 2."; + const auto* conv_2d_attr = call_node->attrs.as(); + const auto* data_type = args[0]->checked_type().as(); + Array data_shape = data_type->shape; + std::string data_layout = conv_2d_attr->data_layout; + int32_t C_ind = Layout(data_layout).Indexof('C'); + int32_t c_ind = Layout(data_layout).Indexof('c'); + CHECK(C_ind != -1) + << "There is no input channel dimension."; + int64_t input_channel = static_cast(data_shape[C_ind].as()->value); + if (c_ind != -1) + input_channel *= static_cast(data_shape[c_ind].as()->value); + Array kernel_size = conv_2d_attr->kernel_size; + CHECK(kernel_size.size() == 2) + << "The dimension of the kernel size in Conv 2D should be 2."; + const auto* expr = call_node->checked_type().as(); + Array output_tensor = expr->shape; + CHECK(output_tensor.size() == 4 || output_tensor.size() == 5) + << "The dimension of the output tensor in Conv 2D should be 4 or 5."; + int64_t count = input_channel * GetCartesianProd(output_tensor) * GetCartesianProd(kernel_size); + return count; } -bool IsDenseNode(const ExprNode* node) { - const auto* call_node = dynamic_cast(node); - return call_node != nullptr && call_node->attrs.as(); +int64_t DenseMacCount(const Call& call_node) { + if (!call_node->checked_type_.defined()) { + LOG(WARNING) << "The infer type pass should be called before the mac count pass"; + return 0; + } + Array args = call_node->args; + CHECK(args.size() == 2) + << "The number of input arguments of a Dense 
node should be 2."; + const auto* data_type = args[0]->checked_type().as(); + const auto* weight_type = args[1]->checked_type().as(); + Array data_shape = data_type->shape; + Array weight_shape = weight_type->shape; + CHECK(data_shape.size() == 2 && weight_shape.size() == 2) + << "The dimension of an input tensor to Dense node should be 2."; + int64_t d1 = static_cast(data_shape[0].as()->value); + int64_t d2 = static_cast(data_shape[1].as()->value); + int64_t d3 = static_cast(weight_shape[0].as()->value); + int64_t d4 = static_cast(weight_shape[1].as()->value); + CHECK(d2 == d4) + << "The dimensions of input arguments do not match."; + int64_t count = d1 * d2 * d3; + return count; } -} // namespace +RELAY_REGISTER_OP("nn.conv2d") +.set_attr("FMacCount", ConvMacCount); + +RELAY_REGISTER_OP("nn.dense") +.set_attr("FMacCount", DenseMacCount); class MacCounter : private ExprVisitor { public: @@ -44,91 +113,13 @@ class MacCounter : private ExprVisitor { private: void VisitExpr_(const CallNode* call_node) final { - if (IsConv2DNode(call_node)) { - count_ += ComputeConv2DMacs(call_node); - } else if (IsDenseNode(call_node)) { - count_ += ComputeDenseMacs(call_node); - } + static const auto& fprep = + Op::GetAttr("FMacCount"); + auto f = fprep.get(call_node->op, nullptr); + if (f != nullptr) count_ += f(GetRef(call_node)); ExprVisitor::VisitExpr_(call_node); } - /* - * \brief Get the number of MACs of a CONV 2D node. - * \param call_node The CONV 2D call node. - * \return The number of MACs. 
- */ - int64_t ComputeConv2DMacs(const CallNode* call_node) { - CHECK(IsConv2DNode(call_node)) - << "The input call node must be a CONV 2D node."; - if (!call_node->checked_type_.defined()) { - LOG(WARNING) << "The infer type pass should be called before the mac count pass"; - return 0; - } - Array args = call_node->args; - CHECK(args.size() == 2) - << "The number of input arguments of a CONV 2D node should be 2."; - const auto* conv_2d_attr = call_node->attrs.as(); - const auto* data_type = args[0]->checked_type().as(); - Array data_shape = data_type->shape; - std::string data_layout = conv_2d_attr->data_layout; - int32_t C_ind = Layout(data_layout).Indexof('C'); - int32_t c_ind = Layout(data_layout).Indexof('c'); - CHECK(C_ind != -1) - << "There is no input channel dimension."; - int64_t input_channel = static_cast(data_shape[C_ind].as()->value); - if (c_ind != -1) - input_channel *= static_cast(data_shape[c_ind].as()->value); - Array kernel_size = conv_2d_attr->kernel_size; - CHECK(kernel_size.size() == 2) - << "The dimension of the kernel size in Conv 2D should be 2."; - const auto* expr = call_node->checked_type().as(); - Array output_tensor = expr->shape; - CHECK(output_tensor.size() == 4 || output_tensor.size() == 5) - << "The dimension of the output tensor in Conv 2D should be 4 or 5."; - int64_t count = input_channel * GetCartesianProd(output_tensor) * GetCartesianProd(kernel_size); - return count; - } - - /* - * \brief Get the number of MACs of a Dense node. - * \param call_node The Dense call node. - * \return The number of MACs. 
- */ - int64_t ComputeDenseMacs(const CallNode* call_node) { - CHECK(IsDenseNode(call_node)) - << "The input call node must be a Dense node."; - if (!call_node->checked_type_.defined()) { - LOG(WARNING) << "The infer type pass should be called before the mac count pass"; - return 0; - } - Array args = call_node->args; - CHECK(args.size() == 2) - << "The number of input arguments of a Dense node should be 2."; - const auto* data_type = args[0]->checked_type().as(); - const auto* weight_type = args[1]->checked_type().as(); - Array data_shape = data_type->shape; - Array weight_shape = weight_type->shape; - CHECK(data_shape.size() == 2 && weight_shape.size() == 2) - << "The dimension of an input tensor to Dense node should be 2."; - int64_t d1 = static_cast(data_shape[0].as()->value); - int64_t d2 = static_cast(data_shape[1].as()->value); - int64_t d3 = static_cast(weight_shape[0].as()->value); - int64_t d4 = static_cast(weight_shape[1].as()->value); - CHECK(d2 == d4) - << "The dimensions of input arguments do not match."; - int64_t count = d1 * d2 * d3; - return count; - } - - int64_t GetCartesianProd(Array arr) { - int64_t ret = 1; - for (size_t i = 0; i < arr.size(); i++) { - const auto* intImm = arr[i].as(); - ret *= static_cast(intImm->value); - } - return ret; - } - int64_t count_; }; @@ -141,5 +132,6 @@ TVM_REGISTER_API("relay._ir_pass.GetTotalMacNumber") *ret = GetTotalMacNumber(args[0]); }); +} // namespace mac_count } // namespace relay } // namespace tvm diff --git a/tests/python/relay/test_pass_mac_count.py b/tests/python/relay/test_pass_mac_count.py index 56a0f5490cac..0c0144e246d3 100644 --- a/tests/python/relay/test_pass_mac_count.py +++ b/tests/python/relay/test_pass_mac_count.py @@ -1,7 +1,6 @@ """Unit tests for MAC counter.""" import tvm from tvm import relay -import sys def test_gemm(): n = 512 From b1994019c1c4561c42e55b9652a85c172282b86f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Fri, 22 Feb 2019 
10:06:09 -0800 Subject: [PATCH 09/93] [Relay] fix anf for reference and pattern matching (#2637) --- src/relay/pass/to_anf.cc | 65 +++++++++++++++++++++++++++++++ tests/python/relay/test_to_anf.py | 43 +++++++++++++++++++- 2 files changed, 107 insertions(+), 1 deletion(-) diff --git a/src/relay/pass/to_anf.cc b/src/relay/pass/to_anf.cc index 6d65fe449fb0..912774162b51 100644 --- a/src/relay/pass/to_anf.cc +++ b/src/relay/pass/to_anf.cc @@ -120,6 +120,22 @@ class DependencyGraph::Creator : private ExprFunctor { Depend(n, t->tuple); } + void VisitExpr_(const RefCreateNode* r) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; + Depend(n, r->value); + } + + void VisitExpr_(const RefReadNode* r) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; + Depend(n, r->ref); + } + + void VisitExpr_(const RefWriteNode* r) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(r)]; + Depend(n, r->ref); + Depend(n, r->value); + } + void VisitExpr_(const IfNode* i) final { DependencyGraph::Node* n = graph_.expr_node[GetRef(i)]; DependencyGraph::Node* t = NewNode(true); @@ -150,6 +166,21 @@ class DependencyGraph::Creator : private ExprFunctor { graph_.post_dfs_order.push_back(b); } + void VisitExpr_(const MatchNode* m) final { + DependencyGraph::Node* n = graph_.expr_node[GetRef(m)]; + Depend(n, m->data); + std::vector v; + for (const Clause& c : m->clauses) { + DependencyGraph::Node* b = NewNode(true); + Depend(n, b); + Depend(b, c->rhs); + v.push_back(b); + } + for (auto it = v.rbegin(); it != v.rend(); ++it) { + graph_.post_dfs_order.push_back(*it); + } + } + void VisitExpr_(const VarNode* v) final { } void VisitExpr_(const GlobalVarNode* v) final { } @@ -157,6 +188,8 @@ class DependencyGraph::Creator : private ExprFunctor { void VisitExpr_(const ConstantNode* c) final { } void VisitExpr_(const OpNode* o) final { } + + void VisitExpr_(const ConstructorNode* c) final { } }; DependencyGraph DependencyGraph::Create(common::Arena* arena, const 
Expr& body) { @@ -305,6 +338,21 @@ class Fill : ExprFunctor { return Compound(e, TupleGetItemNode::make(VisitExpr(t->tuple), t->index), v); } + Expr VisitExpr_(const RefCreateNode* r, const Var& v) final { + Expr e = GetRef(r); + return Compound(e, RefCreateNode::make(VisitExpr(r->value)), v); + } + + Expr VisitExpr_(const RefReadNode* r, const Var& v) final { + Expr e = GetRef(r); + return Compound(e, RefReadNode::make(VisitExpr(r->ref)), v); + } + + Expr VisitExpr_(const RefWriteNode* r, const Var& v) final { + Expr e = GetRef(r); + return Compound(e, RefWriteNode::make(VisitExpr(r->ref), VisitExpr(r->value)), v); + } + Expr VisitExpr_(const IfNode* i, const Var& v) final { Expr e = GetRef(i); Expr ret = IfNode::make(VisitExpr(i->cond), @@ -356,6 +404,23 @@ class Fill : ExprFunctor { Expr VisitExpr_(const OpNode* op, const Var& v) final { return GetRef(op); } + + Expr VisitExpr_(const ConstructorNode* c, const Var& v) final { + return GetRef(c); + } + + Expr VisitExpr_(const MatchNode* m, const Var& v) final { + Expr e = GetRef(m); + Expr data = VisitExpr(m->data); + std::vector clauses; + for (const Clause& c : m->clauses) { + clauses.push_back(ClauseNode::make( + c->lhs, + GetSubScope(e, 1 + clauses.size())->ll->Get(VisitExpr(c->rhs)))); + } + Expr r = Compound(e, MatchNode::make(data, clauses), v); + return r; + } }; Expr ToANFAux(const Expr& e, const Module& m, std::set* gv) { diff --git a/tests/python/relay/test_to_anf.py b/tests/python/relay/test_to_anf.py index 5da7e38a81f5..e8c7995cfd8e 100644 --- a/tests/python/relay/test_to_anf.py +++ b/tests/python/relay/test_to_anf.py @@ -3,7 +3,8 @@ from tvm import relay from tvm.relay.ir_pass import to_anf, alpha_equal, infer_type from tvm.relay import op, create_executor -from tvm.relay.backend.interpreter import Value, TupleValue +from tvm.relay.backend.interpreter import Value, TupleValue, ConstructorValue +from tvm.relay.prelude import Prelude def check_eval(expr, expected_result, mod=None, rtol=1e-07): @@ -99,8 
+100,48 @@ def test_recursion(): check_eval(f(relay.const(5, 'int64')), 30.0, mod=mod) +def test_ref(): + i = relay.Var('i') + iv = relay.Var('iv') + u = relay.Var('u') + uv = relay.Var('uv') + body = relay.add(iv, uv) + body = relay.Let(uv, relay.RefRead(i), body) + body = relay.Let(u, relay.RefWrite(i, relay.const(2)), body) + body = relay.Let(iv, relay.RefRead(i), body) + body = relay.Let(i, relay.RefCreate(relay.const(1)), body) + check_eval(body, 3) + check_eval(to_anf(body), 3) + + +# this is an example of using the adt value in python side +def count(n): + assert isinstance(n, ConstructorValue) + if n.constructor.name_hint == 's': + return 1 + count(n.fields[0]) + else: + assert n.constructor.name_hint == 'z' + return 0 + + +def test_add(): + mod = relay.Module() + p = Prelude(mod) + nat = p.nat + add = p.add + s = p.s + z = p.z + ctx = tvm.context("llvm", 0) + intrp = create_executor(mod=mod, ctx=ctx, target="llvm") + assert mod[add].checked_type == relay.FuncType([nat(), nat()], nat()) + assert count(intrp.evaluate(add(s(z()), s(z())))) == 2 + assert count(intrp.evaluate(to_anf(add(s(z()), s(z())), mod))) == 2 + assert "let" in mod[add].astext() + if __name__ == '__main__': test_explicit_bound() test_order() test_if() test_recursion() + test_ref() + test_add() From f1adf2c046b615928045cde45e62a559a02c8557 Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Fri, 22 Feb 2019 10:06:28 -0800 Subject: [PATCH 10/93] fix lint (#2649) --- python/tvm/hybrid/calls.py | 2 +- python/tvm/hybrid/parser.py | 75 +++++++++++++++---- .../hybrid/{var_decl.py => preprocessor.py} | 0 tests/python/unittest/test_hybrid_script.py | 38 ++++++---- 4 files changed, 83 insertions(+), 32 deletions(-) rename python/tvm/hybrid/{var_decl.py => preprocessor.py} (100%) diff --git a/python/tvm/hybrid/calls.py b/python/tvm/hybrid/calls.py index 84ae537d49ab..cd1e4e3a2085 100644 --- a/python/tvm/hybrid/calls.py +++ b/python/tvm/hybrid/calls.py @@ -45,8 +45,8 @@ def bind(func_id, args): 
_internal_assert(args.__len__() == 2, "A loop bind should only have 2 arguments!") _internal_assert(isinstance(args[0], str), \ "A loop bind's first argument should be a string!") - iter_var = _api.thread_axis(args[0]) low, ext = _api.const(0, "int32"), args[1] + iter_var = _api.thread_axis((low, ext), args[0]) for_type = None return iter_var, low, ext, for_type diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py index 147d164b61e1..0959c9df2e91 100644 --- a/python/tvm/hybrid/parser.py +++ b/python/tvm/hybrid/parser.py @@ -12,7 +12,7 @@ from .util import _internal_assert from . import calls from . import util -from .var_decl import determine_variable_usage +from .preprocessor import determine_variable_usage from ..api import all as _all from ..api import any as _any from ..container import Array @@ -61,6 +61,7 @@ class Symbol(Enum): BufferVar = 7 LoopVar = 8 ConstLoopVar = 9 + ThreadBind = 10 class HybridParser(ast.NodeVisitor): @@ -117,7 +118,10 @@ def __init__(self, args, usage, symbols, func_name=None): self.symbols = {} # Symbol table for k, v in symbols.items(): if isinstance(v, types.FunctionType): - self.symbols[k] = Symbol.Callable, v + self.add_symbol(k, Symbol.Callable, v) + + self.binds = {} # Thread binds + self.device = 0 # Is it generating device self.func_name = func_name # The name of the function to be lowered self.outputs = [] # Output tensors' name @@ -126,6 +130,25 @@ def __init__(self, args, usage, symbols, func_name=None): self.returned = False # If this function has a valid return + def add_symbol(self, key, ty, val): #pylint: disable=invalid-name + """Add value to the symbol table context""" + if key in self.symbols.keys(): + old = str(self.symbols[key]) + new = str((ty, val)) + _internal_assert(False, + "Name conflict in symbol table! 
[%s] %s -> %s" % (key, old, new)) + + self.symbols[key] = ty, val + + if ty == Symbol.ThreadBind: + if val.var.name not in self.binds.keys(): + self.binds[val.var.name] = val + return + val_ = self.binds[val.var.name] + _internal_assert(_ir_pass.Equal(val_.dom.extent, val.dom.extent), + "Thread extents should be uniform!") + self.symbols[key] = ty, val_ + def wrap_up_realize(self, node, body): """Wrap up all the variables which will no longer be used""" @@ -141,11 +164,14 @@ def wrap_up_realize(self, node, body): continue elif 'Buffer' in ty.name: _buf = entry - _scope = ty.name[:-6].lower() if ty is not Symbol.BufferVar else 'global' + _scope = 'global' if ty is Symbol.BufferVar else ty.name[:-6].lower() to_pop.append(key) else: continue + if _scope == 'global': + body = self.wrap_up_binds(body) + _domain = [_make.range_by_min_extent(0, i) for i in _buf.shape] _dtype = _buf.dtype _true = _api.convert(True) @@ -158,6 +184,14 @@ def wrap_up_realize(self, node, body): return body + def wrap_up_binds(self, body): + for _, iter_var in self.binds.items(): + ext = iter_var.dom.extent + body = _make.AttrStmt(iter_var, 'thread_extent', ext, body) + self.binds = {} + return body + + #pylint: disable=invalid-name, missing-docstring def visit_Module(self, node): _internal_assert(len(node.body) == 1, \ @@ -173,10 +207,10 @@ def visit_FunctionDef(self, node): self.func_name = node.name for idx, arg in enumerate(node.args.args): _attr = 'id' if sys.version_info[0] < 3 else 'arg' # To make py2 and 3 compatible - self.symbols[getattr(arg, _attr)] = (Symbol.Input, self.args[idx]) + self.add_symbol(getattr(arg, _attr), Symbol.Input, self.args[idx]) res = visit_list_to_block(self.visit, node.body) res = self.wrap_up_realize(node, res) - return res + return self.wrap_up_binds(res) def visit_Expr(self, node): @@ -189,6 +223,8 @@ def visit_Name(self, node): _internal_assert(name in self.symbols, "Unknown symbol %s!" 
% name) if ty in [Symbol.LoopVar, Symbol.Input, Symbol.ConstLoopVar]: return entry + if ty is Symbol.ThreadBind: + return entry.var if ty is Symbol.ConstVar: return entry if isinstance(node.ctx, ast.Load) else None if ty is Symbol.BufferVar: @@ -237,7 +273,7 @@ def visit_Assign(self, node): for i in range(rhs.num_outputs): _internal_assert(isinstance(node.targets[i], ast.Name), "You should bind a pure name to the tensors") - self.symbols[node.targets[i].id] = Symbol.GlobalBuffer, rhs.output(i) + self.add_symbol(node.targets[i].id, Symbol.GlobalBuffer, rhs.output(i)) rmap[rhs.outputs[i].op] = rhs.output(i) return util.replace_io(rhs.body, rmap) @@ -260,15 +296,19 @@ def visit_Assign(self, node): if isinstance(rhs, tuple): shape, dtype, scope = rhs ph = _api.placeholder(shape, dtype=dtype, name=lhs) - self.symbols[lhs] = getattr(Symbol, scope.title() + "Buffer"), ph + self.add_symbol(lhs, getattr(Symbol, scope.title() + "Buffer"), ph) if scope == 'output': self.outputs.append(lhs) return util.make_nop() if isinstance(rhs, util.halide_imm_types) and ast.Store not in rw: - self.symbols[lhs] = Symbol.ConstVar, rhs + self.add_symbol(lhs, Symbol.ConstVar, rhs) else: + _internal_assert(self.device == 0, + "Single variable not supported in devices' side!\n" + \ + "If you are using GPU, please allocate a 'local' spad " + \ + "outside the bind body") ph = _api.placeholder((1, ), dtype=rhs.dtype, name=lhs) - self.symbols[lhs] = Symbol.BufferVar, ph + self.add_symbol(lhs, Symbol.BufferVar, ph) lhs = self.visit(lhs_) if lhs is not None: buf, args = lhs @@ -356,7 +396,7 @@ def visit_If(self, node): if node.orelse: else_body = visit_list_to_block(self.visit, node.orelse) else: - else_body = util.make_nop() + else_body = None return _make.IfThenElse(cond, if_body, else_body) @@ -445,28 +485,31 @@ def visit_For(self, node): bodies = [] for i in range(low, low + ext): - self.symbols[_name] = Symbol.ConstLoopVar, i + self.add_symbol(_name, Symbol.ConstLoopVar, i) body = 
visit_list_to_block(self.visit, node.body) body = self.wrap_up_realize(node, body) bodies.append(body) + self.symbols.pop(_name) return concat_list_to_block(bodies) if iter_var is None: - _internal_assert(for_type is not None, "The loop bind function parse error!") + _internal_assert(for_type is not None, "The loop iterating function parse error!") offset = iter_var = _api.var(_name) if not _ir_pass.Equal(low, _api.const(0, 'int32')): offset = iter_var + low - self.symbols[_name] = Symbol.LoopVar, offset + self.add_symbol(_name, Symbol.LoopVar, offset) _body = visit_list_to_block(self.visit, node.body) else: - _internal_assert(for_type is None, "The loop iterating function parse error!") - self.symbols[_name] = Symbol.LoopVar, iter_var.var + _internal_assert(for_type is None, "The loop bind function parse error!") + self.add_symbol(_name, Symbol.ThreadBind, iter_var) + self.device += 1 _body = visit_list_to_block(self.visit, node.body) + self.device -= 1 _body = self.wrap_up_realize(node, _body) if for_type is None: - res = _make.AttrStmt(iter_var, 'thread_extent', ext, _body) + res = _body else: _internal_assert(not isinstance(for_type, tuple), \ "Micro expansion should be handled before!") diff --git a/python/tvm/hybrid/var_decl.py b/python/tvm/hybrid/preprocessor.py similarity index 100% rename from python/tvm/hybrid/var_decl.py rename to python/tvm/hybrid/preprocessor.py diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index 405577b05b3b..d35c8ab3a0df 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -300,6 +300,7 @@ def test_bind(): if not tvm.gpu(0).exist: print('[Warning] No GPU found! 
Skip bind test!') return + @script def vec_add(a, b): c = output_tensor((1000, ), 'float32') @@ -326,23 +327,29 @@ def raw(a, b): func, ins, outs = run_and_check(raw, [a, b], sch=sch, outs=[c], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') - # Test loop binds + @tvm.hybrid.script - def goo(a, b): - c = output_tensor(a.shape, a.dtype) - len_b = len(b) - for i in const_range(len_b * 2): - if i < len_b: - c[i] = a[i] + b[i] - else: - c[i - len_b] = a[i - len_b] + b[i - len_b] + def foo(a): + c = output_tensor((a.shape[0],), a.dtype) + total = allocate((1,), a.dtype, 'local') + len_i = a.shape[0] + len_j = a.shape[1] + for i in bind('threadIdx.x', len_i): + total[0] = 0. + for k in const_range(len_j): + total[0] += a[i, k] + c[i] = total[0] + return c - a = tvm.placeholder((5, ), name='a', dtype='int32') - b = [1, 2, 3, 4, 5] - c = goo(a, tvm.convert(b)) - sch = tvm.create_schedule(c.op) - func, ins, outs = run_and_check(goo, [a, b], sch=sch, outs=[c]) - run_and_check(func, ins, outs=outs) + + a = tvm.placeholder((8, 4), 'float32') + c = foo(a) + s = tvm.create_schedule(c.op) + ir = tvm.lower(s, [a, c], simple_mode=True) + assert not isinstance(ir, tvm.stmt.AttrStmt) + func, ins, outs = run_and_check(foo, [a], target='cuda') + run_and_check(func, ins, outs=outs, target='cuda') + def test_math_intrin(): @script @@ -455,6 +462,7 @@ def share_vec_add(a, b): a = tvm.placeholder((256, ), dtype='float32', name='a') b = tvm.placeholder((256, ), dtype='float32', name='b') + c = share_vec_add(a, b) func, ins, outs = run_and_check(share_vec_add, [a, b], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') else: From e0ec87d80a8cc085af5c8d95bfee5899e38f9617 Mon Sep 17 00:00:00 2001 From: ziheng Date: Fri, 22 Feb 2019 10:18:56 -0800 Subject: [PATCH 11/93] [RELAY/OP] Gradient of relay level1 ops (#2633) --- python/tvm/relay/expr.py | 3 + python/tvm/relay/op/__init__.py | 1 + python/tvm/relay/op/_tensor.py | 18 ------ 
python/tvm/relay/op/_tensor_grad.py | 79 +++++++++++++++++++++++ python/tvm/relay/op/op.py | 2 +- tests/python/relay/test_op_grad_level1.py | 76 ++++++++++++++++++++++ tests/python/relay/test_op_level1.py | 16 ++--- 7 files changed, 168 insertions(+), 27 deletions(-) create mode 100644 python/tvm/relay/op/_tensor_grad.py create mode 100644 tests/python/relay/test_op_grad_level1.py diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 9257bad7dd58..bd28acc9e4b5 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -51,6 +51,9 @@ def astype(self, dtype): """ return _make.cast(self, dtype) + def __neg__(self): + return _op_make.negative(self) + def __add__(self, other): if isinstance(other, Expr): return _op_make.add(self, other) diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index 13f521dad660..84b0ceef8524 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -18,6 +18,7 @@ # operator registry from . import _tensor +from . import _tensor_grad from . import _transform from . 
import _reduce from ..expr import Expr diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index d9b5e2e89ce0..39e1f7afbfa2 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -3,25 +3,7 @@ from __future__ import absolute_import import topi from .op import register_compute, register_schedule, register_pattern -from .op import register_gradient from .op import schedule_injective, OpPattern -from .transform import collapse_sum_like -from .tensor import negative - - -def add_grad(orig, grad): - return [collapse_sum_like(grad, orig.args[0]), collapse_sum_like(grad, orig.args[1])] - - -register_gradient("add", add_grad) - - -def subtract_grad(orig, grad): - return [collapse_sum_like(grad, orig.args[0]), - collapse_sum_like(negative(grad), orig.args[1])] - - -register_gradient("subtract", subtract_grad) schedule_broadcast = schedule_injective schedule_elemwise = schedule_injective diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py new file mode 100644 index 000000000000..173e97a00496 --- /dev/null +++ b/python/tvm/relay/op/_tensor_grad.py @@ -0,0 +1,79 @@ +#pylint: disable=invalid-name, unused-argument +"""Backend compiler related feature registration""" +from __future__ import absolute_import +from ..expr import const +from .op import register_gradient +from .transform import collapse_sum_like, where +from .tensor import exp, negative, power, less +from .tensor import zeros_like, ones_like + + +@register_gradient("log") +def log_grad(orig, grad): + """Returns [grad * (1 / x)]""" + x = orig.args[0] + return [grad * ones_like(x) / x] + + +@register_gradient("exp") +def exp_grad(orig, grad): + """Returns [grad * exp(x)]""" + return [grad * exp(orig.args[0])] + + +@register_gradient("sqrt") +def sqrt_grad(orig, grad): + """Returns [grad * 0.5 * (x ^ -0.5)]""" + a = const(0.5) # (TODO) type? 
+ return [grad * a * power(orig.args[0], negative(a))] + + +@register_gradient("sigmoid") +def sigmoid_grad(orig, grad): + """Returns [grad * sigmoid(x) * (1 - sigmoid(x))].""" + return [grad * orig * (ones_like(orig) - orig)] + + +@register_gradient("tanh") +def tanh_grad(orig, grad): + """Returns grad * (1 - tanh(x) * tanh(x)).""" + return [grad * ones_like(orig) - orig * orig] + + +@register_gradient("nn.relu") +def relu_grad(orig, grad): + """Returns grad * (select(x < 0, 0, 1)).""" + x = orig.args[0] + zeros = zeros_like(x) + ones = ones_like(x) + return [where(less(x, zeros), zeros, ones * grad)] + + +@register_gradient("add") +def add_grad(orig, grad): + """Returns [grad, grad]""" + return [collapse_sum_like(grad, orig.args[0]), + collapse_sum_like(grad, orig.args[1])] + + +@register_gradient("subtract") +def subtract_grad(orig, grad): + """Returns [grad, -grad]""" + return [collapse_sum_like(grad, orig.args[0]), + collapse_sum_like(negative(grad), orig.args[1])] + + +@register_gradient("multiply") +def multiply_grad(orig, grad): + """Returns [grad * y, grad * x]""" + x, y = orig.args + return [collapse_sum_like(grad * y, x), + collapse_sum_like(grad * x, y)] + + +@register_gradient("divide") +def divide_grad(orig, grad): + """Returns [grad / y, - grad * (x / y) / y]""" + x, y = orig.args + return [collapse_sum_like(grad / y, x), + collapse_sum_like(- (grad * orig / y), y)] diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index e751a4e5565e..37f1fc1ee2b5 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -168,7 +168,7 @@ def register_pattern(op_name, pattern, level=10): """ return register(op_name, "TOpPattern", pattern, level) -def register_gradient(op_name, fgradient, level=10): +def register_gradient(op_name, fgradient=None, level=10): """Register operator pattern for an op. 
Parameters diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py new file mode 100644 index 000000000000..a9d91f757407 --- /dev/null +++ b/tests/python/relay/test_op_grad_level1.py @@ -0,0 +1,76 @@ +import tvm +import numpy as np +from tvm import relay +from tvm.relay.ir_pass import gradient, infer_type +from tvm.relay.testing import ctx_list + +def sigmoid(x): + one = np.ones_like(x) + return one / (one + np.exp(-x)) + +def relu(x): + x_copy = np.copy(x) + np.maximum(x_copy, 0, x_copy) + return x_copy + +def test_unary_op(): + def check_single_op(opfunc, ref): + shape = (10, 4) + dtype = 'float32' + tp = relay.TensorType(shape, dtype) + x = relay.var("x", tp) + y = opfunc(x) + + if ref is not None: + data = np.random.rand(*shape).astype(dtype) + ref_grad = ref(data) + fwd_func = relay.Function([x], y) + bwd_func = infer_type(gradient(fwd_func)) + + for target, ctx in ctx_list(): + intrp = relay.create_executor(ctx=ctx, target=target) + op_res, (op_grad, ) = intrp.evaluate(bwd_func)(data) + np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) + + for opfunc, ref in [(tvm.relay.log, lambda x: 1 / x), + (tvm.relay.exp, np.exp), + (tvm.relay.sigmoid, lambda x: sigmoid(x) * (1 - sigmoid(x))), + (tvm.relay.tanh, lambda x: 1 - np.tanh(x) * np.tanh(x)), + (tvm.relay.sqrt, lambda x: 0.5 * np.power(x, -0.5)), + (relay.nn.relu, lambda x: np.where(x < 0, np.zeros_like(x), np.ones_like(x)))]: + check_single_op(opfunc, ref) + + +def test_binary_op(): + def inst(vars, sh): + return [vars.get(s, s) for s in sh] + + def check_binary_op(opfunc, ref): + s = (5, 10, 5) + t = relay.TensorType((5, 10, 5)) + x = relay.var("x", t) + y = relay.var("y", t) + z = opfunc(x, y) + + x_data = np.random.rand(*s).astype(t.dtype) + y_data = np.random.rand(*s).astype(t.dtype) + ref_grad0, ref_grad1 = ref(x_data, y_data) + fwd_func = relay.Function([x, y], z) + bwd_func = infer_type(gradient(fwd_func)) + + for target, ctx in ctx_list(): + 
intrp = relay.create_executor(ctx=ctx, target=target) + op_res, (op_grad0, op_grad1) = intrp.evaluate(bwd_func)(x_data, y_data) + np.testing.assert_allclose(op_grad0.asnumpy(), ref_grad0, rtol=0.01) + np.testing.assert_allclose(op_grad1.asnumpy(), ref_grad1, rtol=0.01) + + for opfunc, ref in [(relay.add, lambda x, y: [np.ones_like(x), np.ones_like(y)]), + (relay.subtract, lambda x, y: [np.ones_like(x), -np.ones_like(y)]), + (relay.multiply, lambda x, y: [y, x]), + (relay.divide, lambda x, y: [1 / y, - x / (y**2)])]: + check_binary_op(opfunc, ref) + + +if __name__ == "__main__": + test_unary_op() + test_binary_op() diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 6a1662b65170..d29b808be0d1 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -39,11 +39,11 @@ def check_single_op(opfunc, ref): for opfunc, ref in [(tvm.relay.log, np.log), - (tvm.relay.exp, np.exp), - (tvm.relay.sqrt, np.sqrt), - (tvm.relay.sigmoid, sigmoid), - (tvm.relay.tanh, np.tanh), - (relay.nn.relu, relu)]: + (tvm.relay.exp, np.exp), + (tvm.relay.sqrt, np.sqrt), + (tvm.relay.sigmoid, sigmoid), + (tvm.relay.tanh, np.tanh), + (relay.nn.relu, relu)]: check_single_op(opfunc, ref) @@ -84,9 +84,9 @@ def check_binary_op(opfunc, ref): np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01) for opfunc, ref in [(relay.add, np.add), - (relay.subtract, np.subtract), - (relay.multiply, np.multiply), - (relay.divide, np.divide)]: + (relay.subtract, np.subtract), + (relay.multiply, np.multiply), + (relay.divide, np.divide)]: check_binary_op(opfunc, ref) From 21f4f2d49d94be92ad1d7612d42858183725cb50 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 22 Feb 2019 14:30:59 -0800 Subject: [PATCH 12/93] Update community.rst --- docs/contribute/community.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contribute/community.rst b/docs/contribute/community.rst index 971d2d9a1cfb..52c9149d453f 
100644 --- a/docs/contribute/community.rst +++ b/docs/contribute/community.rst @@ -25,7 +25,7 @@ Committers are individuals who are granted the write access to the project. A co - Quality of contributions: High-quality, readable code contributions indicated by pull requests that can be merged without a substantial code review. History of creating clean, maintainable code and including good test cases. Informative code reviews to help other contributors that adhere to a good standard. - Community involvement: active participation in the discussion forum, promote the projects via tutorials, talks and outreach. We encourage committers to collaborate broadly, e.g. do code reviews and discuss designs with community members that they do not interact physically. -The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. Potential candidates are usually proposed via an internal discussion among PMCs, followed by a consensus approval, i.e. least 3 +1 votes, and no vetoes. Any veto must be accompanied by reasoning. PMCs should serve the community by upholding the community practices and guidelines TVM a better community for everyone. PMCs should strive to identify new candidates outside of their own organization. +The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. Potential candidates are usually proposed via an internal discussion among PMCs, followed by a consensus approval, i.e. least 3 +1 votes, and no vetoes. Any veto must be accompanied by reasoning. PMCs should serve the community by upholding the community practices and guidelines TVM a better community for everyone. PMCs should strive to only nominate new candidates outside of their own organization. 
Reviewers From 239facee5cc84664876a1a97a0fb6b3b0cdc56a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Fri, 22 Feb 2019 14:48:12 -0800 Subject: [PATCH 13/93] [Relay] GNF (#2492) --- include/tvm/relay/pass.h | 13 +++- python/tvm/relay/ir_pass.py | 19 +++++- .../pass/{to_anf.cc => to_a_normal_form.cc} | 32 ++++----- src/relay/pass/to_graph_normal_form.cc | 66 +++++++++++++++++++ ...est_to_anf.py => test_to_a_normal_form.py} | 14 ++-- .../python/relay/test_to_graph_normal_form.py | 51 ++++++++++++++ 6 files changed, 169 insertions(+), 26 deletions(-) rename src/relay/pass/{to_anf.cc => to_a_normal_form.cc} (93%) create mode 100644 src/relay/pass/to_graph_normal_form.cc rename tests/python/relay/{test_to_anf.py => test_to_a_normal_form.py} (92%) create mode 100644 tests/python/relay/test_to_graph_normal_form.py diff --git a/include/tvm/relay/pass.h b/include/tvm/relay/pass.h index b87f9319a3d3..75bfe92ec21c 100644 --- a/include/tvm/relay/pass.h +++ b/include/tvm/relay/pass.h @@ -320,7 +320,18 @@ struct StructuralHash { * * \return expression in A-Normal Form */ -Expr ToANF(const Expr& e, const Module& mod); +Expr ToANormalForm(const Expr& e, const Module& mod); + +/*! \brief Remove let binding and directly share via pointer instead. + * + * It will remove all let binding, + * and turn all of the variable bound by let into direct pointer reference. + * + * \param e the expression. + * + * \return the expression in graph normal form. 
+ */ +Expr ToGraphNormalForm(const Expr& e); } // namespace relay } // namespace tvm diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py index 561c5d388788..02a6e8b5906e 100644 --- a/python/tvm/relay/ir_pass.py +++ b/python/tvm/relay/ir_pass.py @@ -490,7 +490,7 @@ def collect_device_annotation_ops(expr): return _ir_pass.CollectDeviceAnnotationOps(expr) -def to_anf(expr, mod=None): +def to_a_normal_form(expr, mod=None): """ Turn Graph Normal Form expression into A Normal Form Expression. @@ -513,7 +513,21 @@ def to_anf(expr, mod=None): expr: tvm.relay.Expr The output expression. """ - return _ir_pass.to_anf(expr, mod) + return _ir_pass.to_a_normal_form(expr, mod) + + +def to_graph_normal_form(expr): + """Turn A Normal Form expression into Graph Normal Form expression + Parameters + ---------- + expr : tvm.relay.Expr + The input expression + Returns + ------- + expr : tvm.relay.Expr + The output expression + """ + return _ir_pass.to_graph_normal_form(expr) def gradient(expr, mod=None): @@ -534,6 +548,7 @@ def gradient(expr, mod=None): """ return _ir_pass.first_order_gradient(expr, mod) + def get_total_mac_number(expr): """ Count the number of MACs (multiply-accumulate) of a model diff --git a/src/relay/pass/to_anf.cc b/src/relay/pass/to_a_normal_form.cc similarity index 93% rename from src/relay/pass/to_anf.cc rename to src/relay/pass/to_a_normal_form.cc index 912774162b51..53e2c1c594f8 100644 --- a/src/relay/pass/to_anf.cc +++ b/src/relay/pass/to_a_normal_form.cc @@ -196,7 +196,7 @@ DependencyGraph DependencyGraph::Create(common::Arena* arena, const Expr& body) return Creator(arena).Create(body); } -Expr ToANF(const Expr& e, const Module& m, std::set* gv); +Expr ToANormalForm(const Expr& e, const Module& m, std::set* gv); struct ScopeNode; using Scope = std::shared_ptr; @@ -258,11 +258,11 @@ bool IsPrimitiveFunction(const Expr& e) { class Fill : ExprFunctor { public: - static Expr ToANF(const Expr& e, - const Module& m, - const DependencyGraph& 
dg, - std::unordered_map* node_scope, - std::set* gv) { + static Expr ToANormalForm(const Expr& e, + const Module& m, + const DependencyGraph& dg, + std::unordered_map* node_scope, + std::set* gv) { Fill fi(m, dg, node_scope, gv); return fi.GetScope(e)->ll->Get(fi.VisitExpr(e)); } @@ -396,7 +396,7 @@ class Fill : ExprFunctor { GlobalVar gv = GetRef(gvn); if (visited_->count(gv) == 0) { visited_->insert(gv); - mod_->Update(gv, Downcast(relay::ToANF(mod_->Lookup(gv), mod_, visited_))); + mod_->Update(gv, Downcast(relay::ToANormalForm(mod_->Lookup(gv), mod_, visited_))); } return gv; } @@ -423,7 +423,7 @@ class Fill : ExprFunctor { } }; -Expr ToANFAux(const Expr& e, const Module& m, std::set* gv) { +Expr ToANormalFormAux(const Expr& e, const Module& m, std::set* gv) { /* When you lift a lambda, what is inside is also being lift. * * So we must determine the scope of the lambda before determining the scope of it's body. @@ -446,29 +446,29 @@ Expr ToANFAux(const Expr& e, const Module& m, std::set* gv) { * We do an additional pass to fill all the LetList and we are done. 
*/ std::unordered_map node_scope = CalcScope(dg); - return Fill::ToANF(e, m, dg, &node_scope, gv); + return Fill::ToANormalForm(e, m, dg, &node_scope, gv); } -Expr ToANF(const Expr& e, const Module& m, std::set* gv) { +Expr ToANormalForm(const Expr& e, const Module& m, std::set* gv) { if (const auto* f = e.as()) { return FunctionNode::make(f->params, - ToANFAux(f->body, m, gv), + ToANormalFormAux(f->body, m, gv), f->ret_type, f->type_params, f->attrs); } else { - return ToANFAux(e, m, gv); + return ToANormalFormAux(e, m, gv); } } -Expr ToANF(const Expr& e, const Module& m) { +Expr ToANormalForm(const Expr& e, const Module& m) { std::set gv; - return ToANF(e, m, &gv); + return ToANormalForm(e, m, &gv); } -TVM_REGISTER_API("relay._ir_pass.to_anf") +TVM_REGISTER_API("relay._ir_pass.to_a_normal_form") .set_body([](TVMArgs args, TVMRetValue* ret) { - *ret = ToANF(args[0], args[1]); + *ret = ToANormalForm(args[0], args[1]); }); } // namespace relay diff --git a/src/relay/pass/to_graph_normal_form.cc b/src/relay/pass/to_graph_normal_form.cc new file mode 100644 index 000000000000..bc1630263e3f --- /dev/null +++ b/src/relay/pass/to_graph_normal_form.cc @@ -0,0 +1,66 @@ +/*! + * Copyright (c) 2018 by Contributors + * + * \file to_gnf.cc + * + * \brief Turn A normal form into graph normal form. + */ +#include +#include +#include "let_list.h" + +namespace tvm { +namespace relay { + +class UseVarVisitor : public ExprVisitor { + public: + explicit UseVarVisitor(const Var& v) : v(v) { } + + static bool UseVar(const Var& v, const Expr& e) { + UseVarVisitor uv(v); + uv(e); + return uv.use_var; + } + + private: + bool use_var = false; + Var v; + + void VisitExpr_(const VarNode* vn) override { + use_var = use_var || (v == GetRef(vn)); + } +}; + +class GNF : public ExprMutator { + private: + std::unordered_map var_map_; + Expr VisitExpr_(const VarNode* vn) override { + Var v = GetRef(vn); + return var_map_.count(v) == 0 ? 
v : var_map_.at(v); + } + + static bool UseVar(const Var& v, const Expr& e) { + return UseVarVisitor::UseVar(v, e); + } + + static Expr WrapRec(const Var& var, const Expr& val) { + return UseVar(var, val) ? LetNode::make(var, val, var) : val; + } + + Expr VisitExpr_(const LetNode* ln) override { + var_map_.insert(std::pair(ln->var, VisitExpr(WrapRec(ln->var, ln->value)))); + return VisitExpr(ln->body); + } +}; + +Expr ToGraphNormalForm(const Expr& e) { + return GNF()(e); +} + +TVM_REGISTER_API("relay._ir_pass.to_graph_normal_form") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = ToGraphNormalForm(args[0]); +}); + +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_to_anf.py b/tests/python/relay/test_to_a_normal_form.py similarity index 92% rename from tests/python/relay/test_to_anf.py rename to tests/python/relay/test_to_a_normal_form.py index e8c7995cfd8e..c15dc8ffc269 100644 --- a/tests/python/relay/test_to_anf.py +++ b/tests/python/relay/test_to_a_normal_form.py @@ -1,7 +1,7 @@ import numpy as np import tvm from tvm import relay -from tvm.relay.ir_pass import to_anf, alpha_equal, infer_type +from tvm.relay.ir_pass import to_a_normal_form, alpha_equal, infer_type from tvm.relay import op, create_executor from tvm.relay.backend.interpreter import Value, TupleValue, ConstructorValue from tvm.relay.prelude import Prelude @@ -21,7 +21,7 @@ def test_explicit_bound(): z = op.add(y, y) f = relay.Function([], op.add(z, z)) assert not "let" in f.astext() # assert the values are implicitly bounded - anf = to_anf(f) + anf = to_a_normal_form(f) assert "let" in anf.astext() # assert the values are explicitly bounded check_eval(f(), 8.0) check_eval(anf(), 8.0) @@ -35,7 +35,7 @@ def test_order(): x = relay.const(1) val = x + y * z check_eval(val, 7.0) - anf = infer_type(to_anf(val)) + anf = infer_type(to_a_normal_form(val)) a = relay.Var('a', relay.IncompleteType()) b = relay.Var('b', relay.IncompleteType()) c = relay.Var('c', 
relay.IncompleteType()) @@ -54,7 +54,7 @@ def test_order(): def test_if(): cond = relay.const(True) x = relay.If(cond, relay.const(2), relay.const(3)) - anf = infer_type(to_anf(x)) + anf = infer_type(to_a_normal_form(x)) a = relay.Var('a', relay.IncompleteType()) b = relay.Var('b', relay.IncompleteType()) c = relay.Var('c', relay.IncompleteType()) @@ -96,7 +96,7 @@ def test_recursion(): mod[f] = value check_eval(f(relay.const(5, 'int64')), 30.0, mod=mod) old_f = mod[f] - f = to_anf(f, mod=mod) + f = to_a_normal_form(f, mod=mod) check_eval(f(relay.const(5, 'int64')), 30.0, mod=mod) @@ -111,7 +111,7 @@ def test_ref(): body = relay.Let(iv, relay.RefRead(i), body) body = relay.Let(i, relay.RefCreate(relay.const(1)), body) check_eval(body, 3) - check_eval(to_anf(body), 3) + check_eval(to_a_normal_form(body), 3) # this is an example of using the adt value in python side @@ -135,7 +135,7 @@ def test_add(): intrp = create_executor(mod=mod, ctx=ctx, target="llvm") assert mod[add].checked_type == relay.FuncType([nat(), nat()], nat()) assert count(intrp.evaluate(add(s(z()), s(z())))) == 2 - assert count(intrp.evaluate(to_anf(add(s(z()), s(z())), mod))) == 2 + assert count(intrp.evaluate(to_a_normal_form(add(s(z()), s(z())), mod))) == 2 assert "let" in mod[add].astext() if __name__ == '__main__': diff --git a/tests/python/relay/test_to_graph_normal_form.py b/tests/python/relay/test_to_graph_normal_form.py new file mode 100644 index 000000000000..ac86799b6b8c --- /dev/null +++ b/tests/python/relay/test_to_graph_normal_form.py @@ -0,0 +1,51 @@ +import numpy as np +import tvm +from tvm import relay +from tvm.relay.ir_pass import to_graph_normal_form, to_a_normal_form, alpha_equal +from tvm.relay import op, create_executor +from tvm.relay.backend.interpreter import Value, TupleValue + + +def check_eval(expr, args, expected_result, mod=None, rtol=1e-07): + if mod is None: + mod = relay.Module() + + ctx = tvm.context("llvm", 0) + intrp = create_executor(mod=mod, ctx=ctx, 
target="llvm") + + result = intrp.evaluate(expr)(*args) + np.testing.assert_allclose(result.asnumpy(), expected_result, rtol=rtol) + + +def test_implicit_share(): + x = relay.Var('x') + y = relay.Var('y') + z = relay.Var('z') + body = relay.Let(z, op.add(y, y), op.add(z, z)) + body = relay.Let(y, op.add(x, x), body) + f = relay.Function([], relay.Let(x, relay.const(1), body)) + g = to_graph_normal_form(f) + assert "let" in f.astext() + assert not "let" in g.astext() + check_eval(f, [], 8.0) + check_eval(g, [], 8.0) + + +def test_round_trip(): + x = relay.Var('x') + y = relay.Var('y') + z = relay.Var('z') + body = relay.Let(z, op.add(y, y), op.add(z, z)) + body = relay.Let(y, op.add(x, x), body) + f = relay.Function([], relay.Let(x, relay.const(1), body)) + g = to_graph_normal_form(f) + h = to_a_normal_form(g) + assert "let" in f.astext() + assert not "let" in g.astext() + check_eval(f, [], 8.0) + check_eval(g, [], 8.0) + check_eval(h, [], 8.0) + +if __name__ == '__main__': + test_implicit_share() + test_round_trip() From 05161275deb96d0df3df5a5229ea920d62607b32 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Fri, 22 Feb 2019 18:59:57 -0800 Subject: [PATCH 14/93] add committer (#2661) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 4bc53f86fc08..772f4ab18646 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -19,6 +19,7 @@ We do encourage everyone to work anything they are interested in. 
- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay - [Masahiro Masuda](https://github.com/masahi): @masahi - topi, relay - [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta +- [Jared Roesch](https://github.com/jroesch): @jroesch - relay - [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang - [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web From e457cd726f8143d3ce1ba968269b46c8e08263e8 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Fri, 22 Feb 2019 22:49:15 -0800 Subject: [PATCH 15/93] [Relay/TOPI][OP] Add arange op in Relay and TOPI (#2621) * Add arange op * Update docs * Fix bug * add sanity check in relay and mxnet frontend mapping * lint * nits * pylint * don't allow empty output from arange * Remove empty test for arange * Fix bug and update doc --- docs/api/python/topi.rst | 2 + docs/langref/relay_op.rst | 2 + include/tvm/relay/attrs/transform.h | 19 +++++++ python/tvm/relay/frontend/mxnet.py | 13 +++++ python/tvm/relay/op/_transform.py | 1 + python/tvm/relay/op/transform.py | 52 ++++++++++++++++-- src/relay/op/tensor/transform.cc | 57 ++++++++++++++++++++ tests/python/frontend/mxnet/test_forward.py | 60 ++++++++++++++------- tests/python/relay/test_op_level3.py | 35 ++++++++++++ topi/include/topi/transform.h | 13 +++++ topi/python/topi/transform.py | 29 ++++++++++ topi/src/topi.cc | 5 ++ topi/tests/python/test_topi_transform.py | 43 +++++++++++++++ 13 files changed, 309 insertions(+), 22 deletions(-) diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index 856bad198e88..ec5d600dab2b 100644 --- a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -67,6 +67,7 @@ List of operators topi.not_equal topi.greater_equal topi.less_equal + topi.arange topi.image.resize @@ -123,6 +124,7 @@ topi .. autofunction:: topi.power .. autofunction:: topi.greater .. autofunction:: topi.less +.. 
autofunction:: topi.arange topi.nn ~~~~~~~ diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index e1f38c61eb1f..d58ba2e66621 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -95,6 +95,7 @@ This level enables additional math and transform operators. tvm.relay.full_like tvm.relay.cast tvm.relay.split + tvm.relay.arange **Level 4: Broadcast and Reductions** @@ -216,6 +217,7 @@ Level 3 Definitions .. autofunction:: tvm.relay.full_like .. autofunction:: tvm.relay.cast .. autofunction:: tvm.relay.split +.. autofunction:: tvm.relay.arange Level 4 Definitions diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index fa27a4d437d2..d76bfceb59e8 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -96,6 +96,25 @@ struct InitOpAttrs : public tvm::AttrsNode { } }; // struct InitOpAttrs +/*! \brief Attributes used in arange operators */ +struct ArangeAttrs : public tvm::AttrsNode { + tvm::Expr start; + tvm::Expr stop; + tvm::Expr step; + DataType dtype; + + TVM_DECLARE_ATTRS(ArangeAttrs, "relay.attrs.ArangeAttrs") { + TVM_ATTR_FIELD(start).set_default(make_const(Float(32), 0)) + .describe("Start of interval. The interval includes this value."); + TVM_ATTR_FIELD(stop) + .describe("Stop of interval. The interval does not include this value."); + TVM_ATTR_FIELD(step).set_default(make_const(Float(32), 1)) + .describe("Spacing between values."); + TVM_ATTR_FIELD(dtype).set_default(NullValue()) + .describe("Target data type."); + } +}; // struct ArangeAttrs + /*! \brief Attributes used in squeeze operators */ struct SqueezeAttrs : public tvm::AttrsNode { // use axis to make the name numpy compatible. 
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 3a0885a3fcdf..c48a116a9d0e 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -268,6 +268,18 @@ def _mx_multibox_detection(inputs, attrs): return _op.vision.nms(ret[0], ret[1], **new_attrs1) +def _mx_arange(inputs, attrs): + assert len(inputs) == 0 + if attrs.get_int("repeat", 1) != 1: + raise RuntimeError("arange doesn't support repeat") + new_attrs = {} + new_attrs["start"] = attrs.get_float("start", 0) + new_attrs["stop"] = attrs.get_float("stop") + new_attrs["step"] = attrs.get_float("step", 1) + new_attrs["dtype"] = attrs.get_str("dtype", "float32") + return _op.arange(**new_attrs) + + def _mx_roi_align(inputs, attrs): new_attrs = {} new_attrs["pooled_size"] = attrs.get_int_tuple("pooled_size") @@ -362,6 +374,7 @@ def _mx_roi_align(inputs, attrs): "Concat" : _mx_concat, "concat" : _mx_concat, "LeakyReLU" : _mx_leaky_relu, + "_arange" : _mx_arange, "SoftmaxOutput" : _mx_softmax_output, "SoftmaxActivation" : _mx_softmax_activation, # vision diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index abf0b5317b48..b8c00b90d40e 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -19,6 +19,7 @@ _reg.register_schedule("reshape_like", schedule_injective) _reg.register_schedule("full", schedule_injective) _reg.register_schedule("full_like", schedule_injective) +_reg.register_schedule("arange", schedule_injective) _reg.register_schedule("cast", schedule_injective) _reg.register_schedule("strided_slice", schedule_injective) _reg.register_schedule("slice_like", schedule_injective) diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 78efd3cfd4d9..cf1ae0573716 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -166,8 +166,9 @@ def reshape_like(data, shape_like): """Reshapes the input array by the size of 
another array. For an input array with shape ``(d1, d2, ..., dk)``, `reshape_like` operation reshapes the input array into an output array with the same shape as the second input array. + .. note:: - Sizes for both array should be compatible. + Sizes for both array should be compatible. Parameters ---------- @@ -249,10 +250,57 @@ def full_like(data, fill_value): return _make.full_like(data, fill_value) +def arange(start, stop=None, step=1, dtype="float32"): + """Return evenly spaced values within a given interval. + + .. note:: + Similar to ``numpy.arange``, when only one argument is given, it is used + as `stop` instead of `start` while `start` takes default value 0. + + Warning: Undefined behavior when dtype is incompatible with start/stop/step. + It could lead to different results compared to numpy, MXNet, pytorch, etc. + + Parameters + ---------- + start : tvm.Expr, optional + Start of interval. The interval includes this value. The default start + value is 0. + + stop : tvm.Expr + Stop of interval. The interval does not include this value. + + step : tvm.Expr, optional + Spacing between values. The default step size is 1. + + dtype : str, optional + The target data type. + + Returns + ------- + result : relay.Expr + The resulting tensor. + + Examples + -------- + .. code-block:: python + + relay.arange(5) = [0, 1, 2, 3, 4] + relay.arange(1, 5) = [1, 2, 3, 4] + relay.arange(1, 5, 1.5) = [1, 2.5, 4] + """ + if stop is None: + stop = start + start = 0 + return _make.arange(start, stop, step, dtype) + + def where(condition, x, y): """Selecting elements from either x or y depending on the value of the condition. + .. note:: + The shape of condition, x, and y needs to be the same. + Parameters ---------- condition : relay.Expr @@ -282,8 +330,6 @@ def where(condition, x, y): condition = [1, 0] relay.where(conditon, x, y) = [[1, 2], [7, 8]] - - Note that the shape of condition, x, and y needs to be the same. 
""" return _make.where(condition, x, y) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index b38999d1b1b7..48c97b91dfda 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -880,6 +880,63 @@ and type as the input array. .set_attr("FTVMCompute", FullLikeCompute) .set_attr("TOpPattern", kElemWise); +// arange operator +TVM_REGISTER_NODE_TYPE(ArangeAttrs); + +bool ArangeRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 1); + const ArangeAttrs* param = attrs.as(); + IndexExpr num_elem = tvm::cast(tvm::Int(32), tvm::ceil( + tvm::cast(tvm::Float(32), param->stop - param->start) / param->step)); + if (const tvm::ir::IntImm* val = num_elem.as()) { + CHECK_GT(val->value, 0) + << "Invalid arange attributes (start, stop, step): " << param->start + << ", " << param->stop << ", " << param->step; + } + reporter->Assign(types[0], TensorTypeNode::make({num_elem}, param->dtype)); + return true; +} + +Array ArangeCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const ArangeAttrs* param = attrs.as(); + return { topi::arange(param->start, param->stop, param->step, param->dtype) }; +} + +Expr MakeArange(tvm::Expr start, + tvm::Expr stop, + tvm::Expr step, + DataType dtype) { + auto attrs = make_node(); + attrs->start = std::move(start); + attrs->stop = std::move(stop); + attrs->step = std::move(step); + attrs->dtype = std::move(dtype); + static const Op& op = Op::Get("arange"); + return CallNode::make(op, {}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op._make.arange") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeArange, args, rv); +}); + +RELAY_REGISTER_OP("arange") +.describe(R"code(Returns evenly spaced values within a given interval. 
+ +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.ArangeAttrs") +.set_num_inputs(0) +.set_support_level(3) +.add_type_rel("Arange", ArangeRel) +.set_attr("FTVMCompute", ArangeCompute) +.set_attr("TOpPattern", kInjective); + // where operator bool WhereRel(const Array& types, int num_inputs, diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index e1f7e5509230..ca1bdbbbefc9 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -203,30 +203,51 @@ def test_forward_where(): mx_cond = mx.nd.array(np_cond) mx_x = mx.nd.array(np_x) mx_y = mx.nd.array(np_y) + shapes = {'cond': dshape, 'x': dshape, 'y': dshape} mod = mx.mod.Module(mx_sym, label_names=None, data_names=['cond', 'x', 'y']) - mod.bind(data_shapes=[('cond', dshape), ('x', dshape), ('y', dshape)], for_training=False) + mod.bind(data_shapes=shapes.items(), for_training=False) mod.init_params() args, auxs = mod.get_params() mx_out = mx.nd.where(mx_cond, mx_x, mx_y).asnumpy() - out_shape = dshape - shape_dict = {'cond': dshape, 'x': dshape, 'y': dshape} - new_sym, params = relay.frontend.from_mxnet(mx_sym, - shape_dict, - arg_params=args, - aux_params=auxs) + + new_sym, _ = relay.frontend.from_mxnet(mx_sym, shapes, args, auxs) for target, ctx in ctx_list(): - with relay.build_config(opt_level=3): - graph, lib, params = relay.build(new_sym, target, params=params) - m = graph_runtime.create(graph, lib, ctx) - # set inputs - m.set_input("cond", tvm.nd.array(np_cond)) - m.set_input("x", tvm.nd.array(np_x)) - m.set_input("y", tvm.nd.array(np_y)) - m.set_input(**params) - m.run() - # get outputs - tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() - tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(np_cond, np_x, np_y) + 
tvm.testing.assert_allclose(op_res.asnumpy(), mx_out) + + +def test_forward_arange(): + def _mx_symbol(F, start, stop, step): + if start is None and step is None: + sym = F.arange(stop) + elif start is None: + sym = F.arange(stop, step=step) + elif step is None: + sym = F.arange(start, stop) + else: + sym = F.arange(start, stop, step) + return sym + + def verify(start, stop, step): + ref_res = _mx_symbol(mx.nd, start, stop, step).asnumpy() + mx_sym = _mx_symbol(mx.sym, start, stop, step) + new_sym, _ = relay.frontend.from_mxnet(mx_sym, {}) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)() + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res) + verify(0, 20, None) + verify(0, 20, 2) + verify(1, 20, None) + verify(1, 20, 2) + verify(1, 20, 1.5) + verify(1, 20.5, None) + verify(1, 20, 3) + verify(20, 1, -1) + verify(20, 1, -1.5) if __name__ == '__main__': @@ -251,3 +272,4 @@ def test_forward_where(): test_forward_argmax() test_forward_argmin() test_forward_where() + test_forward_arange() diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index 550637023d43..e762c7d3a1a0 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -457,6 +457,40 @@ def test_infer_type_prelu(): verify_infer_type_prelu((1, 3, 2, 2), None, 1, (1, 3, 2, 2)) verify_infer_type_prelu((1, 2, 2, 3), None, 3, (1, 2, 2, 3)) + +def test_arange(): + def verify_arange(start, stop, step): + dtype = "float32" + if start is None and step is None: + x = relay.arange(stop) + ref_res = np.arange(stop) + elif start is None: + x = relay.arange(stop, step=step) + ref_res = np.arange(stop, step=step) + elif step is None: + x = relay.arange(start, stop) + ref_res = np.arange(start, stop) + else: + x = relay.arange(start, stop, step) + ref_res = np.arange(start, stop, step) + + func = relay.Function([], x) + for target, 
ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)() + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) + verify_arange(None, 20, None) + verify_arange(None, 20, 2) + verify_arange(1, 20, None) + verify_arange(1, 20, 2) + verify_arange(1, 20, 1.5) + verify_arange(1, 20.5, None) + verify_arange(1, 20, 3) + verify_arange(20, 1, -1) + verify_arange(20, 1, -1.5) + + if __name__ == "__main__": test_cast() test_zeros_ones() @@ -480,3 +514,4 @@ def test_infer_type_prelu(): test_squeeze_infer_type() test_squeeze_bad_axes_infer_type() test_split_infer_type() + test_arange() diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 04759a582a67..e399b8c6978c 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -868,6 +868,19 @@ inline Tensor tensordot(const Tensor& A, return compute(output_shape, func, name, tag); } +inline Tensor arange(const Expr start, + const Expr stop, + const Expr step, + Type dtype, + std::string name = "tensor", + std::string tag = kInjective) { + Expr num_elem = tvm::cast(tvm::Int(32), tvm::ceil( + tvm::cast(tvm::Float(32), stop - start) / step)); + Array shape; + return compute({num_elem}, [&](const Array& indices) { + return tvm::cast(dtype, start + step * indices[0]); + }, name, tag); +} } // namespace topi #endif // TOPI_TRANSFORM_H_ diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py index b9a7bd4f2992..2fb20162a5a7 100644 --- a/topi/python/topi/transform.py +++ b/topi/python/topi/transform.py @@ -289,3 +289,32 @@ def tensordot(a, b, axes): if isinstance(axes[0], int): return cpp.tensordot(a, b, (axes[0],), (axes[1],)) return cpp.tensordot(a, b, axes[0], axes[1]) + + +def arange(start, stop=None, step=1, dtype="float32"): + """Creates a tensor with evenly spaced values within a given interval. 
+ + Parameters + ---------- + start : tvm.Expr, optional + Start of interval. The interval includes this value. The default start + value is 0. + + stop : tvm.Expr + Stop of interval. The interval does not include this value. + + step : tvm.Expr, optional + Spacing between values. The default step size is 1. + + dtype : str, optional + The target data type. + + Returns + ------- + result : tvm.Tensor + The resulting tensor. + """ + if stop is None: + stop = start + start = 0 + return cpp.arange(start, stop, step, dtype) diff --git a/topi/src/topi.cc b/topi/src/topi.cc index d56174fda5c5..e0f16239d561 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -290,6 +290,11 @@ TVM_REGISTER_GLOBAL("topi.where") *rv = where(args[0], args[1], args[2]); }); +TVM_REGISTER_GLOBAL("topi.arange") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = arange(args[0], args[1], args[2], args[3]); +}); + TVM_REGISTER_GLOBAL("topi.gather_nd") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = gather_nd(args[0], args[1]); diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index 84d4aa6dc952..dad527e3951f 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -304,6 +304,36 @@ def check_device(device): for device in get_all_backend(): check_device(device) +def verify_arange(start, stop, step): + if start is None and step is None: + A = topi.arange(stop) + a_np = np.arange(stop) + elif start is None: + A = topi.arange(stop, step=step) + a_np = np.arange(stop, step=step) + elif step is None: + A = topi.arange(start, stop) + a_np = np.arange(start, stop) + else: + A = topi.arange(start, stop, step) + a_np = np.arange(start, stop, step) + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_injective(A) + f 
= tvm.build(s, [A], device, name="arange") + a_nd = tvm.nd.empty(a_np.shape, dtype='float32', ctx=ctx) + f(a_nd) + tvm.testing.assert_allclose(a_nd.asnumpy(), a_np) + + for device in get_all_backend(): + check_device(device) + def test_strided_slice(): verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2]) verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1]) @@ -407,6 +437,18 @@ def test_gather_nd(): verify_gather_nd((2, 3, 4, 5), [[1, 0], [2, 1], [3, 2], [4, 2]], indices_dtype) +def test_arange(): + verify_arange(None, 20, None) + verify_arange(None, 20, 2) + verify_arange(1, 20, None) + verify_arange(1, 20, 2) + verify_arange(1, 20, 1.5) + verify_arange(1, 20.5, None) + verify_arange(1, 20, 3) + verify_arange(20, 1, -1) + verify_arange(20, 1, -1.5) + + if __name__ == "__main__": test_strided_slice() test_concatenate() @@ -419,3 +461,4 @@ def test_gather_nd(): test_expand_like() test_take() test_gather_nd() + test_arange() From d555b88d410cba658adddc474dc56c6747bf76ae Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Sun, 24 Feb 2019 14:24:32 -0600 Subject: [PATCH 16/93] Fix -Wreturn-std-move and -Wself-assign-overloaded (#2669) --- nnvm/src/pass/infer_shape_type.cc | 2 +- src/codegen/verilog/verilog_ir.cc | 4 ++-- src/pass/combine_context_call.cc | 2 +- src/relay/ir/expr_functor.cc | 2 +- src/relay/ir/type_functor.cc | 2 +- src/relay/pass/alter_op_layout.cc | 2 +- src/relay/pass/fuse_ops.cc | 2 +- src/relay/pass/to_a_normal_form.cc | 2 +- src/relay/pass/type_infer.cc | 2 +- topi/include/topi/nn/l2_normalize.h | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/nnvm/src/pass/infer_shape_type.cc b/nnvm/src/pass/infer_shape_type.cc index 0f322f12e9c4..d7ab212f3e9a 100644 --- a/nnvm/src/pass/infer_shape_type.cc +++ b/nnvm/src/pass/infer_shape_type.cc @@ -199,7 +199,7 @@ Graph InferAttr(Graph &&ret, ret.attrs[attr_name] = std::make_shared(std::move(rshape)); // number of nodes who knows the shape. 
ret.attrs[unknown_name] = std::make_shared(num_unknown); - return ret; + return std::move(ret); } NNVM_REGISTER_PASS(InferShape) diff --git a/src/codegen/verilog/verilog_ir.cc b/src/codegen/verilog/verilog_ir.cc index 0cc4b9cf3c21..e3be4c8c8b59 100644 --- a/src/codegen/verilog/verilog_ir.cc +++ b/src/codegen/verilog/verilog_ir.cc @@ -46,7 +46,7 @@ class StageInputReplacer : public IRMutator { Var new_var(it->second->var->name_hint + ".sync", op->type); inputs_.Set(new_var, it->second); replace_[op] = new_var; - return new_var; + return std::move(new_var); } Expr Mutate_(const Load* op, const Expr& e) final { CHECK(is_zero(op->index)) @@ -60,7 +60,7 @@ class StageInputReplacer : public IRMutator { Var data(it->second->var->name_hint + ".load.sync", op->type); inputs_.Set(data, it->second); replace_[op->buffer_var.get()] = data; - return data; + return std::move(data); } // inputs that get replaced. Map inputs_; diff --git a/src/pass/combine_context_call.cc b/src/pass/combine_context_call.cc index d60256bcfcf0..d3cbb2842134 100644 --- a/src/pass/combine_context_call.cc +++ b/src/pass/combine_context_call.cc @@ -39,7 +39,7 @@ class ContextCallCombiner final : public IRMutator { } Var ctx_var(name, ctx.type()); ctx_map_[ctx] = ctx_var; - return ctx_var; + return std::move(ctx_var); } } else { return IRMutator::Mutate_(op, e); diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc index 6265873d8310..8d2163e0ecc8 100644 --- a/src/relay/ir/expr_functor.cc +++ b/src/relay/ir/expr_functor.cc @@ -364,7 +364,7 @@ class ExprBinder : public ExprMutator { if (it != args_map_.end()) { return (*it).second; } else { - return id; + return std::move(id); } } diff --git a/src/relay/ir/type_functor.cc b/src/relay/ir/type_functor.cc index b88d0ee0e3ab..a05da3a980f4 100644 --- a/src/relay/ir/type_functor.cc +++ b/src/relay/ir/type_functor.cc @@ -192,7 +192,7 @@ class TypeBinder : public TypeMutator { if (it != args_map_.end()) { return (*it).second; } else { - return 
id; + return std::move(id); } } diff --git a/src/relay/pass/alter_op_layout.cc b/src/relay/pass/alter_op_layout.cc index b33d68a174bc..6d988eb2bcdf 100644 --- a/src/relay/pass/alter_op_layout.cc +++ b/src/relay/pass/alter_op_layout.cc @@ -34,7 +34,7 @@ Expr TransformLayout(Expr raw, Layout src_layout, Layout dst_layout) { attrs->src_layout = src_layout.name(); attrs->dst_layout = dst_layout.name(); Call transform = CallNode::make(transform_op, {raw}, Attrs{attrs}); - return transform; + return std::move(transform); } // Memorize layout transform so we can reuse internal transformed nodes diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 99a5421e2ff9..11a376b2b657 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -773,7 +773,7 @@ class FuseMutator : private ExprMutator { } else { // This is an intermediate node of a fused function // simply return the new call. - return new_call; + return std::move(new_call); } } else { return ExprMutator::VisitExpr_(call); diff --git a/src/relay/pass/to_a_normal_form.cc b/src/relay/pass/to_a_normal_form.cc index 53e2c1c594f8..e5da2dee2e03 100644 --- a/src/relay/pass/to_a_normal_form.cc +++ b/src/relay/pass/to_a_normal_form.cc @@ -398,7 +398,7 @@ class Fill : ExprFunctor { visited_->insert(gv); mod_->Update(gv, Downcast(relay::ToANormalForm(mod_->Lookup(gv), mod_, visited_))); } - return gv; + return std::move(gv); } Expr VisitExpr_(const OpNode* op, const Var& v) final { diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc index fa3cea610c68..b6bdedc04473 100644 --- a/src/relay/pass/type_infer.cc +++ b/src/relay/pass/type_infer.cc @@ -724,7 +724,7 @@ Expr InferType(const Expr& expr, const Module& mod_ref) { // FromExpr wraps a naked expression as a function, we will unbox // it here. 
if (expr.as()) { - return func; + return std::move(func); } else { return func->body; } diff --git a/topi/include/topi/nn/l2_normalize.h b/topi/include/topi/nn/l2_normalize.h index 6d98a75ec157..a9fd49cbee64 100644 --- a/topi/include/topi/nn/l2_normalize.h +++ b/topi/include/topi/nn/l2_normalize.h @@ -39,7 +39,7 @@ inline Tensor l2_normalize(const Tensor& data, topi::sqrt(tvm::compute(expand_sum->shape, [&](const Array& i){ return (max(expand_sum(i), eps)); - }, name = name, tag = tag))); + }, name, tag))); } } // namespace nn } // namespace topi From 5f1e59dddb69327279cefd114a023424b16a8c38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Sun, 24 Feb 2019 21:36:17 -0800 Subject: [PATCH 17/93] [Relay] add more function to prelude (#2660) --- python/tvm/relay/backend/interpreter.py | 7 ++- python/tvm/relay/prelude.py | 59 ++++++++++++++++++++++++- src/relay/ir/module.cc | 1 + src/relay/pass/type_solver.cc | 7 +-- tests/python/relay/test_adt.py | 17 +++++++ 5 files changed, 83 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index 46bb82d1a725..7a70a6e45e17 100644 --- a/python/tvm/relay/backend/interpreter.py +++ b/python/tvm/relay/backend/interpreter.py @@ -250,12 +250,15 @@ def optimize(self, expr): The optimized expression. 
""" # TODO: We need to move this optimization code into the optimizer/pass manager - ck_expr = ir_pass.infer_type(expr, mod=self.mod) + wrapped_expr = expr if isinstance(expr, Function) else Function([], expr) + if self.mod: + self.mod[self.mod.entry_func] = wrapped_expr + ck_expr = ir_pass.infer_type(wrapped_expr, mod=self.mod) simp_expr = ir_pass.simplify_inference(ck_expr) ck_simp = ir_pass.infer_type(simp_expr, mod=self.mod) fused_expr = ir_pass.fuse_ops(ck_simp) ck_fused = ir_pass.infer_type(fused_expr, mod=self.mod) - return ck_fused + return ck_fused if isinstance(expr, Function) else Call(ck_fused, []) def _make_executor(self, expr): def _interp_wrapper(*args, **kwargs): diff --git a/python/tvm/relay/prelude.py b/python/tvm/relay/prelude.py index 99b6c8d1c766..034b58ef1c7e 100644 --- a/python/tvm/relay/prelude.py +++ b/python/tvm/relay/prelude.py @@ -340,7 +340,10 @@ def define_tree_map(self): Match(t, [rose_case]), self.tree(b), [a, b]) def define_tree_size(self): - """Defines a function that computes the size of a tree as a nat.""" + """Defines a function that computes the size of a tree as a nat. + + Signature: fn(t : tree[a]) -> nat + """ self.size = GlobalVar("size") a = TypeVar("a") t = Var("t", self.tree(a)) @@ -351,6 +354,56 @@ def define_tree_size(self): self.mod[self.size] = Function([t], Match(t, [rose_case]), self.nat(), [a]) + def define_id(self): + """Defines a function that return it's argument. + + Signature: fn(x : a) -> a + """ + self.id = GlobalVar("id") + a = TypeVar("a") + x = Var("x", a) + self.mod[self.id] = Function([x], x, a, [a]) + + + def define_compose(self): + """Defines a function that compose two function. 
+ + Signature: fn(f : fn(b) -> c, g : fn(a) -> b) -> fn(a) -> c + """ + self.compose = GlobalVar("compose") + a = TypeVar("a") + b = TypeVar("b") + c = TypeVar("c") + f = Var("f", FuncType([b], c)) + g = Var("g", FuncType([a], b)) + x = Var("x") + self.mod[self.compose] = Function([f, g], + Function([x], f(g(x))), + FuncType([a], c), + [a, b, c]) + + + def define_iterate(self): + """Define a function that take a number n, a function f, + and return a closure that apply f n time on it's argument. + + Signature: fn(n : nat, f : fn(a) -> a) -> fn(a) -> a + """ + self.iterate = GlobalVar("iterate") + a = TypeVar("a") + f = Var("f", FuncType([a], a)) + x = Var("x", self.nat()) + y = Var("y", self.nat()) + z = Var("z") + z_case = Clause(PatternConstructor(self.z), Function([z], z)) + # todo: fix typechecker so Function([z], z) can be replaced by self.id + s_case = Clause(PatternConstructor(self.s, [PatternVar(y)]), + self.compose(f, self.iterate(f, y))) + self.mod[self.iterate] = Function([f, x], + Match(x, [z_case, s_case]), + FuncType([a], a), + [a]) + def __init__(self, mod): self.mod = mod self.define_list_adt() @@ -377,3 +430,7 @@ def __init__(self, mod): self.define_tree_adt() self.define_tree_map() self.define_tree_size() + + self.define_id() + self.define_compose() + self.define_iterate() diff --git a/src/relay/ir/module.cc b/src/relay/ir/module.cc index da273265ae33..dc7b3074d2ef 100644 --- a/src/relay/ir/module.cc +++ b/src/relay/ir/module.cc @@ -83,6 +83,7 @@ void ModuleNode::Add(const GlobalVar& var, CHECK(AlphaEqual(type, old_type)) << "Module#update changes type, not possible in this mode."; } + var->checked_type_ = type; AddUnchecked(var, checked_func); } diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc index fd15c91e79f7..179f90a2fe15 100644 --- a/src/relay/pass/type_solver.cc +++ b/src/relay/pass/type_solver.cc @@ -400,11 +400,8 @@ Type TypeSolver::Unify(const Type& dst, const Type& src, const NodeRef&) { } void 
TypeSolver::ReportError(const Error& err, const NodeRef& location) { - this->err_reporter_->ReportAt( - this->current_func, - location, - err); - } + err_reporter_->ReportAt(current_func, location, err); +} // Add type constraint to the solver. void TypeSolver::AddConstraint(const TypeConstraint& constraint, const NodeRef& loc) { diff --git a/tests/python/relay/test_adt.py b/tests/python/relay/test_adt.py index 5acae6c70295..a67ee7542e8a 100644 --- a/tests/python/relay/test_adt.py +++ b/tests/python/relay/test_adt.py @@ -43,6 +43,9 @@ tmap = p.tmap size = p.size +compose = p.compose +iterate = p.iterate + # this is an example of using the adt value in python side def count(n): assert isinstance(n, ConstructorValue) @@ -93,6 +96,7 @@ def tree_to_dict(t): def test_nat_value(): assert count(make_nat(10)) == 10 + assert count(intrp.evaluate(s(s(z())))) == 2 def test_nat_constructor(): @@ -577,6 +581,17 @@ def test_nested_pattern_match(): assert count(res) == 2 +def test_compose(): + n = relay.Var('n') + inc = relay.Function([n], s(n)) + x = relay.Var('x') + res = intrp.evaluate(relay.Call(compose(inc, double), [s(s(z()))])) + assert count(res) == 5 + +def test_iterate(): + expr = relay.Call(iterate(double, build_nat(2)), [build_nat(3)]) + res = intrp.evaluate(relay.Function([], expr)()) + assert count(res) == 12 if __name__ == "__main__": test_nat_constructor() @@ -598,3 +613,5 @@ def test_nested_pattern_match(): test_sum() test_tmap() test_size() + test_compose() + test_iterate() From 722bcc87298a1c48d44fb84e45a50f16681f5679 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Mon, 25 Feb 2019 09:30:54 -0800 Subject: [PATCH 18/93] [BUILD] Simplify after bind device type (#2670) --- src/pass/make_api.cc | 41 ++++++++++++++++++-- tests/python/unittest/test_codegen_c_host.py | 5 +-- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/src/pass/make_api.cc b/src/pass/make_api.cc index 41f92ad24085..d4f033143d6a 100644 --- a/src/pass/make_api.cc +++ 
b/src/pass/make_api.cc @@ -176,13 +176,13 @@ class DeviceTypeBinder: public IRMutator { explicit DeviceTypeBinder(int device_type) : device_type_(device_type) {} - Stmt Mutate_(const AttrStmt* op, const Stmt &s) final { + Stmt Mutate_(const AttrStmt* op, const Stmt& s) final { if (op->attr_key == attr::device_context_type) { if (const Variable* var = op->value.as()) { - std::unordered_map dmap; + var_ = var; Expr value = make_const(op->value.type(), device_type_); - dmap[var] = value; - Stmt body = Substitute(s, dmap); + Stmt body = IRMutator::Mutate_(op, s); + var_ = nullptr; std::ostringstream os; os << "device_type need to be " << device_type_; return AssertStmt::make(op->value == value, os.str(), body); @@ -191,7 +191,40 @@ class DeviceTypeBinder: public IRMutator { return IRMutator::Mutate_(op, s); } + Stmt Mutate_(const IfThenElse* op, const Stmt& s) final { + // eager simplify if guard. + Stmt res = IRMutator::Mutate_(op, s); + op = res.as(); + if (is_zero(op->condition)) { + if (op->else_case.defined()) return op->else_case; + return Evaluate::make(0); + } + if (is_one(op->condition)) { + return op->then_case; + } + return res; + } + + Expr Mutate_(const NE* op, const Expr& e) final { + // eager check NE for device check + Expr res = IRMutator::Mutate_(op, e); + op = res.as(); + if (ir::Equal(op->a, op->b)) { + return make_const(op->type, false); + } + return res; + } + + Expr Mutate_(const Variable* op, const Expr& e) final { + if (op == var_) { + return make_const(op->type, device_type_); + } else { + return e; + } + } + public: + const Variable* var_{nullptr}; int device_type_; }; diff --git a/tests/python/unittest/test_codegen_c_host.py b/tests/python/unittest/test_codegen_c_host.py index 00acbeb88fcf..f6a69c3a7b13 100644 --- a/tests/python/unittest/test_codegen_c_host.py +++ b/tests/python/unittest/test_codegen_c_host.py @@ -11,10 +11,7 @@ def test_add(): s = tvm.create_schedule(C.op) def check_c(): - f1 = tvm.lower(s, [A, B, C], name="fadd") - fsplits 
= [x for x in tvm.ir_pass.SplitHostDevice(f1)] - fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) - mhost = tvm.codegen.build_module(fsplits[0], "c") + mhost = tvm.build(s, [A, B, C], "c", name="fadd") temp = util.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) From 16623e7f0e147d6beda74033f210ece4fdedf70d Mon Sep 17 00:00:00 2001 From: Jian Weng Date: Mon, 25 Feb 2019 18:18:41 -0800 Subject: [PATCH 19/93] [Hybrid Script] Add `max_num_threads` (#2672) * i think it works for now? * fix lint * fix 2/3 compat * fix py2 again * fine, i gave up --- python/tvm/hybrid/calls.py | 14 ++++- python/tvm/hybrid/parser.py | 27 +++++---- python/tvm/hybrid/preprocessor.py | 3 + python/tvm/hybrid/runtime.py | 61 ++++++++++++--------- src/contrib/hybrid/codegen_hybrid.cc | 3 + tests/python/unittest/test_hybrid_script.py | 16 ++++++ 6 files changed, 86 insertions(+), 38 deletions(-) diff --git a/python/tvm/hybrid/calls.py b/python/tvm/hybrid/calls.py index cd1e4e3a2085..56a73f784fa0 100644 --- a/python/tvm/hybrid/calls.py +++ b/python/tvm/hybrid/calls.py @@ -4,6 +4,7 @@ from .. import api as _api from .. import expr as _expr from .. import make as _make +from .. import target as _tgt from ..container import Array from .. 
import ir_pass from ..stmt import For @@ -123,7 +124,7 @@ def ceil_div(func_id, args): _internal_assert(isinstance(args[0], _expr.Expr), "Only expressions can div") _internal_assert(isinstance(args[1], _expr.Expr), "Only expressions can div") a, b = args[0], args[1] - return (a + b - 1) / b + return (a + b - 1) // b def likely(func_id, args): @@ -131,3 +132,14 @@ def likely(func_id, args): "Only one expression can be likely") _internal_assert(func_id == "likely", "This function cannot be directly invoked!") return call_pure_intrin(args[0].dtype, 'likely', *args) + + +def max_num_threads(func_id, args): + _internal_assert(func_id == "max_num_threads", "This function cannot be directly invoked!") + _internal_assert(args.__len__() <= 1, "At most one argument accepted!") + if args.__len__() == 0: + res = _tgt.current_target().max_num_threads + else: + _internal_assert(isinstance(args[0], _expr.UIntImm), "In tvm bool should be uint") + res = _tgt.current_target(args[0].value).max_num_threads + return _api.convert(res) diff --git a/python/tvm/hybrid/parser.py b/python/tvm/hybrid/parser.py index 0959c9df2e91..67a6f6632d16 100644 --- a/python/tvm/hybrid/parser.py +++ b/python/tvm/hybrid/parser.py @@ -219,6 +219,8 @@ def visit_Expr(self, node): def visit_Name(self, node): name = node.id + if sys.version_info[0] == 2 and name in ['True', 'False']: + return _api.convert(eval(name)) #pylint: disable=eval-used ty, entry = self.symbols[name] _internal_assert(name in self.symbols, "Unknown symbol %s!" 
% name) if ty in [Symbol.LoopVar, Symbol.Input, Symbol.ConstLoopVar]: @@ -248,6 +250,10 @@ def visit_Num(self, node): return _api.const(node.n, dtype) + def visit_NameConstant(self, node): + return _api.convert(node.value) + + def visit_AugAssign(self, node): buf = self.visit(node.target) rhs = self.visit(node.value) @@ -450,17 +456,18 @@ def visit_Call(self, node): func_id = node.func.id args = [self.visit(i) for i in node.args] - try: + # Intrinsics' + if hasattr(calls, func_id): return getattr(calls, func_id)(func_id, args) - except AttributeError: - _internal_assert(func_id in self.symbols.keys(), \ - "The function called is not in the context either!") - ty, entry = self.symbols[func_id] - _internal_assert(ty is Symbol.Callable, \ - "Are you sure what you call is a function?!") - outs = entry(*args) - op = outs.op if isinstance(outs, Tensor) else outs[0].op - return op + # Contexts' + _internal_assert(func_id in self.symbols.keys(), \ + "The function called (%s) is not in the context either!" % func_id) + ty, entry = self.symbols[func_id] + _internal_assert(ty is Symbol.Callable, \ + "Are you sure what you call is a function?!") + outs = entry(*args) + op = outs.op if isinstance(outs, Tensor) else outs[0].op + return op def visit_For(self, node): diff --git a/python/tvm/hybrid/preprocessor.py b/python/tvm/hybrid/preprocessor.py index 50b610567c74..a83fb2eae287 100644 --- a/python/tvm/hybrid/preprocessor.py +++ b/python/tvm/hybrid/preprocessor.py @@ -59,6 +59,9 @@ def visit_AugAssign(self, node): def visit_Name(self, node): + # If it is True or False, we do not worry about it! + if sys.version_info[0] == 2 and node.id in ['True', 'False']: + return # If it is from the argument list or loop variable, we do not worry about it! 
if node.id in self._args.keys(): return diff --git a/python/tvm/hybrid/runtime.py b/python/tvm/hybrid/runtime.py index 293e069c24ea..b3c744f42652 100644 --- a/python/tvm/hybrid/runtime.py +++ b/python/tvm/hybrid/runtime.py @@ -1,6 +1,7 @@ """Intrinsics of TVM-Python Hybrid Script for Python emulation runtime""" import numpy +from .. import target class bind(object): #pylint: disable=invalid-name @@ -72,34 +73,40 @@ def sigmoid(x): return 1 / (1 + numpy.exp(-x)) +def max_num_threads(allow_none=True): + """Get max number of threads for GPU targets.""" + return target.current_target(allow_none).max_num_threads + + HYBRID_GLOBALS = { - 'unroll' : range, - 'vectorize' : range, - 'parallel' : range, - 'const_range' : range, - 'bind' : bind, - 'allocate' : allocate, - 'output_tensor': allocate, - 'sqrt' : numpy.sqrt, - 'log' : numpy.log, - 'tanh' : numpy.tanh, - 'power' : numpy.power, - 'exp' : numpy.exp, - 'sigmoid' : sigmoid, - 'popcount' : popcount, - 'likely' : lambda cond: cond, - 'uint8' : numpy.uint8, - 'uint16' : numpy.uint16, - 'uint32' : numpy.uint32, - 'uint64' : numpy.uint64, - 'int8' : numpy.int8, - 'int16' : numpy.int16, - 'int32' : numpy.int32, - 'int64' : numpy.int64, - 'float16' : numpy.float16, - 'float32' : numpy.float32, - 'float64' : numpy.float64, - 'ceil_div' : lambda a, b: (a + b - 1) / b + 'unroll' : range, + 'vectorize' : range, + 'parallel' : range, + 'const_range' : range, + 'bind' : bind, + 'allocate' : allocate, + 'output_tensor' : allocate, + 'sqrt' : numpy.sqrt, + 'log' : numpy.log, + 'tanh' : numpy.tanh, + 'power' : numpy.power, + 'exp' : numpy.exp, + 'sigmoid' : sigmoid, + 'popcount' : popcount, + 'likely' : lambda cond: cond, + 'uint8' : numpy.uint8, + 'uint16' : numpy.uint16, + 'uint32' : numpy.uint32, + 'uint64' : numpy.uint64, + 'int8' : numpy.int8, + 'int16' : numpy.int16, + 'int32' : numpy.int32, + 'int64' : numpy.int64, + 'float16' : numpy.float16, + 'float32' : numpy.float32, + 'float64' : numpy.float64, + 'ceil_div' : lambda a, 
b: (a + b - 1) // b, + 'max_num_threads': max_num_threads } diff --git a/src/contrib/hybrid/codegen_hybrid.cc b/src/contrib/hybrid/codegen_hybrid.cc index 2117d471eeee..56564d668001 100644 --- a/src/contrib/hybrid/codegen_hybrid.cc +++ b/src/contrib/hybrid/codegen_hybrid.cc @@ -400,6 +400,8 @@ void CodeGenHybrid::ReserveKeywords() { GetUniqueName("for"); GetUniqueName("in"); GetUniqueName("range"); + GetUniqueName("True"); + GetUniqueName("False"); GetUniqueName("unroll"); GetUniqueName("const_range"); GetUniqueName("parallel"); @@ -434,6 +436,7 @@ void CodeGenHybrid::ReserveKeywords() { GetUniqueName("float32"); GetUniqueName("float64"); GetUniqueName("ceil_div"); + GetUniqueName("max_num_threads"); } void CodeGenHybrid::DumpStmt(const Stmt &stmt, diff --git a/tests/python/unittest/test_hybrid_script.py b/tests/python/unittest/test_hybrid_script.py index d35c8ab3a0df..5bed58c8f617 100644 --- a/tests/python/unittest/test_hybrid_script.py +++ b/tests/python/unittest/test_hybrid_script.py @@ -350,6 +350,22 @@ def foo(a): func, ins, outs = run_and_check(foo, [a], target='cuda') run_and_check(func, ins, outs=outs, target='cuda') + @tvm.hybrid.script + def max_threads(a): + b = output_tensor(a.shape, a.dtype) + n = a.shape[0] + m = max_num_threads(True) + for i in bind('threadIdx.x', m): + for j in bind('blockIdx.x', ceil_div(n, m)): + if i * m + j < n: + b[i * m + j] = a[i * m + j] + a[i * m + j] + return b + + a = tvm.placeholder((10000, ), 'float32') + with tvm.target.create('cuda'): + func, ins, outs = run_and_check(max_threads, [a], target='cuda') + run_and_check(func, ins, outs=outs, target='cuda') + def test_math_intrin(): @script From f713ba0dd5e7807580a9e9c0894eb32d8567b763 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Mon, 25 Feb 2019 18:38:56 -0800 Subject: [PATCH 20/93] fix (#2674) --- src/relay/ir/alpha_equal.cc | 90 ++++++++++++++++++------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff 
--git a/src/relay/ir/alpha_equal.cc b/src/relay/ir/alpha_equal.cc index 96517f8dd445..aa9336b29153 100644 --- a/src/relay/ir/alpha_equal.cc +++ b/src/relay/ir/alpha_equal.cc @@ -217,20 +217,20 @@ class AlphaEqualHandler: return false; } - bool VisitType_(const GlobalTypeVarNode* op, const Type& t2) final { - return GetRef(op) == t2; + bool VisitType_(const GlobalTypeVarNode* lhs, const Type& other) final { + return GetRef(lhs) == other; } - bool VisitType_(const TypeCallNode* op, const Type& t2) final { - const TypeCallNode* pt = t2.as(); - if (pt == nullptr - || op->args.size() != pt->args.size() - || !TypeEqual(op->func, pt->func)) { + bool VisitType_(const TypeCallNode* lhs, const Type& other) final { + const TypeCallNode* rhs = other.as(); + if (rhs == nullptr + || lhs->args.size() != rhs->args.size() + || !TypeEqual(lhs->func, rhs->func)) { return false; } - for (size_t i = 0; i < op->args.size(); ++i) { - if (!TypeEqual(op->args[i], pt->args[i])) { + for (size_t i = 0; i < lhs->args.size(); ++i) { + if (!TypeEqual(lhs->args[i], rhs->args[i])) { return false; } } @@ -369,8 +369,8 @@ class AlphaEqualHandler: } } - bool VisitExpr_(const OpNode* op, const Expr& other) final { - return op == other.get(); + bool VisitExpr_(const OpNode* lhs, const Expr& other) final { + return lhs == other.get(); } bool VisitExpr_(const ConstantNode* lhs, const Expr& other) final { @@ -389,80 +389,80 @@ class AlphaEqualHandler: } } - bool VisitExpr_(const RefCreateNode* op, const Expr& e2) final { - if (const RefCreateNode* nr = e2.as()) { - return ExprEqual(op->value, nr->value); + bool VisitExpr_(const RefCreateNode* lhs, const Expr& other) final { + if (const RefCreateNode* rhs = other.as()) { + return ExprEqual(lhs->value, rhs->value); } else { return false; } } - bool VisitExpr_(const RefReadNode* op, const Expr& e2) final { - if (const RefReadNode* r = e2.as()) { - return ExprEqual(op->ref, r->ref); + bool VisitExpr_(const RefReadNode* lhs, const Expr& other) final { + if 
(const RefReadNode* rhs = other.as()) { + return ExprEqual(lhs->ref, rhs->ref); } else { return false; } } - bool VisitExpr_(const RefWriteNode* op, const Expr& e2) final { - if (const RefWriteNode* r = e2.as()) { - return ExprEqual(op->ref, r->ref) && ExprEqual(op->value, r->value); + bool VisitExpr_(const RefWriteNode* lhs, const Expr& other) final { + if (const RefWriteNode* rhs = other.as()) { + return ExprEqual(lhs->ref, rhs->ref) && ExprEqual(lhs->value, rhs->value); } else { return false; } } - bool VisitExpr_(const ConstructorNode* op, const Expr& e2) final { - return GetRef(op) == e2; + bool VisitExpr_(const ConstructorNode* lhs, const Expr& other) final { + return GetRef(lhs) == other; } - bool ClauseEqual(const Clause& l, const Clause& r) { - return PatternEqual(l->lhs, r->lhs) && ExprEqual(l->rhs, r->rhs); + bool ClauseEqual(const Clause& lhs, const Clause& rhs) { + return PatternEqual(lhs->lhs, rhs->lhs) && ExprEqual(lhs->rhs, rhs->rhs); } - bool PatternEqual(const Pattern& l, const Pattern& r) { - return VisitPattern(l, r); + bool PatternEqual(const Pattern& lhs, const Pattern& rhs) { + return VisitPattern(lhs, rhs); } - bool VisitPattern_(const PatternWildcardNode* op, const Pattern& r) final { - return r.as(); + bool VisitPattern_(const PatternWildcardNode* lhs, const Pattern& other) final { + return other.as(); } - bool VisitPattern_(const PatternVarNode* op, const Pattern& e2) final { - if (const auto* r = e2.as()) { - return MergeVarDecl(op->var, r->var); + bool VisitPattern_(const PatternVarNode* lhs, const Pattern& other) final { + if (const auto* rhs = other.as()) { + return MergeVarDecl(lhs->var, rhs->var); } return false; } - bool VisitPattern_(const PatternConstructorNode* op, const Pattern& e2) final { - const auto* r = e2.as(); - if (r == nullptr - || !ExprEqual(op->constructor, r->constructor) - || op->patterns.size() != r->patterns.size()) { + bool VisitPattern_(const PatternConstructorNode* lhs, const Pattern& other) final { + const 
auto* rhs = other.as(); + if (rhs == nullptr + || !ExprEqual(lhs->constructor, rhs->constructor) + || lhs->patterns.size() != rhs->patterns.size()) { return false; } - for (size_t i = 0; i < op->patterns.size(); i++) { - if (!PatternEqual(op->patterns[i], r->patterns[i])) { + for (size_t i = 0; i < lhs->patterns.size(); i++) { + if (!PatternEqual(lhs->patterns[i], rhs->patterns[i])) { return false; } } return true; } - bool VisitExpr_(const MatchNode* op, const Expr& e2) final { - const MatchNode* r = e2.as(); + bool VisitExpr_(const MatchNode* lhs, const Expr& other) final { + const MatchNode* rhs = other.as(); - if (r == nullptr - || !ExprEqual(op->data, r->data) - || op->clauses.size() != r->clauses.size()) { + if (rhs == nullptr + || !ExprEqual(lhs->data, rhs->data) + || lhs->clauses.size() != rhs->clauses.size()) { return false; } - for (size_t i = 0; i < op->clauses.size(); ++i) { - if (!ClauseEqual(op->clauses[i], r->clauses[i])) { + for (size_t i = 0; i < lhs->clauses.size(); ++i) { + if (!ClauseEqual(lhs->clauses[i], rhs->clauses[i])) { return false; } } From 526e6921b5855325f54d498bb851ac8e4a3407c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Mon, 25 Feb 2019 18:40:46 -0800 Subject: [PATCH 21/93] [Relay] fix error in ANF (too agressively inline atomic expression and create free variable). (#2665) --- src/relay/pass/to_a_normal_form.cc | 30 ++++++++++++++------- tests/python/relay/test_to_a_normal_form.py | 10 +++++++ 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/relay/pass/to_a_normal_form.cc b/src/relay/pass/to_a_normal_form.cc index e5da2dee2e03..46a4b92ac9b9 100644 --- a/src/relay/pass/to_a_normal_form.cc +++ b/src/relay/pass/to_a_normal_form.cc @@ -256,6 +256,10 @@ bool IsPrimitiveFunction(const Expr& e) { return e.as() && Downcast(e)->IsPrimitive(); } +/* Special care is needed to handle local recursion. 
+ * Fill additionally take a (possibly null) Var argument, + * If it is not null, Fill is required to bind the transformed result to that var. + */ class Fill : ExprFunctor { public: static Expr ToANormalForm(const Expr& e, @@ -307,12 +311,18 @@ class Fill : ExprFunctor { } Expr VisitExpr(const Expr& e) { - Var v = VarNode::make(std::string("x"), IncompleteTypeNode::make(Kind::kType)); - return this->VisitExpr(e, v); + return this->VisitExpr(e, Var()); + } + + Expr Atomic(const Expr& orig, const Expr& now, const Var& v) { + return v.defined() ? GetScope(orig)->ll->Push(v, now) : now; } Expr Compound(const Expr& orig, const Expr& now, const Var& v) { - return GetScope(orig)->ll->Push(v, now); + Var var = v.defined() ? + v : + VarNode::make(std::string("x"), IncompleteTypeNode::make(Kind::kType)); + return GetScope(orig)->ll->Push(var, now); } Expr VisitExpr_(const CallNode* c, const Var& v) final { @@ -389,7 +399,8 @@ class Fill : ExprFunctor { } Expr VisitExpr_(const VarNode* vn, const Var& v) final { - return GetRef(vn); + Expr e = GetRef(vn); + return Atomic(e, e, v); } Expr VisitExpr_(const GlobalVarNode* gvn, const Var& v) final { @@ -398,15 +409,17 @@ class Fill : ExprFunctor { visited_->insert(gv); mod_->Update(gv, Downcast(relay::ToANormalForm(mod_->Lookup(gv), mod_, visited_))); } - return std::move(gv); + return Atomic(gv, gv, v); } Expr VisitExpr_(const OpNode* op, const Var& v) final { - return GetRef(op); + Expr e = GetRef(op); + return Atomic(e, e, v); } Expr VisitExpr_(const ConstructorNode* c, const Var& v) final { - return GetRef(c); + Expr e = GetRef(c); + return Atomic(e, e, v); } Expr VisitExpr_(const MatchNode* m, const Var& v) final { @@ -418,8 +431,7 @@ class Fill : ExprFunctor { c->lhs, GetSubScope(e, 1 + clauses.size())->ll->Get(VisitExpr(c->rhs)))); } - Expr r = Compound(e, MatchNode::make(data, clauses), v); - return r; + return Compound(e, MatchNode::make(data, clauses), v); } }; diff --git a/tests/python/relay/test_to_a_normal_form.py 
b/tests/python/relay/test_to_a_normal_form.py index c15dc8ffc269..392e1769e57d 100644 --- a/tests/python/relay/test_to_a_normal_form.py +++ b/tests/python/relay/test_to_a_normal_form.py @@ -138,6 +138,15 @@ def test_add(): assert count(intrp.evaluate(to_a_normal_form(add(s(z()), s(z())), mod))) == 2 assert "let" in mod[add].astext() +def test_let(): + x = relay.Var("x") + y = relay.Var("y") + d = relay.const(4.0, 'float32') + body = relay.Let(y, x, x + y) + body = relay.Let(x, d, body) + check_eval(body, 8) + check_eval(to_a_normal_form(body), 8) + if __name__ == '__main__': test_explicit_bound() test_order() @@ -145,3 +154,4 @@ def test_add(): test_recursion() test_ref() test_add() + test_let() From 8d66e4dad3fe75aa13349583a93b1eaad208d3f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=9C=A8=E5=8E=9F=E4=BD=90=E4=B8=BA?= Date: Tue, 26 Feb 2019 11:17:24 +0800 Subject: [PATCH 22/93] Add CONCATENATION to tflite frontend, support Inception V3 (#2643) * Add CONCATENATION to tflite frontend * fix typo * Fix codestyle * Fix code style * simplify convert map * Update --- python/tvm/relay/frontend/tflite.py | 46 +++++++++++- tests/python/frontend/tflite/test_forward.py | 75 +++++++++++++++++++- 2 files changed, 119 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index d63b470d48ab..d45bb33859b2 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -35,6 +35,8 @@ def __init__(self, model, subgraph, exp_tab): self.builtin_op_code = build_str_map(BuiltinOperator()) self.activation_fn_type = build_str_map(ActivationFunctionType()) self.builtin_options = build_str_map(BuiltinOptions()) + + # Add more operators self.convert_map = { 'CONV_2D': self.convert_conv2d, 'DEPTHWISE_CONV_2D': self.convert_depthwise_conv2d, @@ -43,7 +45,7 @@ def __init__(self, model, subgraph, exp_tab): 'SOFTMAX': self.convert_softmax, 'SQUEEZE': self.convert_squeeze, 'MAX_POOL_2D': 
self.convert_max_pool2d, - # Add more operators + "CONCATENATION": self.convert_concatenation } def check_unsupported_ops(self): @@ -245,6 +247,48 @@ def convert_softmax(self, op): return out + def convert_concatenation(self, op): + """ convert TFLite concatenation""" + try: + from tflite.Operator import Operator + from tflite.ConcatenationOptions import ConcatenationOptions + from tflite.BuiltinOptions import BuiltinOptions + from tflite.ActivationFunctionType import ActivationFunctionType + except ImportError: + raise ImportError("The tflite package must be installed") + + assert isinstance(op, Operator) + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) >= 1, "input tensors should greater than 1" + in_exprs = [self.get_expr(input_tensor.tensor_idx) for input_tensor in input_tensors] + + output_tensors = self.get_output_tensors(op) + assert len(output_tensors) == 1, "output tensors should be 1" + + assert op.BuiltinOptionsType() == BuiltinOptions.ConcatenationOptions + op_options = op.BuiltinOptions() + concatenation_options = ConcatenationOptions() + concatenation_options.Init(op_options.Bytes, op_options.Pos) + concatenation_axis = concatenation_options.Axis() + fused_activation_fn = concatenation_options.FusedActivationFunction() + input_shape_length = len(input_tensors[0].tensor.ShapeAsNumpy()) + + # TFLite is N H W C, our layout is N C H W + if input_shape_length <= 4: + axis_convert_map = [0] + list(range(2, input_shape_length)) + [1] + concatenation_axis = axis_convert_map[concatenation_axis] + else: + raise NotImplementedError("Not support input shape length {} of concatenatio : " + .format(str(input_shape_length))) + + # with axis in N H W C + out = _op.concatenate(in_exprs, axis=concatenation_axis) + + # if we have activation fn + if fused_activation_fn != ActivationFunctionType.NONE: + out = self.convert_fused_activation_function(out, fused_activation_fn) + return out + def convert_squeeze(self, op): """Convert TFLite squeeze""" 
try: diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 3c048435fba8..0b314cced520 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -283,6 +283,53 @@ def test_forward_reshape(): _test_reshape(np.arange(6), [-1]) +####################################################################### +# Concatenation +# ------------- + +def _test_concatenation(data, axis): + """ One iteration of concatenation """ + + assert len(data) >= 1 + need_transpose = False + if len(data[0].shape) == 1 or len(data[0].shape) == 2: + tvm_data = data + elif len(data[0].shape) == 3: + #need_transpose = True + tvm_data = [np.transpose(d, axes=(0, 2, 1)) for d in data] + elif len(data[0].shape) == 4: + need_transpose = True + tvm_data = [np.transpose(d, axes=(0, 3, 1, 2)) for d in data] + else: + raise NotImplementedError("Not support input shape {} of reshape : ". + format(str(len(data)))) + + with tf.Graph().as_default(): + in_data = [ + array_ops.placeholder(shape=tensor.shape, dtype=tensor.dtype, name="in_{}".format(idx)) + for idx, tensor in enumerate(data)] + out = array_ops.concat(in_data, axis=axis) + name = ["in_{}:0".format(idx) for idx in range(len(data))] + + compare_tflite_with_tvm(data, tvm_data, name, in_data, [out], need_transpose) + + +def test_forward_concatenation(): + + _test_concatenation( + [np.arange(6).reshape((1, 2, 1, 3)), + np.arange(6).reshape((1, 2, 1, 3))], 1) + + _test_concatenation( + [np.arange(6).reshape((3, 2)), + np.arange(6).reshape((3, 2))], 1) + + _test_concatenation( + [np.arange(6).reshape((2, 1, 1, 3)), + np.arange(6).reshape((2, 1, 1, 3)), + np.arange(6).reshape((2, 1, 1, 3))], 1) + + ####################################################################### # Squeeze # ------- @@ -340,6 +387,7 @@ def test_forward_softmax(): ####################################################################### # Mobilenet # --------- + def 
test_forward_mobilenet(): '''test mobilenet v1 tflite model''' # MobilenetV1 @@ -347,19 +395,43 @@ def test_forward_mobilenet(): tflite_model_file = tf_testing.get_workload_official( "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz", "mobilenet_v1_1.0_224.tflite", temp) - tflite_model_buf = open(tflite_model_file, "rb").read() + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() data = np.random.uniform(size=(1, 224, 224, 3)).astype('float32') tvm_data = np.transpose(data, axes=(0, 3, 1, 2)) tflite_output = run_tflite_graph(tflite_model_buf, data) tvm_output = run_tvm_graph(tflite_model_buf, tvm_data, 'input') tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-5) + temp.remove() + +####################################################################### +# Inception V3 +# ------------ + +def test_forward_inception_v3_net(): + '''test inception v3 tflite model''' + # InceptionV3 + temp = util.tempdir() + tflite_model_file = tf_testing.get_workload_official( + "https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v3_2018_04_27.tgz", + "inception_v3.tflite", temp) + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + data = np.random.uniform(size=(1, 299, 299, 3)).astype('float32') + tvm_data = np.transpose(data, axes=(0, 3, 1, 2)) + tflite_output = run_tflite_graph(tflite_model_buf, data) + tvm_output = run_tvm_graph(tflite_model_buf, tvm_data, 'input') + tvm.testing.assert_allclose(np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), + rtol=1e-5, atol=1e-5) + temp.remove() ####################################################################### # Main # ---- if __name__ == '__main__': # Transforms + test_forward_concatenation() test_forward_reshape() test_forward_squeeze() @@ -370,3 +442,4 @@ def test_forward_mobilenet(): # End to End test_forward_mobilenet() + 
test_forward_inception_v3_net() From 85dd805109137dd1e5e646b2df1db2b74bf5b165 Mon Sep 17 00:00:00 2001 From: MORINAGA <34588258+imorinaga@users.noreply.github.com> Date: Wed, 27 Feb 2019 10:32:21 +0900 Subject: [PATCH 23/93] [AUTOTVM][Bugfix] Fix history loader for heterogeneous execution --- python/tvm/autotvm/task/dispatcher.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/tvm/autotvm/task/dispatcher.py b/python/tvm/autotvm/task/dispatcher.py index f497ddc038db..d2ef480b44ee 100644 --- a/python/tvm/autotvm/task/dispatcher.py +++ b/python/tvm/autotvm/task/dispatcher.py @@ -294,7 +294,8 @@ def load(self, records): # use model as key to build best map key = (inp.target.model, inp.task.workload) if key not in best_by_model: - best_by_model[key] = (inp, res) + if inp.target.model != 'unknown': + best_by_model[key] = (inp, res) else: _, other_res = best_by_model[key] if np.mean(other_res.costs) > np.mean(res.costs): From a39f27a885c86b1adb08f4e7aace969b1fdb5b6d Mon Sep 17 00:00:00 2001 From: hlu1 <14827759+hlu1@users.noreply.github.com> Date: Tue, 26 Feb 2019 18:32:27 -0800 Subject: [PATCH 24/93] [Graph Runtime] Run_individual for benchmarking individual layers (#2569) --- python/tvm/contrib/debugger/debug_runtime.py | 4 ++ .../graph/debug/graph_runtime_debug.cc | 69 +++++++++++++++++++ .../unittest/test_runtime_graph_debug.py | 4 ++ 3 files changed, 77 insertions(+) diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index a627a32dbd16..725f212fce00 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -89,6 +89,7 @@ def __init__(self, module, ctx, graph_json_str, dump_root): self._dump_path = None self._debug_run = module["debug_run"] self._get_output_by_layer = module["get_output_by_layer"] + self._run_individual = module["run_individual"] graph_runtime.GraphModule.__init__(self, module) self._create_debug_env(graph_json_str, 
ctx) @@ -222,6 +223,9 @@ def run(self, **input_dict): # Step 3. Display the collected information self.debug_datum.display_debug_result() + def run_individual(self, number, repeat=1, min_repeat_ms=0): + self._run_individual(number, repeat, min_repeat_ms) + def exit(self): """Exits the dump folder and all its contents""" self._remove_dump_root() diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc index 7140a647070f..71a869e13ae6 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph/debug/graph_runtime_debug.cc @@ -38,6 +38,65 @@ class GraphRuntimeDebug : public GraphRuntime { return time; } + /*! + * \brief Run each operation in the graph and print out the runtime per op. + * \param number The number of times to run this function for taking average. + * \param repeat The number of times to repeat the measurement. + In total, the function will be invoked (1 + number x repeat) times, + where the first one is warmed up and will be discarded in case + there is lazy initialization. + * \param min_repeat_ms The minimum duration of one `repeat` in milliseconds. + By default, one `repeat` contains `number` runs. If this parameter is set, + the parameters `number` will be dynamically adjusted to meet the + minimum duration requirement of one `repeat`. 
+ */ + void RunIndividual(int number, int repeat, int min_repeat_ms) { + // warmup run + GraphRuntime::Run(); + + std::vector time_per_op(op_execs_.size(), 0); + for (int i = 0; i < repeat; ++i) { + std::chrono::time_point< + std::chrono::high_resolution_clock, std::chrono::nanoseconds> tbegin, tend; + double duration_ms = 0.0; + do { + std::fill(time_per_op.begin(), time_per_op.end(), 0); + if (duration_ms > 0.0) { + number = static_cast( + std::max((min_repeat_ms / (duration_ms / number) + 1), + number * 1.618)); // 1.618 is chosen by random + } + tbegin = std::chrono::high_resolution_clock::now(); + for (int k = 0; k < number; k++) { + for (size_t index = 0; index < op_execs_.size(); ++index) { + if (op_execs_[index]) { + const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx; + auto op_tbegin = std::chrono::high_resolution_clock::now(); + op_execs_[index](); + TVMSynchronize(ctx.device_type, ctx.device_id, nullptr); + auto op_tend = std::chrono::high_resolution_clock::now(); + double op_duration = std::chrono::duration_cast< + std::chrono::duration >(op_tend - op_tbegin).count(); + time_per_op[index] += op_duration * 1000; // ms + } + } + } + tend = std::chrono::high_resolution_clock::now(); + duration_ms = std::chrono::duration_cast > + (tend - tbegin).count() * 1000; + } while (duration_ms < min_repeat_ms); + + LOG(INFO) << "Repeat: " << i; + int op = 0; + for (size_t index = 0; index < time_per_op.size(); index++) { + if (op_execs_[index]) { + time_per_op[index] /= number; + LOG(INFO) << "Op #" << op++ << ": " << time_per_op[index] << " ms/iter"; + } + } + } + } + /*! * \brief Run each operation and get the output. * \param index The index of op which needs to be returned. 
@@ -119,6 +178,16 @@ PackedFunc GraphRuntimeDebug::GetFunction( this->DebugGetNodeOutput(args[0], args[1]); } }); + } else if (name == "run_individual") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + int number = args[0]; + int repeat = args[1]; + int min_repeat_ms = args[2]; + CHECK_GT(number, 0); + CHECK_GT(repeat, 0); + CHECK_GE(min_repeat_ms, 0); + this->RunIndividual(number, repeat, min_repeat_ms); + }); } else { return GraphRuntime::GetFunction(name, sptr_to_self); } diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py index b9d8b689cb9e..4bbe6509c40c 100644 --- a/tests/python/unittest/test_runtime_graph_debug.py +++ b/tests/python/unittest/test_runtime_graph_debug.py @@ -68,6 +68,9 @@ def check_verify(): out = mod.get_output(0, tvm.nd.empty((n,))) np.testing.assert_equal(out.asnumpy(), a + 1) + #test individual run + mod.run_individual(20, 2, 1) + mod.exit() #verify dump root delete after cleanup assert(not os.path.exists(directory)) @@ -94,6 +97,7 @@ def check_remote(): mod.run(x=tvm.nd.array(a, ctx)) out = tvm.nd.empty((n,), ctx=ctx) out = mod.get_output(0, out) + mod.run_individual(20, 2, 1) np.testing.assert_equal(out.asnumpy(), a + 1) check_verify() From 1b703150261dbf8cf2f4a184eef4b6c8378645c3 Mon Sep 17 00:00:00 2001 From: Siju Date: Wed, 27 Feb 2019 09:41:25 +0530 Subject: [PATCH 25/93] REGION op removed from topi and added in darkent frontend (#2275) --- nnvm/python/nnvm/frontend/darknet.py | 36 ++++--- nnvm/python/nnvm/top/vision.py | 21 ---- nnvm/src/top/vision/yolo/region.cc | 35 ------- nnvm/src/top/vision/yolo/region.h | 101 -------------------- topi/include/topi/cuda/vision.h | 95 ------------------ topi/include/topi/rocm/vision.h | 33 ------- topi/include/topi/vision/yolo/region.h | 81 ---------------- topi/python/topi/cuda/vision.py | 18 ---- topi/python/topi/generic/vision.py | 36 ------- topi/python/topi/rocm/__init__.py | 1 - 
topi/python/topi/rocm/vision.py | 25 ----- topi/python/topi/testing/__init__.py | 2 - topi/python/topi/testing/region_python.py | 69 ------------- topi/python/topi/testing/shortcut_python.py | 47 --------- topi/python/topi/vision/__init__.py | 3 +- topi/python/topi/vision/shortcut.py | 45 --------- topi/python/topi/vision/yolo/__init__.py | 5 - topi/python/topi/vision/yolo/region.py | 39 -------- topi/src/topi.cc | 18 ---- topi/tests/python/test_topi_region.py | 49 ---------- topi/tests/python/test_topi_shortcut.py | 48 ---------- 21 files changed, 25 insertions(+), 782 deletions(-) delete mode 100644 nnvm/src/top/vision/yolo/region.cc delete mode 100644 nnvm/src/top/vision/yolo/region.h delete mode 100644 topi/include/topi/cuda/vision.h delete mode 100644 topi/include/topi/rocm/vision.h delete mode 100644 topi/include/topi/vision/yolo/region.h delete mode 100644 topi/python/topi/rocm/vision.py delete mode 100644 topi/python/topi/testing/region_python.py delete mode 100644 topi/python/topi/testing/shortcut_python.py delete mode 100644 topi/python/topi/vision/shortcut.py delete mode 100644 topi/python/topi/vision/yolo/__init__.py delete mode 100644 topi/python/topi/vision/yolo/region.py delete mode 100644 topi/tests/python/test_topi_region.py delete mode 100644 topi/tests/python/test_topi_shortcut.py diff --git a/nnvm/python/nnvm/frontend/darknet.py b/nnvm/python/nnvm/frontend/darknet.py index 39470bfb02ec..154c83c90ec6 100644 --- a/nnvm/python/nnvm/frontend/darknet.py +++ b/nnvm/python/nnvm/frontend/darknet.py @@ -302,18 +302,29 @@ def _darknet_reorg(inputs, attrs): def _darknet_region(inputs, attrs): """Process the region operation.""" - op_name, new_attrs = 'yolo_region', {} - if 'n' in attrs: - new_attrs['n'] = attrs.get('n', 1) - if 'classes' in attrs: - new_attrs['classes'] = attrs.get('classes', 1) - if 'coords' in attrs: - new_attrs['coords'] = attrs.get('coords', 0) - if 'background' in attrs: - new_attrs['background'] = attrs.get('background', 0) - if 
'softmax' in attrs: - new_attrs['softmax'] = attrs.get('softmax', 0) - return _darknet_get_nnvm_op(op_name)(*inputs, **new_attrs), None + num = attrs.get('n', 1) + classes = attrs.get('classes', 1) + coords = attrs.get('coords', 0) + background = attrs.get('background', 0) + softmax = attrs.get('softmax', True) + input_shape = attrs.get('shape') + + split_size = classes + coords + 1 + intermediate_shape = (input_shape[0], num, split_size, input_shape[2], input_shape[3]) + data_block = _sym.reshape(inputs[0], shape=intermediate_shape) + split_indices = (2, 4, 5) + split_res = _sym.split(data_block, indices_or_sections=split_indices, axis=2) + split_res0 = _sym.sigmoid(split_res[0]) + if not background: + split_res2 = _sym.sigmoid(split_res[2]) + else: + split_res2 = split_res[2] + if softmax: + split_res3 = _sym.softmax(split_res[3], axis=2) + concat_list = [split_res0, split_res[1], split_res2, split_res3] + out = _sym.concatenate(*concat_list, axis=2) + return _sym.reshape(out, shape=input_shape), None + def _darknet_yolo(inputs, attrs): """Process the yolo operation.""" @@ -638,6 +649,7 @@ def _get_darknet_attrs(self, layer, layer_num): attr.update({'coords' : layer.coords}) attr.update({'background' : layer.background}) attr.update({'softmax' : layer.softmax}) + attr.update({'shape' : (1, layer.c, layer.h, layer.w)}) elif LAYERTYPE.YOLO == layer.type: attr.update({'n' : layer.n}) diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py index 42cb32214abf..d12c82c1fc88 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -21,27 +21,6 @@ def schedule_reorg(attrs, outs, target): reg.register_pattern("yolo_reorg", OpPattern.INJECTIVE) -@reg.register_compute("yolo_region") -def compute_region(attrs, inputs, _): - """Compute definition of region""" - n = attrs.get_int("n") - classes = attrs.get_int("classes") - coords = attrs.get_int("coords") - background = attrs.get_int("background") - softmax = 
attrs.get_int("softmax") - with tvm.target.create(attrs.get_str("target")): - return topi.vision.yolo.region(inputs[0], n, classes, coords, - background, softmax) - - -@reg.register_schedule("yolo_region") -def schedule_region(attrs, outs, target): - """Schedule definition of region""" - with tvm.target.create(target): - return topi.generic.vision.schedule_region(outs) - -reg.register_pattern("yolo_region", OpPattern.OPAQUE) - # multibox_prior @reg.register_schedule("multibox_prior") def schedule_multibox_prior(_, outs, target): diff --git a/nnvm/src/top/vision/yolo/region.cc b/nnvm/src/top/vision/yolo/region.cc deleted file mode 100644 index 182c9b2ab3bc..000000000000 --- a/nnvm/src/top/vision/yolo/region.cc +++ /dev/null @@ -1,35 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \file region.cc - * \brief Property def of pooling operators. - */ -#include -#include -#include -#include -#include "../../op_common.h" -#include "region.h" - -namespace nnvm { -namespace top { - -NNVM_REGISTER_OP(yolo_region) -.describe(R"code(Region layer -)code" NNVM_ADD_FILELINE) -.set_num_inputs(1) -.set_num_outputs(1) -.set_support_level(5) -.add_argument("data", "Tensor", "Input data") -.set_attr("FInferType", RegionType<1, 1>) -.set_attr("FInferShape", RegionShape<1, 1>) -.set_attr( - "FInplaceOption", - [](const NodeAttrs &attrs) { - return std::vector>{{0, 0}, {1, 0}}; - }) -.set_attr("FGradient", [](const NodePtr &n, - const std::vector &ograds) { - return std::vector{ograds[0], ograds[0]}; -}); -} // namespace top -} // namespace nnvm diff --git a/nnvm/src/top/vision/yolo/region.h b/nnvm/src/top/vision/yolo/region.h deleted file mode 100644 index f9dc87c59c6c..000000000000 --- a/nnvm/src/top/vision/yolo/region.h +++ /dev/null @@ -1,101 +0,0 @@ -/*! 
- * Copyright (c) 2018 by Contributors - * \file region.h - */ -#ifndef NNVM_TOP_VISION_YOLO_REGION_H_ -#define NNVM_TOP_VISION_YOLO_REGION_H_ - -#include -#include -#include -#include -#include - -namespace nnvm { -namespace top { - -template -inline bool RegionAttr(const nnvm::NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs, - const AttrType &none) { - AttrType dattr = none; - size_t in_size = in_attrs->size(); - size_t out_size = out_attrs->size(); - if (n_in != -1) { - in_size = static_cast(n_in); - } - if (n_out != -1) { - out_size = static_cast(n_out); - } - - auto deduce = [&](std::vector *vec, size_t size, const char *name) { - for (size_t i = 0; i < size; ++i) { - if (i == 0) - CHECK(assign(&dattr, (*vec)[i])) - << "Incompatible attr in node " << attrs.name << " at " << i - << "-th " << name << ": " - << "expected " << attr_string(dattr) << ", got " - << attr_string((*vec)[i]); - } - }; - deduce(in_attrs, in_size, "input"); - - auto write = [&](std::vector *vec, size_t size, const char *name) { - for (size_t i = 0; i < size; ++i) { - CHECK(assign(&(*vec)[i], dattr)) - << "Incompatible attr in node " << attrs.name << " at " << i << "-th " - << name << ": " - << "expected " << attr_string(dattr) << ", got " - << attr_string((*vec)[i]); - } - }; - write(out_attrs, out_size, "output"); - - if (is_none(dattr)) { - return false; - } - return true; -} - -template -inline bool RegionShape(const NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - if (n_in != -1) { - CHECK_EQ(in_attrs->size(), static_cast(n_in)) - << " in operator " << attrs.name; - } - if (n_out != -1) { - CHECK_EQ(out_attrs->size(), static_cast(n_out)) - << " in operator " << attrs.name; - } - return RegionAttr( - attrs, in_attrs, out_attrs, TShape()); -} - -template -inline bool RegionType(const NodeAttrs &attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - if (n_in != -1) { - CHECK_EQ(in_attrs->size(), static_cast(n_in)) - << " in operator " << 
attrs.name; - } - if (n_out != -1) { - CHECK_EQ(out_attrs->size(), static_cast(n_out)) - << " in operator " << attrs.name; - } - return RegionAttr( - attrs, in_attrs, out_attrs, -1); -} -} // namespace top -} // namespace nnvm -#endif // NNVM_TOP_VISION_YOLO_REGION_H_ diff --git a/topi/include/topi/cuda/vision.h b/topi/include/topi/cuda/vision.h deleted file mode 100644 index 4dd8b7cee15d..000000000000 --- a/topi/include/topi/cuda/vision.h +++ /dev/null @@ -1,95 +0,0 @@ -/*! -* Copyright (c) 2018 by Contributors -* \file cuda/vision.h -* \brief CUDA schedule for vision operations -*/ -#ifndef TOPI_CUDA_VISION_H_ -#define TOPI_CUDA_VISION_H_ - -#include "tvm/tvm.h" -#include "tvm/build_module.h" -#include "topi/tags.h" -#include "topi/detail/array_utils.h" -#include "topi/contrib/cublas.h" -#include "topi/generic/extern.h" - -namespace topi { -using namespace tvm; -namespace cuda { -/*! -* \brief Create a CUDA schedule for region -* -* \param target The target to generate a schedule for. -* \param outs The output tensors. -* -* \return A schedule for the given ops. 
-*/ -inline Schedule schedule_region(const Target &target, const Array& outs) { - Array out_ops; - for (auto t : outs) { - out_ops.push_back(t->op); - } - auto s = create_schedule(out_ops); - auto output = outs[0]->op.output(0); - auto num_thread = 64; - - auto _schedule_softmax = [&](const Operation& softmax_op) { - auto softmax_inputs = softmax_op->InputTensors(); - auto softmax = softmax_inputs[0]; - auto max_elem = softmax_inputs[1]; - auto expsum = softmax_inputs[2]; - - auto block_x = tvm::thread_axis(Range(), "blockIdx.x"); - auto thread_x = tvm::thread_axis(Range(0, num_thread), "threadIdx.x"); - - s[max_elem].bind(max_elem->op.as()->axis[0], block_x); - auto k = expsum->op.as()->reduce_axis[0]; - IterVar ko, ki; - s[expsum].split(k, num_thread, &ko, &ki); - auto ef = s.rfactor(expsum, ki)[0]; - - s[expsum].bind(s[expsum]->op.as()->axis[0], block_x); - s[expsum].bind(s[expsum]->op.as()->reduce_axis[0], thread_x); - s[ef].compute_at(s[expsum], s[expsum]->op.as()->reduce_axis[0]); - - s[expsum].set_store_predicate(static_cast(thread_x) == 0); - IterVar tx, xi; - s[softmax_op].split_by_nparts(softmax_op.as()->axis[1], num_thread, &tx, &xi); - s[softmax_op].bind(tx, thread_x); - - return max_elem->op.as()->InputTensors()[0]; - }; - - std::function traverse; - traverse = [&](const Operation& op) { - // Inline all one-to-one-mapping operators except the last stage (output) - if (is_injective(op->tag)) { - if (!detail::contains(s->outputs, op)) { - s[op].compute_inline(); - } - for (auto tensor : op->InputTensors()) { - if (tensor->op->InputTensors().size() > 0) { - traverse(tensor->op); - } - } - } else if (op->tag == "softmax_output") { - auto tensor = _schedule_softmax(op); - if (tensor->op->InputTensors().size() > 0) { - traverse(tensor->op); - } - } else { - LOG(ERROR) << "Unsupported operator " << op->tag; - } - }; - - traverse(outs[0]->op); - auto k = output->op.as()->axis[0]; - IterVar bx, tx; - s[output].split(k, num_thread, &bx, &tx); - 
s[output].bind(bx, tvm::thread_axis(Range(), "blockIdx.x")); - s[output].bind(tx, tvm::thread_axis(Range(), "threadIdx.x")); - return s; -} -} // namespace cuda -} // namespace topi -#endif // TOPI_CUDA_VISION_H_ diff --git a/topi/include/topi/rocm/vision.h b/topi/include/topi/rocm/vision.h deleted file mode 100644 index 4178a180deb4..000000000000 --- a/topi/include/topi/rocm/vision.h +++ /dev/null @@ -1,33 +0,0 @@ -/*! -* Copyright (c) 2018 by Contributors -* \file rocm/vision.h -* \brief rocm schedule for region operation -*/ -#ifndef TOPI_ROCM_VISION_H_ -#define TOPI_ROCM_VISION_H_ - -#include "tvm/tvm.h" -#include "tvm/build_module.h" -#include "topi/tags.h" -#include "topi/detail/array_utils.h" -#include "topi/contrib/rocblas.h" -#include "topi/generic/extern.h" -#include "topi/cuda/vision.h" - -namespace topi { -using namespace tvm; -namespace rocm { -/*! -* \brief Create a rocm schedule for region -* -* \param target The target to generate a schedule for. -* \param outs The output tensors. -* -* \return A schedule for the given ops. -*/ -inline Schedule schedule_region(const Target &target, const Array& outs) { - return topi::cuda::schedule_region(target, outs); -} -} // namespace rocm -} // namespace topi -#endif // TOPI_ROCM_VISION_H_ diff --git a/topi/include/topi/vision/yolo/region.h b/topi/include/topi/vision/yolo/region.h deleted file mode 100644 index 7d303f445ac4..000000000000 --- a/topi/include/topi/vision/yolo/region.h +++ /dev/null @@ -1,81 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \brief Region op constructions - * \file vision/yolo/region.h - */ -#ifndef TOPI_VISION_YOLO_REGION_H_ -#define TOPI_VISION_YOLO_REGION_H_ - -#include -#include - -#include "topi/detail/constant_utils.h" -#include "topi/reduction.h" -#include "topi/tags.h" -#include "topi/transform.h" -#include "topi/nn/softmax.h" -#include "tvm/tvm.h" - - -namespace topi { -namespace vision { -namespace yolo { -using namespace tvm; -using namespace nn; - -/*! 
-* \brief region operation -* -* \param data The input tensor. Can be any dimension -* \param num Darknet layer parameter n -* \param classes number of classes in the yolo model -* \param coords Darknet layer parameter coords -* \param background Darknet layer parameter background -* \param l_softmax if true apply softmax -* \param name The name of the operation -* \param tag The tag to mark the operation -* -* \return A Tensor whose op member is the region operation -*/ -inline Tensor region(const Tensor &data, - int num, - int classes, - int coords, - int background, - int l_softmax, - std::string name = "tensor", - std::string tag = "region_output") { - auto input_shape = data->shape; - int split_size = classes + coords + 1; - Array intermediate_shape = {input_shape[0], - num, - split_size, - input_shape[2], - input_shape[3]}; - auto data_block = reshape(data, intermediate_shape); - Array split_indices; - for (int i = 1; i < split_size; ++i) { - split_indices.push_back(i); - } - Array split_res = split(data_block, split_indices, 2); - split_res.Set(0, sigmoid(split_res[0])); - split_res.Set(1, sigmoid(split_res[1])); - if (!background) { - split_res.Set(coords, sigmoid(split_res[coords])); - } - - if (l_softmax) { - int offset = coords + static_cast(!background); - Array softmax_input(split_res.begin() + offset, split_res.end()); - auto softmax_output = softmax(concatenate(softmax_input, 2), 2); - Array data_block_1(split_res.begin(), split_res.begin() + offset); - data_block_1.push_back(softmax_output); - split_res = data_block_1; - } - Tensor out = concatenate(split_res, 2); - return reshape(out, input_shape); -} -} // namespace yolo -} // namespace vision -} // namespace topi -#endif // TOPI_VISION_YOLO_REGION_H_ diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index abcbdb50074e..17497abc0d8b 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -61,24 +61,6 @@ def schedule_reorg(outs): cpp_target 
= cpp.TEST_create_target(target.target_name) return cpp.cuda.schedule_injective(cpp_target, outs) -@generic.schedule_region.register(["cuda", "gpu"]) -def schedule_region(outs): - """Schedule for region operator. - Parameters - ---------- - outs: Array of Tensor - The computation graph description of region - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for region. - """ - target = tvm.target.current_target(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.cuda.schedule_region(cpp_target, outs) - @generic.schedule_nms.register(["cuda", "gpu"]) def schedule_nms(outs): """Schedule for non-maximum suppression diff --git a/topi/python/topi/generic/vision.py b/topi/python/topi/generic/vision.py index 9a1e06aa30e8..76e8545bfc52 100644 --- a/topi/python/topi/generic/vision.py +++ b/topi/python/topi/generic/vision.py @@ -17,23 +17,6 @@ def _default_schedule(outs, auto_inline): s[x].fuse(s[x].op.axis) return s -@tvm.target.generic_func -def schedule_shortcut(outs): - """Schedule for shortcut - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of shortcut - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for the op. - """ - return _default_schedule(outs, False) - @tvm.target.generic_func def schedule_reorg(outs): """Schedule for reorg @@ -53,25 +36,6 @@ def schedule_reorg(outs): cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) -@tvm.target.generic_func -def schedule_region(outs): - """Schedule for region - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of region - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for the op. 
- """ - target = tvm.target.current_target(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.generic.default_schedule(cpp_target, outs, False) - @tvm.target.generic_func def schedule_nms(outs): """Schedule for non-maximum suppression diff --git a/topi/python/topi/rocm/__init__.py b/topi/python/topi/rocm/__init__.py index 96a04794c680..9440b5c94bda 100644 --- a/topi/python/topi/rocm/__init__.py +++ b/topi/python/topi/rocm/__init__.py @@ -4,5 +4,4 @@ from .conv2d import * from .dense import * -from .vision import * from .nn import * diff --git a/topi/python/topi/rocm/vision.py b/topi/python/topi/rocm/vision.py deleted file mode 100644 index 84ae436e3531..000000000000 --- a/topi/python/topi/rocm/vision.py +++ /dev/null @@ -1,25 +0,0 @@ -# pylint: disable=invalid-name, unused-variable -"""Schedule for vision operator""" -from __future__ import absolute_import as _abs -import tvm -from .. import generic -from .. import cpp - -@generic.schedule_region.register(["rocm"]) -def schedule_region(outs): - """Schedule for region operator. - - Parameters - ---------- - outs: Array of Tensor - The computation graph description of region - in the format of an array of tensors. - - Returns - ------- - s: Schedule - The computation schedule for region. 
- """ - target = tvm.target.current_target(allow_none=False) - cpp_target = cpp.TEST_create_target(target.target_name) - return cpp.rocm.schedule_region(cpp_target, outs) diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 5ea9683f72ef..81dd379257e0 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -14,9 +14,7 @@ from .upsampling_python import upsampling_python from .bilinear_resize_python import bilinear_resize_python from .reorg_python import reorg_python -from .region_python import region_python from .roi_align_python import roi_align_nchw_python -from .shortcut_python import shortcut_python from .lrn_python import lrn_python from .l2_normalize_python import l2_normalize_python from .gather_nd_python import gather_nd_python diff --git a/topi/python/topi/testing/region_python.py b/topi/python/topi/testing/region_python.py deleted file mode 100644 index 3bab53892607..000000000000 --- a/topi/python/topi/testing/region_python.py +++ /dev/null @@ -1,69 +0,0 @@ -# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals -"""Region in python""" -import numpy as np - -def entry_index(batch, w, h, outputs, classes, coords, location, entry): - n = int(location/(w*h)) - loc = location%(w*h) - return batch*outputs + n*w*h*(coords+classes+1) + entry*w*h + loc - -def region_python(a_np, N, classes, coords, background, softmax): - """Region operator - Parameters - ---------- - a_np : numpy.ndarray - 4-D with shape [batch, in_channel, in_height, in_width] - - N : int - Darknet layer parameter n - - classes : int - Darknet layer parameter classes - - coords : int - Darknet layer parameter coords - - background : int - Darknet layer parameter background - - softmax : int - Darknet layer parameter softmax - - Returns - ------- - b_np : np.ndarray - 4-D with shape [batch, out_channel, out_height, out_width] - """ - - batch, in_channel, in_height, in_width = a_np.shape - 
a_np_temp = np.reshape(a_np, batch*in_channel*in_height*in_width) - outputs = batch*in_channel*in_height*in_width - b_np = np.zeros(batch*in_channel*in_height*in_width) - for i in range(batch*in_channel*in_height*in_width): - b_np[i] = a_np_temp[i] - for b in range(batch): - for n in range(N): - index = entry_index(b, in_width, in_height, outputs, classes, coords, n*in_width*in_height, 0) - b_np[index: index+2*in_width*in_height] = 1/(1+np.exp(-1*b_np[index: index+2*in_width*in_height])) - index = entry_index(b, in_width, in_height, outputs, classes, coords, n*in_width*in_height, coords) - if not background: - b_np[index: index+in_width*in_height] = 1/(1+np.exp(-1*b_np[index: index+in_width*in_height])) - - b_np = np.reshape(b_np, (batch, in_channel, in_height, in_width)) - def local_softmax(data_in): - data_c, data_h, data_w = data_in.shape - largest = np.max(data_in, axis=1) - data_out = np.zeros((data_c, data_h, data_w)) - for i in range(data_h): - for j in range(data_w): - data_out[:, i, j] = np.exp(data_in[:, i, j] - largest[i, j]) - return data_out/data_out.sum(axis=0) - - if softmax: - index = coords + int(not background) - for b in range(batch): - for i in range(N): - b_np_index = int(i*(in_channel/N) + index) - b_np[b, b_np_index: b_np_index + classes+background, :, :] = local_softmax(b_np[b, b_np_index:b_np_index + classes+background, :, :]) - - return b_np diff --git a/topi/python/topi/testing/shortcut_python.py b/topi/python/topi/testing/shortcut_python.py deleted file mode 100644 index 575c28b61c2c..000000000000 --- a/topi/python/topi/testing/shortcut_python.py +++ /dev/null @@ -1,47 +0,0 @@ -# pylint: disable=invalid-name, line-too-long, unused-variable, too-many-locals -"""Shortcut in python""" -import numpy as np - -def shortcut_python(a_np1, a_np2): - """Reorg operator - - Parameters - ---------- - a_np1 : numpy.ndarray - 4-D with shape [batch1, in_channel1, in_height1, in_width1] - - a_np2 : numpy.ndarray - 4-D with shape [batch2, in_channel2, 
in_height2, in_width2] - - Returns - ------- - b_np : np.ndarray - 4-D with shape [batch1, out_channel1, out_height1, out_width1] - """ - - batch1, in_channel1, in_height1, in_width1 = a_np1.shape - batch2, in_channel2, in_height2, in_width2 = a_np2.shape - a_np1_temp = np.reshape(a_np1, batch1*in_channel1*in_height1*in_width1) - a_np2_temp = np.reshape(a_np2, batch2*in_channel2*in_height2*in_width2) - b_np = np.zeros(batch1*in_channel1*in_height1*in_width1) - stride = int(in_width1/in_width2) - sample = int(in_width2/in_width1) - if stride < 1: - stride = 1 - if sample < 1: - sample = 1 - minw = min(in_width1, in_width2) - minh = min(in_height1, in_height2) - minc = min(in_channel1, in_channel2) - - for i in range((batch1*in_channel1*in_height1*in_width1)): - b_np[i] = a_np1_temp[i] - for b in range(batch1): - for k in range(minc): - for j in range(minh): - for i in range(minw): - out_index = i*sample + in_width2*(j*sample + in_height2*(k + in_channel2*b)) - add_index = i*stride + in_width1*(j*stride + in_height1*(k + in_channel1*b)) - b_np[out_index] = a_np1_temp[out_index] + a_np2_temp[add_index] - b_np = np.reshape(b_np, (batch1, in_channel1, in_height1, in_width1)) - return b_np diff --git a/topi/python/topi/vision/__init__.py b/topi/python/topi/vision/__init__.py index e3aa847972ac..c10f7c68bf36 100644 --- a/topi/python/topi/vision/__init__.py +++ b/topi/python/topi/vision/__init__.py @@ -2,8 +2,7 @@ """VISION network operators""" from __future__ import absolute_import as _abs -from . import yolo, ssd -from .shortcut import * +from . import ssd from .reorg import * from .nms import * from .rcnn import * diff --git a/topi/python/topi/vision/shortcut.py b/topi/python/topi/vision/shortcut.py deleted file mode 100644 index 529360190a4e..000000000000 --- a/topi/python/topi/vision/shortcut.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Shortcut operators (short-cut connections).""" -from __future__ import absolute_import as _abs -import tvm -from .. import util -from .. 
import transform - -@tvm.target.generic_func -def shortcut(inp1, inp2): - """Shortcut forward operators. - - Parameters - ---------- - First Input : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - Second Input : tvm.Tensor - 4-D with shape [batch, in_channel, in_height, in_width] - - Returns - ------- - Output : tvm.Tensor - 4-D with shape [batch, out_channel, out_height, out_width] - """ - - _, inp1_c, inp1_h, inp1_w = util.get_const_tuple(inp1.shape) - batch, inp2_c, inp2_h, inp2_w = util.get_const_tuple(inp2.shape) - - stride = int(max(inp2_w / inp1_w, 1)) - sample = int(max(inp1_w / inp2_w, 1)) - minc = min(inp2_c, inp1_c) - minh = min(inp2_h, inp1_h) - minw = min(inp2_w, inp1_w) - - out = tvm.compute((batch, minc, minh, minw), lambda b, c, h, w: - inp1[b, c, h * sample, w * sample] + - inp2[b, c, h * stride, w * stride], - tag="shortcut") - - split_indices = int(inp1_c / minc) - if split_indices > 1: - split_res = transform.split(inp1, split_indices, 1) - split_res[0] = out - out = transform.concatenate(split_res, 1) - - return out diff --git a/topi/python/topi/vision/yolo/__init__.py b/topi/python/topi/vision/yolo/__init__.py deleted file mode 100644 index c0e9899a41aa..000000000000 --- a/topi/python/topi/vision/yolo/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# pylint: disable=wildcard-import -"""VISION network operators""" -from __future__ import absolute_import as _abs - -from .region import * diff --git a/topi/python/topi/vision/yolo/region.py b/topi/python/topi/vision/yolo/region.py deleted file mode 100644 index 77c1c86a8d06..000000000000 --- a/topi/python/topi/vision/yolo/region.py +++ /dev/null @@ -1,39 +0,0 @@ -# pylint: disable=invalid-name, unused-variable -""" -REGION Operator -==================== -Region operator, used in darknet. -""" -from __future__ import absolute_import as _abs -import tvm -from ... 
import cpp - -@tvm.target.generic_func -def region(data, num, classes, coords, background, softmax=True): - """Region forward operators. - Parameters - ---------- - data : tvm.Tensor - 4-D with shape [batch, c_in, h_in, w_in] - - num : int - Darknet layer parameter n - - classes : int - Darknet layer parameter classes - - coords : int - Darknet layer parameter coords - - background : int - Darknet layer parameter background - - softmax : boolean - Darknet layer parameter softmax - - Returns - ------- - out : tvm.Tensor - 4-D with shape [batch, c_in, h_in, w_in] - """ - return cpp.yolo.region(data, num, classes, coords, background, softmax) diff --git a/topi/src/topi.cc b/topi/src/topi.cc index e0f16239d561..e3fec08cb491 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -30,7 +30,6 @@ #include #include -#include #include #include #include @@ -41,7 +40,6 @@ #include #include #include -#include #include #include @@ -49,7 +47,6 @@ #include #include -#include #include namespace topi { @@ -416,11 +413,6 @@ TVM_REGISTER_GLOBAL("topi.vision.reorg") *rv = vision::reorg(args[0], args[1]); }); -TVM_REGISTER_GLOBAL("topi.vision.yolo.region") -.set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = vision::yolo::region(args[0], args[1], args[2], args[3], args[4], args[5]); - }); - /* Ops from image/resize.h */ TVM_REGISTER_GLOBAL("topi.image.bilinear_sample_nchw") .set_body([](TVMArgs args, TVMRetValue *rv) { @@ -488,11 +480,6 @@ TVM_REGISTER_GLOBAL("topi.rocm.schedule_dense") *rv = topi::rocm::schedule_dense(args[0], args[1]); }); -TVM_REGISTER_GLOBAL("topi.rocm.schedule_region") -.set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = topi::rocm::schedule_region(args[0], args[1]); - }); - TVM_REGISTER_GLOBAL("topi.rocm.schedule_lrn") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = topi::rocm::schedule_lrn(args[0], args[1]); @@ -544,11 +531,6 @@ TVM_REGISTER_GLOBAL("topi.cuda.schedule_softmax") *rv = topi::cuda::schedule_softmax(args[0], args[1]); }); 
-TVM_REGISTER_GLOBAL("topi.cuda.schedule_region") -.set_body([](TVMArgs args, TVMRetValue *rv) { - *rv = topi::cuda::schedule_region(args[0], args[1]); - }); - TVM_REGISTER_GLOBAL("topi.cuda.schedule_lrn") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = topi::cuda::schedule_lrn(args[0], args[1]); diff --git a/topi/tests/python/test_topi_region.py b/topi/tests/python/test_topi_region.py deleted file mode 100644 index 3357382b232e..000000000000 --- a/topi/tests/python/test_topi_region.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Example code to do region.""" -import numpy as np -import topi -from topi.util import get_const_tuple -import tvm -import topi.testing - -def verify_region(batch, in_size, in_channel, n, classes, coords, background, l_softmax): - '''Verify region operator by comparing outputs from tvm and numpy implementation''' - in_height = in_width = in_size - - A = tvm.placeholder((batch, in_channel, in_height, in_width), name='A') - B = topi.vision.yolo.region(A, n, classes, coords, background, l_softmax) - - a_shape = get_const_tuple(A.shape) - dtype = A.dtype - - def get_ref_data_region(): - a_np = np.random.uniform(size=a_shape).astype(dtype) - b_np = topi.testing.region_python(a_np, n, classes, coords, background, l_softmax) - return a_np, b_np - - a_np, b_np = get_ref_data_region() - def check_device(device): - '''Cheching devices is enabled or not''' - ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return - print("Running on target: %s" % device) - with tvm.target.create(device): - if device == 'llvm': - s = topi.generic.vision.schedule_region([B]) - else: - s = topi.cuda.vision.schedule_region([B]) - a = tvm.nd.array(a_np, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - func = tvm.build(s, [A, B], device) - func(a, b) - tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - - for device in ['llvm', 'cuda']: - check_device(device) - -def test_region(): - 
verify_region(1, 19, 425, 5, 80, 4, 0, 1) - -if __name__ == "__main__": - test_region() diff --git a/topi/tests/python/test_topi_shortcut.py b/topi/tests/python/test_topi_shortcut.py deleted file mode 100644 index f89aa46a1e66..000000000000 --- a/topi/tests/python/test_topi_shortcut.py +++ /dev/null @@ -1,48 +0,0 @@ -"""Example code to do shortcut.""" -import numpy as np -import topi -from topi.util import get_const_tuple -import tvm - -def verify_shortcut(batch, in_size, in_channel): - '''Verify shortcut operator by comparing outputs from tvm and numpy implementation''' - in_height = in_width = in_size - - A1 = tvm.placeholder((batch, in_channel, in_height, in_width), name='A1') - A2 = tvm.placeholder((batch, in_channel, in_height, in_width), name='A2') - B = topi.vision.shortcut(A1, A2) - - a_shape = get_const_tuple(A1.shape) - dtype = A1.dtype - def get_ref_data_shortcut(): - a_np1 = np.random.uniform(size=a_shape).astype(dtype) - a_np2 = np.random.uniform(size=a_shape).astype(dtype) - b_np = topi.testing.shortcut_python(a_np1, a_np2) - return a_np1, a_np2, b_np - - a_np1, a_np2, b_np = get_ref_data_shortcut() - def check_device(device): - '''Cheching devices is enabled or not''' - ctx = tvm.context(device, 0) - if not ctx.exist: - print("Skip because %s is not enabled" % device) - return - print("Running on target: %s" % device) - with tvm.target.create(device): - s = topi.generic.schedule_injective([B]) - - a1 = tvm.nd.array(a_np1, ctx) - a2 = tvm.nd.array(a_np2, ctx) - b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) - func = tvm.build(s, [A1, A2, B], device) - func(a1, a2, b) - tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) - - for device in ['llvm', 'cuda']: - check_device(device) - -def test_shortcut(): - verify_shortcut(1, 144, 32) - -if __name__ == "__main__": - test_shortcut() From d85e7807c1c2df03cb8f68a41c8571814942619c Mon Sep 17 00:00:00 2001 From: Siju Date: Wed, 27 Feb 2019 09:49:45 +0530 Subject: [PATCH 26/93] 
yolo reorg op for relay (#1941) --- docs/langref/relay_op.rst | 2 + include/tvm/relay/attrs/vision.h | 11 ++++ python/tvm/relay/op/vision/__init__.py | 2 + python/tvm/relay/op/vision/_yolo.py | 9 +++ python/tvm/relay/op/vision/yolo.py | 34 +++++++++++ src/relay/op/vision/yolo.cc | 78 ++++++++++++++++++++++++++ tests/python/relay/test_op_level5.py | 37 +++++++++++- 7 files changed, 172 insertions(+), 1 deletion(-) create mode 100644 python/tvm/relay/op/vision/_yolo.py create mode 100644 python/tvm/relay/op/vision/yolo.py create mode 100644 src/relay/op/vision/yolo.cc diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index d58ba2e66621..e2da42b6ab32 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -135,6 +135,7 @@ This level enables additional math and transform operators. tvm.relay.vision.multibox_prior tvm.relay.vision.multibox_transform_loc tvm.relay.vision.nms + tvm.relay.vision.yolo_reorg **Level 10: Temporary Operators** @@ -251,6 +252,7 @@ Level 5 Definitions .. autofunction:: tvm.relay.vision.multibox_prior .. autofunction:: tvm.relay.vision.multibox_transform_loc .. autofunction:: tvm.relay.vision.nms +.. autofunction:: tvm.relay.vision.yolo_reorg Level 10 Definitions diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index d1a5ea41bc69..73b7339e2edb 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -98,6 +98,17 @@ struct ROIAlignAttrs : public tvm::AttrsNode { } }; +/*! 
\brief Attributes used in yolo reorg operators */ +struct YoloReorgAttrs : public tvm::AttrsNode { + Integer stride; + + TVM_DECLARE_ATTRS(YoloReorgAttrs, "relay.attrs.YoloReorgAttrs") { + TVM_ATTR_FIELD(stride) + .set_default(1) + .describe("Stride value for yolo reorg"); + } +}; + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_VISION_H_ diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py index 710adfeb4955..10cf6c2fd3ee 100644 --- a/python/tvm/relay/op/vision/__init__.py +++ b/python/tvm/relay/op/vision/__init__.py @@ -5,5 +5,7 @@ from .multibox import * from .nms import * from .rcnn import * +from .yolo import * from . import _multibox from . import _rcnn +from . import _yolo diff --git a/python/tvm/relay/op/vision/_yolo.py b/python/tvm/relay/op/vision/_yolo.py new file mode 100644 index 000000000000..749ebfa26dd0 --- /dev/null +++ b/python/tvm/relay/op/vision/_yolo.py @@ -0,0 +1,9 @@ +#pylint: disable=invalid-name, unused-argument +"""Backend compiler related feature registration""" +from __future__ import absolute_import +from ..op import register_schedule, register_pattern +from ..op import schedule_injective, OpPattern + +# reorg +register_pattern("vision.yolo_reorg", OpPattern.INJECTIVE) +register_schedule("vision.yolo_reorg", schedule_injective) diff --git a/python/tvm/relay/op/vision/yolo.py b/python/tvm/relay/op/vision/yolo.py new file mode 100644 index 000000000000..71b7918dca0f --- /dev/null +++ b/python/tvm/relay/op/vision/yolo.py @@ -0,0 +1,34 @@ +"""Yolo operations.""" +from . import _make + +def yolo_reorg(data, stride): + """Yolo reorg operation used in darknet models. + This layer shuffles the input tensor values based on the stride value. + Along with the shuffling, it does the shape transform. 
+ If '(n, c, h, w)' is the data shape and 's' is stride, output shape is '(n, c*s*s, h/s, w/s)' + Example: data(1, 4, 2, 2) = [[[[ 0 1] [ 2 3]] + [[ 4 5] [ 6 7]] + [[ 8 9] [10 11]] + [[12 13] [14 15]]]] + stride = 2 + ret(1, 16, 1, 1) = [[[[ 0]] [[ 2]] [[ 8]] [[10]] + [[ 1]] [[ 3]] [[ 9]] [[11]] + [[ 4]] [[ 6]] [[12]] [[14]] + [[ 5]] [[ 7]] [[13]] [[15]]]] + + Note: stride=1 has no significance for reorg operation. + + Parameters + ---------- + data : relay.Expr + The input data tensor. + + stride : int + The stride value for reorganisation. + + Returns + ------- + ret : relay.Expr + The computed result. + """ + return _make.yolo_reorg(data, stride) diff --git a/src/relay/op/vision/yolo.cc b/src/relay/op/vision/yolo.cc new file mode 100644 index 000000000000..b826d4c6e8e2 --- /dev/null +++ b/src/relay/op/vision/yolo.cc @@ -0,0 +1,78 @@ +/*! + * Copyright (c) 2018 by Contributors + * \file yolo.cc + * \brief Yolo related operators + */ +#include +#include +#include +#include +#include "../op_common.h" +#include "../type_relations.h" + +namespace tvm { +namespace relay { + +TVM_REGISTER_NODE_TYPE(YoloReorgAttrs); + +/*! +* \brief YoloReorgRel Output type and shape relation evaluation function. +* \param num_inputs Number of input types in the args. +* \param attrs The additional attributes of the operator. +* \param reporter The reporter to report solution to. +* \return false if This relation cannot be resolved. true if this relation has been resolved. 
+*/ +bool YoloReorgRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) return false; + + const YoloReorgAttrs* param = attrs.as(); + CHECK(param != nullptr); + + CHECK(data->shape.size() == 4) << "Yolo reorg supports only 4 dimension."; + std::vector&& oshape = AsVector(data->shape); + oshape[1] = oshape[1] * param->stride * param->stride; + oshape[2] = oshape[2] / param->stride; + oshape[3] = oshape[3] / param->stride; + reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype)); + return true; +} + +Expr MakeYoloReorg(Expr data, + Integer stride) { + auto attrs = make_node(); + attrs->stride = stride; + static const Op& op = Op::Get("vision.yolo_reorg"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + + +TVM_REGISTER_API("relay.op.vision._make.yolo_reorg") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeYoloReorg, args, rv); +}); + + +RELAY_REGISTER_OP("vision.yolo_reorg") +.describe(R"doc("Yolo reorg operation. This layer reorganize the output. 
+Its function is mostly shape transform.")doc" TVM_ADD_FILELINE) +.add_argument("data", "Tensor", "The input tensor.") +.set_num_inputs(1) +.set_support_level(5) +.set_attrs_type_key("relay.attrs.YoloReorgAttrs") +.add_type_rel("YoloReorg", YoloReorgRel) +.set_attr("FTVMCompute", [](const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const auto* params = attrs.as(); + CHECK(params != nullptr); + return Array{ topi::vision::reorg(inputs[0], params->stride) }; +}); + +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 1d91d92a6abc..8db6d747ef5e 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -7,7 +7,6 @@ from tvm.relay.testing import ctx_list import topi.testing - def test_resize_infer_type(): n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") x = relay.var("x", relay.TensorType((n, c, h, w), "int8")) @@ -307,6 +306,40 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ verify_roi_align((4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2) +def test_yolo_reorg_infer_shape(): + def verify_yolo_reorg(shape, stride, out_shape): + x = relay.var("x", relay.TensorType(shape, "float32")) + z = relay.vision.yolo_reorg(x, stride=stride) + zz = relay.ir_pass.infer_type(z) + assert "stride=" in z.astext() + assert zz.checked_type == relay.ty.TensorType(out_shape, "float32") + + n, c, h, w = tvm.var("n"), tvm.var("c"), tvm.var("h"), tvm.var("w") + verify_yolo_reorg((n, c, 20, 20), 10, (n, c*10*10, 2, 2)) + verify_yolo_reorg((n, c, h, w), 2, (n, c*2*2, h/2, w/2)) + +def test_yolo_reorg(): + def verify_yolo_reorg(shape, stride): + x_data = np.random.uniform(low=-1, high=1, size=shape).astype("float32") + ref_res = topi.testing.reorg_python(x_data, stride) + + x = relay.var("x", relay.TensorType(shape, "float32")) + z = 
relay.vision.yolo_reorg(x, stride=stride) + zz = relay.ir_pass.infer_type(z) + assert "stride=" in z.astext() + assert zz.checked_type == relay.ty.TensorType(ref_res.shape, "float32") + + func = relay.Function([x], z) + + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) + + verify_yolo_reorg((1, 100, 20, 20), 10) + verify_yolo_reorg((1, 4, 6, 6), 2) + if __name__ == "__main__": test_resize_infer_type() test_resize() @@ -314,3 +347,5 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ test_multibox_transform_loc() test_nms() test_roi_align() + test_yolo_reorg_infer_shape() + test_yolo_reorg() From 8332af8b19078aa3788c8267fa515b3a56344876 Mon Sep 17 00:00:00 2001 From: "Steven S. Lyubomirsky" Date: Tue, 26 Feb 2019 20:38:50 -0800 Subject: [PATCH 27/93] [Relay] Ensure nested higher-order functions are treated correctly (#2676) --- python/tvm/relay/prelude.py | 4 +-- src/relay/pass/type_infer.cc | 29 ++++++++++++++- tests/python/relay/test_type_infer.py | 52 +++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/prelude.py b/python/tvm/relay/prelude.py index 034b58ef1c7e..41d1be284f8e 100644 --- a/python/tvm/relay/prelude.py +++ b/python/tvm/relay/prelude.py @@ -394,9 +394,7 @@ def define_iterate(self): f = Var("f", FuncType([a], a)) x = Var("x", self.nat()) y = Var("y", self.nat()) - z = Var("z") - z_case = Clause(PatternConstructor(self.z), Function([z], z)) - # todo: fix typechecker so Function([z], z) can be replaced by self.id + z_case = Clause(PatternConstructor(self.z), self.id) s_case = Clause(PatternConstructor(self.s, [PatternVar(y)]), self.compose(f, self.iterate(f, y))) self.mod[self.iterate] = Function([f, x], diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc index 
b6bdedc04473..8dd02f39adce 100644 --- a/src/relay/pass/type_infer.cc +++ b/src/relay/pass/type_infer.cc @@ -121,7 +121,17 @@ class TypeInferencer : private ExprFunctor, Type Unify(const Type& t1, const Type& t2, const NodeRef& expr) { // TODO(tqchen, jroesch): propagate span to solver try { - return solver_.Unify(t1, t2, expr); + // instantiate higher-order func types when unifying because + // we only allow polymorphism at the top level + Type first = t1; + Type second = t2; + if (auto* ft1 = t1.as()) { + first = InstantiateFuncType(ft1); + } + if (auto* ft2 = t2.as()) { + second = InstantiateFuncType(ft2); + } + return solver_.Unify(first, second, expr); } catch (const dmlc::Error &e) { this->ReportFatalError( expr, @@ -351,6 +361,20 @@ class TypeInferencer : private ExprFunctor, return Downcast(inst_ty); } + // instantiates starting from incompletes + FuncType InstantiateFuncType(const FuncTypeNode* fn_ty) { + if (fn_ty->type_params.size() == 0) { + return GetRef(fn_ty); + } + + Array type_args; + for (size_t i = 0; i < fn_ty->type_params.size(); i++) { + type_args.push_back(IncompleteTypeNode::make(Kind::kType)); + } + return InstantiateFuncType(fn_ty, type_args); + } + + void AddTypeArgs(const Expr& expr, Array type_args) { auto type_info = type_map_.find(expr); if (type_info == type_map_.end()) { @@ -464,6 +488,9 @@ class TypeInferencer : private ExprFunctor, arg_types.push_back(GetType(param)); } Type rtype = GetType(f->body); + if (auto* ft = rtype.as()) { + rtype = InstantiateFuncType(ft); + } if (f->ret_type.defined()) { rtype = this->Unify(f->ret_type, rtype, GetRef(f)); } diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py index 05f8b8fd22f9..8c8e7dfd1fcc 100644 --- a/tests/python/relay/test_type_infer.py +++ b/tests/python/relay/test_type_infer.py @@ -133,6 +133,58 @@ def test_incomplete_call(): assert ft.checked_type == relay.FuncType([tt, f_type], tt) +def test_higher_order_argument(): + a = relay.TypeVar('a') + 
x = relay.Var('x', a) + id_func = relay.Function([x], x, a, [a]) + + b = relay.TypeVar('b') + f = relay.Var('f', relay.FuncType([b], b)) + y = relay.Var('y', b) + ho_func = relay.Function([f, y], f(y), b, [b]) + + # id func should be an acceptable argument to the higher-order + # function even though id_func takes a type parameter + ho_call = ho_func(id_func, relay.const(0, 'int32')) + + hc = relay.ir_pass.infer_type(ho_call) + expected = relay.scalar_type('int32') + assert hc.checked_type == expected + + +def test_higher_order_return(): + a = relay.TypeVar('a') + x = relay.Var('x', a) + id_func = relay.Function([x], x, a, [a]) + + b = relay.TypeVar('b') + nested_id = relay.Function([], id_func, relay.FuncType([b], b), [b]) + + ft = relay.ir_pass.infer_type(nested_id) + assert ft.checked_type == relay.FuncType([], relay.FuncType([b], b), [b]) + + +def test_higher_order_nested(): + a = relay.TypeVar('a') + x = relay.Var('x', a) + id_func = relay.Function([x], x, a, [a]) + + choice_t = relay.FuncType([], relay.scalar_type('bool')) + f = relay.Var('f', choice_t) + + b = relay.TypeVar('b') + z = relay.Var('z') + top = relay.Function( + [f], + relay.If(f(), id_func, relay.Function([z], z)), + relay.FuncType([b], b), + [b]) + + expected = relay.FuncType([choice_t], relay.FuncType([b], b), [b]) + ft = relay.ir_pass.infer_type(top) + assert ft.checked_type == expected + + def test_tuple(): tp = relay.TensorType((10,)) x = relay.var("x", tp) From abad345cf264a1ee6b0cf2f50abc96cec1bc081d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Tue, 26 Feb 2019 20:40:06 -0800 Subject: [PATCH 28/93] [Relay] add more descriptive error for checked_type (#2652) --- include/tvm/relay/expr.h | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/include/tvm/relay/expr.h b/include/tvm/relay/expr.h index b9a57c5d4618..06a1aa1ac9ef 100644 --- a/include/tvm/relay/expr.h +++ b/include/tvm/relay/expr.h @@ -34,12 +34,7 @@ 
class ExprNode : public RelayNode { /*! * \return The checked_type */ - const Type& checked_type() const { - CHECK(checked_type_.defined()) << "internal error: the type checker has " - "not populated the checked_type " - "field for this node"; - return this->checked_type_; - } + const Type& checked_type() const; /*! * \brief Check if the inferred(checked) type of the Expr * is backed by a TTypeNode and return it. @@ -235,8 +230,8 @@ class FunctionNode : public ExprNode { v->Visit("body", &body); v->Visit("ret_type", &ret_type); v->Visit("type_params", &type_params); - v->Visit("span", &span); v->Visit("attrs", &attrs); + v->Visit("span", &span); v->Visit("_checked_type_", &checked_type_); } @@ -527,6 +522,14 @@ class TempExprNode : public ExprNode { RELAY_DEFINE_NODE_REF(TempExpr, TempExprNode, Expr); // implementataions +inline const Type& ExprNode::checked_type() const { + CHECK(checked_type_.defined()) << "internal error: the type checker has " + "not populated the checked_type " + "field for " + << GetRef(this); + return this->checked_type_; +} + template inline const TTypeNode* ExprNode::type_as() const { static_assert(std::is_base_of::value, From 38794e143309aba3955330ba85ec0952b9ecab94 Mon Sep 17 00:00:00 2001 From: Logan Weber <36520469+weberlo@users.noreply.github.com> Date: Tue, 26 Feb 2019 21:15:39 -0800 Subject: [PATCH 29/93] [Relay] Port param dict save/load from NNVM (#2620) --- python/tvm/api.py | 2 +- python/tvm/relay/__init__.py | 5 ++ python/tvm/relay/param_dict.py | 60 ++++++++++++++++++ src/relay/backend/interpreter.cc | 5 ++ src/relay/backend/param_dict.cc | 87 +++++++++++++++++++++++++++ src/relay/backend/param_dict.h | 43 +++++++++++++ tests/python/relay/test_param_dict.py | 78 ++++++++++++++++++++++++ 7 files changed, 279 insertions(+), 1 deletion(-) create mode 100644 python/tvm/relay/param_dict.py create mode 100644 src/relay/backend/param_dict.cc create mode 100644 src/relay/backend/param_dict.h create mode 100644 
tests/python/relay/test_param_dict.py diff --git a/python/tvm/api.py b/python/tvm/api.py index 10a97171e58f..514490ae83ea 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -136,7 +136,7 @@ def load_json(json_str): def save_json(node): - """Load tvm object as json string. + """Save tvm object as json string. Parameters ---------- diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index fe00877c0fb0..6d44d07f4bbf 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -13,6 +13,7 @@ from . import prelude from . import parser from . import debug +from . import param_dict # Root operators from .op import Op @@ -85,3 +86,7 @@ # Parser fromtext = parser.fromtext + +# Param Serialization +save_param_dict = param_dict.save_param_dict +load_param_dict = param_dict.load_param_dict diff --git a/python/tvm/relay/param_dict.py b/python/tvm/relay/param_dict.py new file mode 100644 index 000000000000..f7647beadeb2 --- /dev/null +++ b/python/tvm/relay/param_dict.py @@ -0,0 +1,60 @@ +# pylint: disable=invalid-name +"""Helper utility to save parameter dicts.""" +import tvm + +_save_param_dict = tvm.get_global_func("tvm.relay._save_param_dict") +_load_param_dict = tvm.get_global_func("tvm.relay._load_param_dict") + +def save_param_dict(params): + """Save parameter dictionary to binary bytes. + + The result binary bytes can be loaded by the + GraphModule with API "load_params". + + Parameters + ---------- + params : dict of str to NDArray + The parameter dictionary. + + Returns + ------- + param_bytes: bytearray + Serialized parameters. + + Examples + -------- + .. code-block:: python + + # compile and save the modules to file. + graph, lib, params = tvm.relay.build(func, target=target, params=params) + module = graph_runtime.create(graph, lib, tvm.gpu(0)) + # save the parameters as byte array + param_bytes = tvm.relay.save_param_dict(params) + # We can serialize the param_bytes and load it back later. 
+ # Pass in byte array to module to directly set parameters + module.load_params(param_bytes) + """ + args = [] + for k, v in params.items(): + args.append(k) + args.append(tvm.nd.array(v)) + return _save_param_dict(*args) + + +def load_param_dict(param_bytes): + """Load parameter dictionary to binary bytes. + + Parameters + ---------- + param_bytes: bytearray + Serialized parameters. + + Returns + ------- + params : dict of str to NDArray + The parameter dictionary. + """ + if isinstance(param_bytes, (bytes, str)): + param_bytes = bytearray(param_bytes) + load_arr = _load_param_dict(param_bytes) + return {v.name : v.array for v in load_arr} diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 4ef893f463e9..3128d2a71159 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -578,5 +578,10 @@ TVM_REGISTER_API("relay.backend.CreateInterpreter") .set_body([](TVMArgs args, TVMRetValue* ret) { *ret = CreateInterpreter(args[0], args[1], args[2]); }); + +TVM_REGISTER_NODE_TYPE(ClosureNode); +TVM_REGISTER_NODE_TYPE(TupleValueNode); +TVM_REGISTER_NODE_TYPE(TensorValueNode); + } // namespace relay } // namespace tvm diff --git a/src/relay/backend/param_dict.cc b/src/relay/backend/param_dict.cc new file mode 100644 index 000000000000..87d3dd373e83 --- /dev/null +++ b/src/relay/backend/param_dict.cc @@ -0,0 +1,87 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file param_dict.cc + * \brief Implementation and registration of parameter dictionary + * serializing/deserializing functions. + */ +#include "param_dict.h" + +#include + +#include +#include + +namespace tvm { +namespace relay { + +using namespace runtime; + +TVM_REGISTER_GLOBAL("tvm.relay._save_param_dict") +.set_body([](TVMArgs args, TVMRetValue *rv) { + CHECK_EQ(args.size() % 2, 0u); + // `args` is in the form "key, value, key, value, ..." 
+ size_t num_params = args.size() / 2; + std::vector names; + names.reserve(num_params); + std::vector arrays; + arrays.reserve(num_params); + for (size_t i = 0; i < num_params * 2; i += 2) { + names.emplace_back(args[i].operator std::string()); + arrays.emplace_back(args[i + 1].operator DLTensor*()); + } + std::string bytes; + dmlc::MemoryStringStream strm(&bytes); + dmlc::Stream* fo = &strm; + uint64_t header = kTVMNDArrayListMagic, reserved = 0; + fo->Write(header); + fo->Write(reserved); + fo->Write(names); + { + uint64_t sz = static_cast(arrays.size()); + fo->Write(sz); + for (size_t i = 0; i < sz; ++i) { + tvm::runtime::SaveDLTensor(fo, arrays[i]); + } + } + TVMByteArray arr; + arr.data = bytes.c_str(); + arr.size = bytes.length(); + *rv = arr; + }); + +TVM_REGISTER_GLOBAL("tvm.relay._load_param_dict") +.set_body([](TVMArgs args, TVMRetValue *rv) { + std::string bytes = args[0]; + std::vector names; + dmlc::MemoryStringStream memstrm(&bytes); + dmlc::Stream* strm = &memstrm; + uint64_t header, reserved; + CHECK(strm->Read(&header)) + << "Invalid parameters file format"; + CHECK(header == kTVMNDArrayListMagic) + << "Invalid parameters file format"; + CHECK(strm->Read(&reserved)) + << "Invalid parameters file format"; + CHECK(strm->Read(&names)) + << "Invalid parameters file format"; + uint64_t sz; + strm->Read(&sz, sizeof(sz)); + size_t size = static_cast(sz); + CHECK(size == names.size()) + << "Invalid parameters file format"; + tvm::Array ret; + for (size_t i = 0; i < size; ++i) { + tvm::runtime::NDArray temp; + temp.Load(strm); + auto n = tvm::make_node(); + n->name = std::move(names[i]); + n->array = temp; + ret.push_back(NamedNDArray(n)); + } + *rv = ret; + }); + +TVM_REGISTER_NODE_TYPE(NamedNDArrayNode); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/param_dict.h b/src/relay/backend/param_dict.h new file mode 100644 index 000000000000..0c32d2bf4742 --- /dev/null +++ b/src/relay/backend/param_dict.h @@ -0,0 +1,43 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file param_dict.h + * \brief Definitions for serializing and deserializing parameter dictionaries. + */ +#ifndef TVM_RELAY_BACKEND_PARAM_DICT_H_ +#define TVM_RELAY_BACKEND_PARAM_DICT_H_ + +#include +#include +#include +#include + +#include + +namespace tvm { +namespace relay { + +/*! \brief Magic number for NDArray list file */ +constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7; + +/*! + * \brief Wrapper node for naming `NDArray`s. + */ +struct NamedNDArrayNode : public ::tvm::Node { + std::string name; + tvm::runtime::NDArray array; + + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("name", &name); + v->Visit("array", &array); + } + + static constexpr const char* _type_key = "NamedNDArray"; + TVM_DECLARE_NODE_TYPE_INFO(NamedNDArrayNode, Node); +}; + +TVM_DEFINE_NODE_REF(NamedNDArray, NamedNDArrayNode); + +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_BACKEND_PARAM_DICT_H_ diff --git a/tests/python/relay/test_param_dict.py b/tests/python/relay/test_param_dict.py new file mode 100644 index 000000000000..b398ea8ba2f5 --- /dev/null +++ b/tests/python/relay/test_param_dict.py @@ -0,0 +1,78 @@ +import os +import numpy as np +import tvm +import json +import base64 +from tvm._ffi.base import py_str +from tvm.relay.op import add +from tvm import relay +from tvm import rpc +from tvm.contrib import util, graph_runtime + + +def test_save_load(): + x = np.ones((10, 2)).astype("float32") + y = np.ones((1, 2, 3)).astype("float32") + params = {"x": x, "y": y} + param_bytes = relay.save_param_dict(params) + assert isinstance(param_bytes, bytearray) + param2 = relay.load_param_dict(param_bytes) + assert len(param2) == 2 + np.testing.assert_equal(param2["x"].asnumpy(), x) + np.testing.assert_equal(param2["y"].asnumpy(), y) + + +def test_ndarray_reflection(): + # Make two `NDArrayWrapper`s that point to the same underlying array. 
+ np_array = np.random.uniform(size=(10, 2)).astype("float32") + tvm_array = tvm.nd.array(np_array) + param_dict = {'x': tvm_array, 'y': tvm_array} + assert param_dict['x'].same_as(param_dict['y']) + # Serialize then deserialize `param_dict`. + deser_param_dict = relay.load_param_dict(relay.save_param_dict(param_dict)) + # Make sure the data matches the original data and `x` and `y` contain the same data. + np.testing.assert_equal(deser_param_dict['x'].asnumpy(), tvm_array.asnumpy()) + # Make sure `x` and `y` contain the same data. + np.testing.assert_equal(deser_param_dict['x'].asnumpy(), deser_param_dict['y'].asnumpy()) + + +def test_bigendian_rpc_param(): + """Test big endian rpc when there is a PowerPC RPC server available""" + host = os.environ.get("TVM_POWERPC_TEST_HOST", None) + port = os.environ.get("TVM_POWERPC_TEST_PORT", 9090) + if host is None: + return + + def verify_graph_runtime(remote, target, shape, dtype): + x = relay.var('x') + y = relay.const(1) + z = relay.add(x, y) + func = relay.Function([x], z) + + x_in = np.ones(shape).astype(dtype) + params = {'x': x_in} + graph, lib, params = relay.build(func, target=target, params=params) + + temp = util.tempdir() + path_dso = temp.relpath("dev_lib.o") + lib.save(path_dso) + remote.upload(path_dso) + lib = remote.load_module("dev_lib.o") + ctx = remote.cpu(0) + mod = graph_runtime.create(graph, lib, ctx) + mod.load_params(relay.save_param_dict(params)) + mod.run() + out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx)) + tvm.testing.assert_allclose(x_in + 1, out.asnumpy()) + + print("Test RPC connection to PowerPC...") + remote = rpc.connect(host, port) + target = "llvm -mtriple=powerpc-linux-gnu" + for dtype in ["float32", "float64", "int32", "int8"]: + verify_graph_runtime(remote, target, (10,), dtype) + + +if __name__ == "__main__": + test_save_load() + test_ndarray_reflection() + test_bigendian_rpc_param() From d5f6064a51c80e4e5b2bc9864a052e097f85bdaa Mon Sep 17 00:00:00 2001 From: Hao 
Jin Date: Wed, 27 Feb 2019 00:01:52 -0800 Subject: [PATCH 30/93] add converter for MXNet slice in nnvm and relay (#2662) --- nnvm/python/nnvm/frontend/mxnet.py | 14 ++++++++++++++ nnvm/tests/python/frontend/mxnet/test_forward.py | 9 +++++++++ python/tvm/relay/frontend/mxnet.py | 16 ++++++++++++++++ tests/python/frontend/mxnet/test_forward.py | 7 +++++++ 4 files changed, 46 insertions(+) diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index bdea6bb10fbc..d8855693e7d5 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -189,6 +189,19 @@ def _reshape(inputs, attrs): new_attrs['shape'] = _required_attr(attrs, 'shape') return _get_nnvm_op(op_name)(*inputs, **new_attrs) +def _slice(inputs, attrs): + begin = attrs.get('begin', None) + end = attrs.get('end', None) + stride = attrs.get('step', None) + if begin is None or end is None: + raise RuntimeError('begin and end are required params') + if 'None' in begin or 'None' in end: + raise RuntimeError('None in begin or end not supported yet...') + new_attrs = {'begin': begin, 'end': end} + if stride is not None: + new_attrs['stride'] = stride + return _get_nnvm_op('strided_slice')(inputs[0], **new_attrs) + def _split(inputs, attrs): op_name, new_attrs = 'split', {} axis = attrs.get('axis', 1) @@ -349,6 +362,7 @@ def _argmin(inputs, attrs): 'Pooling' : _pooling, 'Pooling_v1' : _pooling, 'Reshape' : _reshape, + 'slice' : _slice, 'SliceChannel' : _split, 'split' : _split, 'Softmax' : _rename('softmax'), diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index e9225a4c7c50..97ffa20b3edc 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -220,6 +220,14 @@ def test_forward_where(): tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) +def 
test_forward_slice(): + data = mx.sym.var('data') + mx_sym = mx.sym.slice(data, begin=(0, 1), end=(2, 4)) + verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 3)) + mx_sym = mx.sym.slice(data, begin=(-1, 1), end=(-3, 4), step=(-1, 2)) + verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 2)) + + if __name__ == '__main__': test_forward_mlp() test_forward_vgg() @@ -242,4 +250,5 @@ def test_forward_where(): test_forward_argmax() test_forward_argmin() test_forward_where() + test_forward_slice() diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index c48a116a9d0e..9ef5f626393a 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -172,6 +172,21 @@ def _mx_batch_norm(inputs, attrs): return _op.nn.batch_norm(*inputs, **new_attrs) +def _mx_slice(inputs, attrs): + new_attrs = {} + begin = attrs.get_int_tuple('begin', None) + end = attrs.get_int_tuple('end', None) + stride = attrs.get_int_tuple('step', None) + if begin is None or end is None: + raise RuntimeError("begin and end are required parameters.") + if None in begin or None in end: + raise RuntimeError("None in begin or end is not supported yet.") + new_attrs = {'begin': begin, 'end': end} + if stride is not None: + new_attrs['strides'] = stride + return _op.strided_slice(inputs[0], **new_attrs) + + def _mx_split(inputs, attrs): axis = attrs.get_int("axis", 1) new_attrs = {} @@ -368,6 +383,7 @@ def _mx_roi_align(inputs, attrs): "BatchNorm" : _mx_batch_norm, "BatchNorm_v1" : _mx_batch_norm, "LRN" : _mx_lrn, + "slice" : _mx_slice, "SliceChannel" : _mx_split, "split" : _mx_split, "expand_dims" : _mx_expand_dims, diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index ca1bdbbbefc9..671316079308 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -190,6 +190,13 @@ def test_forward_argmin(): mx_sym = mx.sym.argmin(data, axis=0) 
verify_mxnet_frontend_impl(mx_sym, (5, 4), (4,)) +def test_forward_slice(): + data = mx.sym.var('data') + mx_sym = mx.sym.slice(data, begin=(0, 1), end=(2, 4)) + verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 3)) + mx_sym = mx.sym.slice(data, begin=(-1, 1), end=(-3, 4), step=(-1, 2)) + verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 2)) + def test_forward_where(): cond = mx.sym.var('cond') x = mx.sym.var('x') From 9c2a4e15bfb5a25b79c91f8d3156252cf636093d Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Wed, 27 Feb 2019 18:01:05 +0000 Subject: [PATCH 31/93] [PYLINT] Disable consider-using-get (#2654) --- tests/lint/pylintrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lint/pylintrc b/tests/lint/pylintrc index 18f526702ad8..355e2ad5acd1 100644 --- a/tests/lint/pylintrc +++ b/tests/lint/pylintrc @@ -65,7 +65,7 @@ enable=indexing-exception,old-raise-syntax # --enable=similarities". If you want to run only the classes checker, but have # no Warning level messages displayed, use"--disable=all --enable=classes # --disable=W" -disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,protected-access,useless-object-inheritance +disable=design,similarities,no-self-use,attribute-defined-outside-init,locally-disabled,star-args,pointless-except,bad-option-value,global-statement,fixme,suppressed-message,useless-suppression,locally-enabled,no-member,no-name-in-module,import-error,unsubscriptable-object,unbalanced-tuple-unpacking,undefined-variable,protected-access,useless-object-inheritance,consider-using-get [REPORTS] From 5fcb16f5401e7623d47a340fde7951c1f24bb6bc Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Thu, 28 Feb 2019 03:24:23 +0900 Subject: [PATCH 32/93] [DOC] CoreML frontend 
tutorial (#2667) * [DOC] CoreML frontend tutorial * Update tutorials/frontend/from_coreml.py Co-Authored-By: kazum * Update tutorials/frontend/from_coreml.py Co-Authored-By: kazum * Addressed comments and added the original author --- tutorials/frontend/from_coreml.py | 101 ++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 tutorials/frontend/from_coreml.py diff --git a/tutorials/frontend/from_coreml.py b/tutorials/frontend/from_coreml.py new file mode 100644 index 000000000000..a79e21921068 --- /dev/null +++ b/tutorials/frontend/from_coreml.py @@ -0,0 +1,101 @@ +""" +Compile CoreML Models +===================== +**Author**: `Joshua Z. Zhang `_, \ + `Kazutaka Morita `_ + +This article is an introductory tutorial to deploy CoreML models with Relay. + +For us to begin with, coremltools module is required to be installed. + +A quick solution is to install via pip + +.. code-block:: bash + + pip install -U coremltools --user + +or please refer to official site +https://github.com/apple/coremltools +""" +import tvm +import tvm.relay as relay +import coremltools as cm +import numpy as np +from PIL import Image + +def download(url, path, overwrite=False): + import os + if os.path.isfile(path) and not overwrite: + print('File {} existed, skip.'.format(path)) + return + print('Downloading from url {} to {}'.format(url, path)) + try: + import urllib.request + urllib.request.urlretrieve(url, path) + except: + import urllib + urllib.urlretrieve(url, path) + +###################################################################### +# Load pretrained CoreML model +# ---------------------------- +# We will download and load a pretrained mobilenet classification network +# provided by apple in this example +model_url = 'https://docs-assets.developer.apple.com/coreml/models/MobileNet.mlmodel' +model_file = 'mobilenet.mlmodel' +download(model_url, model_file) +# Now you have mobilenet.mlmodel on disk +mlmodel = cm.models.MLModel(model_file) + 
+###################################################################### +# Load a test image +# ------------------ +# A single cat dominates the examples! +img_url = 'https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true' +download(img_url, 'cat.png') +img = Image.open('cat.png').resize((224, 224)) +x = np.transpose(img, (2, 0, 1))[np.newaxis, :] + +###################################################################### +# Compile the model on Relay +# --------------------------- +# We should be familiar with the process right now. +target = 'cuda' +shape_dict = {'image': x.shape} + +# Parse CoreML model and convert into Relay computation graph +func, params = relay.frontend.from_coreml(mlmodel, shape_dict) + +with relay.build_config(opt_level=3): + graph, lib, params = relay.build(func, target, params=params) + +###################################################################### +# Execute on TVM +# ------------------- +# The process is no different from other example +from tvm.contrib import graph_runtime +ctx = tvm.gpu(0) +dtype = 'float32' +m = graph_runtime.create(graph, lib, ctx) +# set inputs +m.set_input('image', tvm.nd.array(x.astype(dtype))) +m.set_input(**params) +# execute +m.run() +# get outputs +tvm_output = m.get_output(0) +top1 = np.argmax(tvm_output.asnumpy()[0]) + +##################################################################### +# Look up synset name +# ------------------- +# Look up prediction top 1 index in 1000 class synset. 
+synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/', + '4d0b62f3d01426887599d4f7ede23ee5/raw/', + '596b27d23537e5a1b5751d2b0481ef172f58b539/', + 'imagenet1000_clsid_to_human.txt']) +synset_name = 'synset.txt' +download(synset_url, synset_name) +with open(synset_name) as f: + synset = eval(f.read()) +print('Top-1 id', top1, 'class name', synset[top1]) From 8614a7c31f3d5cba2503a4c5947611b6acd72361 Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Thu, 28 Feb 2019 03:46:05 +0800 Subject: [PATCH 33/93] Support mean in NNVM to Relay converter. (#2680) --- nnvm/python/nnvm/to_relay.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/nnvm/python/nnvm/to_relay.py b/nnvm/python/nnvm/to_relay.py index 030fe9991331..264a18d90c77 100644 --- a/nnvm/python/nnvm/to_relay.py +++ b/nnvm/python/nnvm/to_relay.py @@ -377,6 +377,16 @@ def _dropout(children, attrs, odtype='float32'): rate = attrs.get_float('rate', 0.5) return op.nn.dropout(children[0], rate) +def _mean(children, attrs, odtype='float32'): + axis = None + try: + axis = [attrs.get_int('axis', None)] + except ValueError: + axis = axis or attrs.get_int_tuple('axis', None) + keepdims = attrs.get_bool('keepdims') + + return op.mean(children[0], axis, keepdims) + NNVM_OP_2_RELAY_OP = { 'flatten': _nn_batch_flatten, @@ -388,6 +398,7 @@ def _dropout(children, attrs, odtype='float32'): 'reshape': _reshape, 'transpose': _transpose, 'dropout': _dropout, + 'mean': _mean, # Addition '__add_scalar__': _add, 'broadcast_add': _add, From 87dba8294b854cb81306575b725b15e2244e1b1a Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Wed, 27 Feb 2019 19:47:14 +0000 Subject: [PATCH 34/93] Stop pylint complaining about unnecessary return statement. (#2684) Recent pylint introduced support for the useless-return diagnostic. This patch remove the useless returns. 
--- python/tvm/relay/quantize/_annotate.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index 912aa9a0a23c..d56f21b2e2bb 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -124,7 +124,6 @@ def conv2d_nchwc_rewrite(ref_call, new_args, ctx): warnings.warn("NCHWc layout Conv2D detected, please use a lower " "optimization level before applying the quantization " "pass as quantization will have no effect here...") - return None @register_annotate_function("nn.conv2d") From fad2597702afb0bedf0b1c0f6be4a9fa52afc114 Mon Sep 17 00:00:00 2001 From: Takeshi Watanabe Date: Thu, 28 Feb 2019 04:48:32 +0900 Subject: [PATCH 35/93] [RUST] Fix typo (#2681) --- rust/frontend/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/frontend/README.md b/rust/frontend/README.md index 5bd4362aefc4..9f46cf760c91 100644 --- a/rust/frontend/README.md +++ b/rust/frontend/README.md @@ -215,5 +215,5 @@ fn main() { .unwrap(); assert_eq!(ret, 14f64); - } +} ``` From 6fee9f6f46ca9be8be4cf30af1967b958afccefe Mon Sep 17 00:00:00 2001 From: Salem Derisavi Date: Wed, 27 Feb 2019 17:01:54 -0500 Subject: [PATCH 36/93] Handle Select in IntSetEvaluator (#2687) --- src/arithmetic/int_set.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/arithmetic/int_set.cc b/src/arithmetic/int_set.cc index 1136cf0b1206..ed6e55904cdd 100644 --- a/src/arithmetic/int_set.cc +++ b/src/arithmetic/int_set.cc @@ -531,6 +531,11 @@ class IntSetEvaluator : CHECK(eval_vec_); return Eval(op->value); } + IntSet VisitExpr_(const Select* op, const Expr& e) final { + IntSet true_set = this->Eval(op->true_value); + IntSet false_set = this->Eval(op->false_value); + return Union({false_set, true_set}); + } IntSet VisitExprDefault_(const Node* op, const Expr& e) final { LOG(WARNING) << "cannot evaluate set type " << e->type_key(); return IntSet::everything(); From 
6897874e8e7240927f25c4419f7fc134fc31d5b6 Mon Sep 17 00:00:00 2001 From: Denis Khalikov Date: Thu, 28 Feb 2019 01:38:01 +0300 Subject: [PATCH 37/93] [CODEGEN LLVM GPU] Initialize llvm before lookup for the target (#2683) --- python/tvm/contrib/nvcc.py | 2 +- src/codegen/llvm/codegen_amdgpu.cc | 1 + src/codegen/llvm/codegen_nvptx.cc | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py index 99cea18d1487..f9970f6bceb2 100644 --- a/python/tvm/contrib/nvcc.py +++ b/python/tvm/contrib/nvcc.py @@ -151,7 +151,7 @@ def find_libdevice_path(arch): selected_ver = 0 selected_path = None cuda_ver = get_cuda_version(cuda_path) - if cuda_ver in (9.0, 9.1): + if cuda_ver in (9.0, 9.1, 10.0): path = os.path.join(lib_path, "libdevice.10.bc") else: for fn in os.listdir(lib_path): diff --git a/src/codegen/llvm/codegen_amdgpu.cc b/src/codegen/llvm/codegen_amdgpu.cc index d1a0716bc1d9..205d99f1ab65 100644 --- a/src/codegen/llvm/codegen_amdgpu.cc +++ b/src/codegen/llvm/codegen_amdgpu.cc @@ -156,6 +156,7 @@ inline int DetectROCMComputeVersion(const std::string& target) { } runtime::Module BuildAMDGPU(Array funcs, std::string target) { + InitializeLLVM(); CHECK(target.length() >= 4 && target.substr(0, 4) == "rocm"); std::ostringstream config; diff --git a/src/codegen/llvm/codegen_nvptx.cc b/src/codegen/llvm/codegen_nvptx.cc index 2d416d34ea0c..0a9361c57de7 100644 --- a/src/codegen/llvm/codegen_nvptx.cc +++ b/src/codegen/llvm/codegen_nvptx.cc @@ -166,6 +166,7 @@ inline int DetectCUDAComputeVersion() { } runtime::Module BuildNVPTX(Array funcs, std::string target) { + InitializeLLVM(); CHECK(target.length() >= 5 && target.substr(0, 5) == "nvptx"); int compute_ver = DetectCUDAComputeVersion(); From d0e52542d547d9ece5c012af31dd2b4b748b3c1a Mon Sep 17 00:00:00 2001 From: lixiaoquan Date: Thu, 28 Feb 2019 10:01:06 +0800 Subject: [PATCH 38/93] [RELAY] Fix get_int_tuple for shape like '(1001,)' (#2691) 
tshape.strip('()[]').split(',') will make a list ['1001',''] but the empty one isn't needed. --- python/tvm/relay/frontend/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index be23f2b50273..ef9f63f3cd95 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -106,7 +106,7 @@ def get_int_tuple(self, key, default=RequiredAttr()): """ if key in self.attrs: tshape = self.attrs[key] - return tuple(int(x.strip()) for x in tshape.strip('()[]').split(',')) + return tuple(int(x.strip()) for x in tshape.strip('()[]').split(',') if x) if isinstance(default, RequiredAttr): raise AttributeError("Required attribute {} not found.".format(key)) return default From bce740df9b04041b7b1f65a5713f06bd97bd535b Mon Sep 17 00:00:00 2001 From: eqy Date: Wed, 27 Feb 2019 20:25:35 -0800 Subject: [PATCH 39/93] [AUTOTVM] tweak `sample_int` implementation (#2677) * check in * lint * cleanup * Update util.py --- python/tvm/autotvm/util.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/tvm/autotvm/util.py b/python/tvm/autotvm/util.py index 2b52bfb46992..528eb24be380 100644 --- a/python/tvm/autotvm/util.py +++ b/python/tvm/autotvm/util.py @@ -4,6 +4,8 @@ import multiprocessing import time +from random import randrange + import numpy as np from .. 
import expr, ir_pass @@ -59,9 +61,9 @@ def sample_ints(low, high, m): vis = set() assert m <= high - low while len(vis) < m: - new = np.random.randint(low, high) + new = randrange(low, high) while new in vis: - new = np.random.randint(low, high) + new = randrange(low, high) vis.add(new) return list(vis) From 6dd5cbac9e26f22b769163d2a5c761960e961c14 Mon Sep 17 00:00:00 2001 From: Yizhi Liu Date: Wed, 27 Feb 2019 20:26:22 -0800 Subject: [PATCH 40/93] [Lang] Layout in TVM node system (#2509) * move layout.h & layout.cc from relay to tvm * change ConvertLayout in relay to bijectiveLayout->Forward/backward * add first test case * add LayoutAxis * add LayoutAxis struct and compiles * simplify BijectiveLayout rule consturct * polish func name for Layout, move impl to .cc, remove Layout::defined(), add defined() checker * partially add layout py support * add layout test cases * add doc for tvm.layout & tvm.bijective_layout * fix lint * fix lint * fix layout name generation bug * fix layout typo * address comments and add topi.layout_transform * layout.h->data_layout.h, test_lang_layout.py->test_lang_data_layout.py --- docs/api/python/topi.rst | 2 + include/tvm/data_layout.h | 335 ++++++++++++++ nnvm/src/top/nn/nn.cc | 38 +- python/tvm/api.py | 45 +- python/tvm/tensor.py | 139 ++++++ src/api/api_lang.cc | 58 +++ src/lang/data_layout.cc | 322 +++++++++++++ src/relay/op/debug.cc | 2 +- src/relay/op/image/resize.cc | 13 +- src/relay/op/layout.cc | 80 ---- src/relay/op/layout.h | 432 ------------------ src/relay/op/nn/convolution.cc | 93 ++-- src/relay/op/nn/nn.cc | 2 +- src/relay/op/nn/pad.cc | 2 +- src/relay/op/nn/pooling.cc | 44 +- src/relay/op/nn/upsampling.cc | 14 +- src/relay/op/tensor/transform.cc | 54 +-- src/relay/pass/alter_op_layout.cc | 2 +- src/relay/pass/alter_op_layout.h | 20 +- src/relay/pass/combine_parallel_conv2d.cc | 6 +- src/relay/pass/fold_scale_axis.cc | 32 +- src/relay/pass/mac_count.cc | 6 +- src/relay/pass/pattern_util.h | 7 +- 
.../python/unittest/test_lang_data_layout.py | 65 +++ topi/include/topi/nn.h | 23 - topi/include/topi/transform.h | 39 ++ topi/python/topi/transform.py | 17 + topi/src/topi.cc | 5 + topi/tests/python/test_topi_transform.py | 29 ++ 29 files changed, 1200 insertions(+), 726 deletions(-) create mode 100644 include/tvm/data_layout.h create mode 100644 src/lang/data_layout.cc delete mode 100644 src/relay/op/layout.cc delete mode 100644 src/relay/op/layout.h create mode 100644 tests/python/unittest/test_lang_data_layout.py diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index ec5d600dab2b..9680adc1231b 100644 --- a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -68,6 +68,7 @@ List of operators topi.greater_equal topi.less_equal topi.arange + topi.layout_transform topi.image.resize @@ -125,6 +126,7 @@ topi .. autofunction:: topi.greater .. autofunction:: topi.less .. autofunction:: topi.arange +.. autofunction:: topi.layout_transform topi.nn ~~~~~~~ diff --git a/include/tvm/data_layout.h b/include/tvm/data_layout.h new file mode 100644 index 000000000000..99aebc3a1c31 --- /dev/null +++ b/include/tvm/data_layout.h @@ -0,0 +1,335 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/data_layout.h + * \brief Layout expression to describe the data organization of a tensor. + * And BijectiveLayout to mapping two data layouts between each other. + */ +#ifndef TVM_DATA_LAYOUT_H_ +#define TVM_DATA_LAYOUT_H_ + +#include +#include + +#include +#include +#include +#include +#include + +#include "ir_operator.h" + +namespace tvm { + +class LayoutAxis { + public: + static const LayoutAxis& Get(const char name); + + // Get the singleton LayoutAxis using itvar->var->name_hint + static const LayoutAxis& Get(const IterVar& itvar); + + // Get the singleton LayoutAxis using name[0] (size of name must be 1). 
+ static const LayoutAxis& make(const std::string& name); + + inline bool IsPrimal() const { return name_ >= 'A' && name_ <= 'Z'; } + inline std::string name() const { return std::string(1, name_); } + + // if current axis is primal, switch the axis to its subordinate one, + // else switch to the primal. + inline const LayoutAxis& ToDual() const { + if (name_ >= 'A' && name_ <= 'Z') { + return LayoutAxis::Get(name_ - 'A' + 'a'); + } else { + return LayoutAxis::Get(name_ - 'a' + 'A'); + } + } + + // return the primal axis. If it is already primal, return itself. + const LayoutAxis& ToPrimal() const { + return IsPrimal() ? *this : ToDual(); + } + + // return the subordinate axis. If it is already subordinate, return itself. + const LayoutAxis& ToSubordinate() const { + return IsPrimal() ? ToDual() : *this; + } + + inline bool operator==(const LayoutAxis& rhs) const { + return name_ == rhs.name_; + } + + friend std::ostream& operator<<(std::ostream& os, const LayoutAxis& l) { + os << l.name(); + return os; + } + + private: + static const LayoutAxis UPPER_CASE[]; + static const LayoutAxis LOWER_CASE[]; + LayoutAxis(const LayoutAxis&); + LayoutAxis& operator=(const LayoutAxis&); + explicit LayoutAxis(const char name) : name_(name) {} + + const char name_; +}; + +class Layout; +// Internal node container Buffer +class LayoutNode : public Node { + public: + /*! \brief string representation of layout */ + std::string name; + /*! \brief specify each axis of the layout, + * in which the variable name is the name of the axis. + * The IterVar's extent indicates the size of the axis, + * it is a variable for a primal axis, but a constant for a subordinate axis. + */ + Array axes; + + void VisitAttrs(AttrVisitor* v) final { + v->Visit("name", &name); + v->Visit("axes", &axes); + } + + TVM_DLL static Layout make(const std::string& layout); + + static constexpr const char* _type_key = "Layout"; + TVM_DECLARE_NODE_TYPE_INFO(LayoutNode, Node); +}; + +/*! 
+ * \brief Layout is to describe how data is organized within an N-dimention tensor. + * It is composed of upper cases, lower cases and numbers, + * where upper case indicates a primal axis and + * the corresponding lower case with factor size indicates the subordinate axis. + * For example, NCHW16c can describe a 5-D tensor of + * [batch_size, channel, height, width, channel_block]. + * Here subordinate axis channel_block=16 is the factor size of the primal axis C (channel). + */ +class Layout : public NodeRef { + public: + explicit Layout(NodePtr n) : NodeRef(n) {} + + /*! \brief default constructor */ + Layout() = default; + + explicit Layout(const Array& axes); + + /*! \brief construct from a string */ + Layout(const char* name) : Layout(std::string(name)) {} // NOLINT(*) + + /*! + * \brief construct from a string. + * \param name input in layout convention: + * upper case indicates a dimension and + * the corresponding lower case with factor size + * indicates the split dimension. + * return undefined layout if "__undef__" is passed. + */ + Layout(const std::string& name); // NOLINT(*) + + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + const LayoutNode* operator->() const { + return static_cast(node_.get()); + } + + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + LayoutNode* operator->() { + return static_cast(node_.get()); + } + + /*! + * \brief Return an undefined layout. + * \return a (global) undefined layout. + */ + static const Layout& Undef() { + static Layout undef; + return undef; + } + + /*! + * \brief Returns a sub-layout which is the portion of the object + * that starts at dimension \p pos and spans \p len dimensions + * (or until the end of the layout, whichever comes first). + * \param pos The start position. + * \param len The length of the sub-layout. + * \return A newly constructed Layout object. 
+ */ + Layout SubLayout(size_t pos, size_t len) const; + + /*! + * \brief Split \p axis by \p size and put the sub-axis to position \p target_pos. + * \param axis The source axis to be split. It must be a primal-axis; + * \param target_pos The target position of the newly split subordinate-axis. + * \param factor size of the sub-dimension. + * \return A newly constructed Layout object. + */ + Layout Split(const LayoutAxis &axis, size_t target_pos, int32_t factor) const; + + + /*! \return number of dimensions */ + inline size_t ndim() const { + if (!defined()) return 0; + return operator->()->axes.size(); + } + + /*! \return number of super dimensions */ + inline size_t ndim_primal() const { + if (!defined()) return 0; + size_t ct = 0; + for (auto x : operator->()->axes) { + if (LayoutAxis::Get(x).IsPrimal()) { + ct++; + } + } + return ct; + } + + /*! + * \brief return the index of the input axis. + * If it is not found in the layout or the layout is undefined, + * return -1. + * \param axis the input axis. + * \return the index or -1 if not found. + */ + inline int32_t IndexOf(const LayoutAxis& axis) const { + if (!this->defined()) return -1; + const auto axes = operator->()->axes; + for (size_t i = 0; i < axes.size(); ++i) { + if (axes[i]->var.get()->name_hint == axis.name()) return static_cast(i); + } + return -1; + } + + /*! + * \brief Get the factor size of the subordinate axis. + * \param axis the input primal-axis or subordinate-axis. + * \return the size of the subordinate-axis of \p axis (if \p axis is a primal-axis), + * or the size of \p axis itself (if \p axis is a subordinate-axis). + * Return -1 if \p axis is not in the layout the layout is undefined. + */ + int32_t FactorOf(const LayoutAxis& axis) const; + + /*! + * \brief Whether the layout contains an axis. + * \param axis axis to be checked. + * \return Whether the layout contains the axis. 
+ */ + bool Contains(const LayoutAxis& axis) const { + if (!defined()) return false; + for (const IterVar var : operator->()->axes) { + if (var->var.get()->name_hint == axis.name()) { + return true; + } + } + return false; + } + + const LayoutAxis& operator[](int32_t i) const { + CHECK(defined()) << "Try to access axis from an undefined layout."; + int32_t index = i < 0 ? static_cast(ndim() + i) : i; + CHECK(index >= 0 && static_cast(index) < ndim()) << "Invalid index " << i; + const IterVar axis = operator->()->axes[index]; + return LayoutAxis::Get(axis); + } + + /*! \return the string description of the layout */ + inline std::string name() const { + if (!defined()) return "__undef__"; + return operator->()->name; + } + + /*! + * \brief Whether the two layouts are equal. + * \param rhs Another layout. + * \return whether the two layouts are equal. + */ + inline bool Equals(const Layout &rhs) const { + return name() == rhs.name(); + } + + /*! + * \brief allow output string of layout to ostream + * \param os the output stream + * \param l the layout + * \return the ostream + */ + friend std::ostream& operator<<(std::ostream& os, const Layout& l) { + os << l.name(); + return os; + } + + using ContainerType = LayoutNode; +}; + +class BijectiveLayout; +// Internal node container BijectiveLayout +class BijectiveLayoutNode : public Node { + public: + /*! \brief Describes how source axes can be mapped to the destination axes, + * e.g., [i0 / 16, i1, i0 % 16] can describe NC -> NC16n + */ + Array forward_rule; + /*! \brief Describes how destination axes can be mapped to the source axes */ + Array backward_rule; + + /*! \brief The source layout */ + Layout src_layout; + /*! 
\brief The destination layout */ + Layout dst_layout; + + void VisitAttrs(AttrVisitor* v) final { + v->Visit("src_layout", &src_layout); + v->Visit("dst_layout", &dst_layout); + v->Visit("forward_rule", &forward_rule); + v->Visit("backward_rule", &backward_rule); + } + + static constexpr const char* _type_key = "BijectiveLayout"; + TVM_DECLARE_NODE_TYPE_INFO(BijectiveLayoutNode, Node); + + TVM_DLL static BijectiveLayout make(const Layout& src_layout, + const Layout& dst_layout); +}; + +/*! \brief Bijective function mapping for data layout transformation. + * Given two Layout, BijectiveLayout build and store the mapping rules, + * provides API to transform N-dimention tensor from the source indices (i0, i1, …, im) + * to the destination indices (j0, j1, … jm). + */ +class BijectiveLayout : public NodeRef { + public: + BijectiveLayout() = default; + explicit BijectiveLayout(NodePtr n) : NodeRef(n) {} + + // Given the source shape, infer the destination shape. + TVM_DLL Array ForwardShape(const Array& shape) const; + // Given the destination shape, recover the source shape. + TVM_DLL Array BackwardShape(const Array& dst_shape) const; + // Given the destination indices, infer the destination indices. + TVM_DLL Array ForwardIndex(const Array& index) const; + // Given the destination indices, recover the source indices. + TVM_DLL Array BackwardIndex(const Array& dst_index) const; + + /*! + * \brief access the internal node container + * \return the pointer to the internal node container + */ + inline const BijectiveLayoutNode* operator->() const; + + /*! 
\brief specify container node */ + using ContainerType = BijectiveLayoutNode; +}; + +inline const BijectiveLayoutNode* BijectiveLayout::operator->() const { + return static_cast(node_.get()); +} + +} // namespace tvm + +#endif // TVM_DATA_LAYOUT_H_ diff --git a/nnvm/src/top/nn/nn.cc b/nnvm/src/top/nn/nn.cc index e9a556281ff0..f213fa3a19ec 100644 --- a/nnvm/src/top/nn/nn.cc +++ b/nnvm/src/top/nn/nn.cc @@ -726,42 +726,8 @@ the input array by output[n, c, h, w, C] = data[n, C*16+c, h, w] const Array& inputs, const Array& outputs) { const LayoutTransformParam& param = nnvm::get(attrs.parsed); - - Layout src_layout(param.src_layout); - Layout dst_layout(param.dst_layout); - - if (src_layout == dst_layout) { - return Array{ inputs[0] }; - } else if (!src_layout.defined() || !dst_layout.defined()) { - LOG(FATAL) << "cannot convert from/to undefined layout"; - } - - CHECK(src_layout.convertible(dst_layout)) << "cannot convert from " << param.src_layout - << " to " << param.dst_layout; - - return Array { - topi::layout_transform(inputs[0], outputs[0]->shape, [&](const Array& dst_indices) { - std::vector dst_to_src_indices; - for (Layout::LayoutDim src_axis : src_layout) { - int dst_major_pos = dst_layout.indexof(Layout::to_superdim(src_axis)); - int dst_minor_pos = dst_layout.indexof(Layout::to_subdim(src_axis)); - int32_t src_factor = static_cast(src_layout.subsizeof(src_axis)); - int32_t dst_factor = static_cast(dst_layout.subsizeof(src_axis)); - - Expr src_index(dst_indices[dst_major_pos]); - if (dst_minor_pos >= 0) { - CHECK_GT(dst_factor, 0); - src_index = src_index * dst_factor + dst_indices[dst_minor_pos]; - } - if (Layout::is_superdim(src_axis) && src_factor > 0) { - src_index = src_index / src_factor; - } else if (Layout::is_subdim(src_axis) && src_factor > 0) { - src_index = src_index % src_factor; - } - dst_to_src_indices.push_back(src_index); - } - return Array(dst_to_src_indices); - }) + return Array{ + topi::layout_transform(inputs[0], param.src_layout, 
param.dst_layout) }; }) .set_support_level(1); diff --git a/python/tvm/api.py b/python/tvm/api.py index 514490ae83ea..7b81f863f6b0 100644 --- a/python/tvm/api.py +++ b/python/tvm/api.py @@ -515,7 +515,7 @@ def decl_buffer(shape, scope="", data_alignment=-1, offset_factor=0): - """Decleare a new symbolic buffer. + """Declare a new symbolic buffer. Normally buffer is created automatically during lower and build. This is only needed if user want to specify their own buffer layout. @@ -587,6 +587,49 @@ def decl_buffer(shape, data, dtype, shape, strides, elem_offset, name, scope, data_alignment, offset_factor) +def layout(layout_str): + """Create a layout node from a string. + + Parameters + ---------- + layout_str : str + A layout representation is composed of upper cases, lower cases and numbers, + where upper case indicates a primal axis and + the corresponding lower case with factor size indicates the subordinate axis. + For example, NCHW16c can describe a 5-D tensor of + [batch_size, channel, height, width, channel_block]. + Here subordinate axis channel_block=16 is the factor size of + the primal axis C (channel). + + Returns + ------- + layout : Layout + The created layout + """ + return _api_internal._Layout(layout_str) + +def bijective_layout(src_layout, dst_layout): + """Create a bijective layout mapping. + + Parameters + ---------- + src_layout : str or Layout + source layout. + + dst_layout : str or Layout + destination layout. 
+ + Returns + ------- + bijective_layout : BijectiveLayout + The created bijective layout + """ + if isinstance(src_layout, str): + src_layout = layout(src_layout) + if isinstance(dst_layout, str): + dst_layout = layout(dst_layout) + return _api_internal._BijectiveLayout(src_layout, dst_layout) + def _IterVar(dom, name, iter_type, thread_tag=''): """Internal function to create IterVar diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py index 6e7a2b357a96..ce8f16d6a309 100644 --- a/python/tvm/tensor.py +++ b/python/tvm/tensor.py @@ -185,3 +185,142 @@ class HybridOp(Operation): def axis(self): """Represent axis of IterVar, also defined when it is a HybridOp""" return self.__getattr__("axis") + + +@register_node +class Layout(NodeBase): + """Layout is composed of upper cases, lower cases and numbers, + where upper case indicates a primal axis and + the corresponding lower case with factor size indicates the subordinate axis. + For example, NCHW16c can describe a 5-D tensor of + [batch_size, channel, height, width, channel_block]. + Here subordinate axis channel_block=16 is the factor size of the primal axis C (channel). + + Do not construct directly, use :any:`layout` instead. + See the documentation of :any:`layout` for more details. + + See Also + -------- + layout : Declare a layout + """ + def __str__(self): + return self.name + + def __repr__(self): + return "Layout(" + self.name + ")" + + def __len__(self): + return _api_internal._LayoutNdim(self) + + def __contains__(self, axis): + return len(axis) == 1 and axis[0].isalpha() and axis[0] in self.name + + def __getitem__(self, index): + if index >= len(self): + raise IndexError("Layout index out of range") + return _api_internal._LayoutGetItem(self, index) + + def index_of(self, axis): + """Get the index of an axis + + Parameters + ---------- + axis : str + The axis name, need to be [a-z,A-Z] + + Returns + ------- + index : int + The index of the axis, -1 if not found. 
+ """ + return _api_internal._LayoutIndexOf(self, axis) + + def factor_of(self, axis): + """Get the factor size of the subordinate axis. + + Parameters + ---------- + axis : str + The axis name, need to be [a-z,A-Z] + + Returns + ------- + factor : int + the size of the subordinate-axis of axis (if axis is a primal-axis), + or the size of axis itself (if axis is a subordinate-axis). + Return -1 if axis is not in the layout. + """ + return _api_internal._LayoutFactorOf(self, axis) + + +@register_node +class BijectiveLayout(NodeBase): + """Bijective mapping for two layouts (src-layout and dst-layout). + It provides shape and index conversion between each other. + + Do not construct directly, use :any:`bijective_layout` instead. + See the documentation of :any:`bijective_layout` for more details. + + See Also + -------- + bijective_layout : Declare a bijective layout converter + """ + def forward_index(self, index): + """Given the indices of the src-layout, infer the dst index. + + Parameters + ---------- + index: Array of Expr + The indices in src-layout. + + Returns + ------- + dst_index: Array of Expr + The inferred indices in dst-layout. + """ + return _api_internal._BijectiveLayoutForwardIndex(self, index) + + def backward_index(self, index): + """Given the indices of the dst-layout, infer the src index. + + Parameters + ---------- + index: Array of Expr + The indices in dst-layout. + + Returns + ------- + src_index: Array of Expr + The inferred indices in src-layout. + """ + return _api_internal._BijectiveLayoutBackwardIndex(self, index) + + def forward_shape(self, shape): + """Given the shape of the src-layout, infer the dst shape. + + Parameters + ---------- + shape: Array of Expr + The shape in src-layout. + + Returns + ------- + dst_shape: Array of Expr + The inferred shape in dst-layout. + """ + return _api_internal._BijectiveLayoutForwardShape(self, shape) + + def backward_shape(self, shape): + """Given the shape of the dst-layout, infer the src shape. 
+ + Parameters + ---------- + shape: Array of Expr + The shape in dst-layout. + + Returns + ------- + src_shape: Array of Expr + The inferred shape in src-layout. + """ + return _api_internal._BijectiveLayoutBackwardShape(self, shape) diff --git a/src/api/api_lang.cc b/src/api/api_lang.cc index e30111e938bd..50f81644b0b5 100644 --- a/src/api/api_lang.cc +++ b/src/api/api_lang.cc @@ -11,6 +11,7 @@ #include #include #include +#include namespace tvm { @@ -224,6 +225,63 @@ TVM_REGISTER_API("_BufferVStore") .vstore(args[1], args[2]); }); +TVM_REGISTER_API("_Layout") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = LayoutNode::make(args[0]); + }); + +TVM_REGISTER_API("_LayoutIndexOf") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator Layout() + .IndexOf(LayoutAxis::make(args[1])); +}); + +TVM_REGISTER_API("_LayoutFactorOf") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator Layout() + .FactorOf(LayoutAxis::make(args[1])); +}); + +TVM_REGISTER_API("_LayoutNdim") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = static_cast(args[0].operator Layout().ndim()); +}); + +TVM_REGISTER_API("_LayoutGetItem") +.set_body([](TVMArgs args, TVMRetValue* ret) { + const LayoutAxis& axis = args[0].operator Layout()[args[1]]; + *ret = axis.name(); +}); + +TVM_REGISTER_API("_BijectiveLayout") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = BijectiveLayoutNode::make(args[0], args[1]); + }); + +TVM_REGISTER_API("_BijectiveLayoutForwardIndex") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator BijectiveLayout() + .ForwardIndex(args[1]); + }); + +TVM_REGISTER_API("_BijectiveLayoutBackwardIndex") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator BijectiveLayout() + .BackwardIndex(args[1]); + }); + +TVM_REGISTER_API("_BijectiveLayoutForwardShape") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator BijectiveLayout() + .ForwardShape(args[1]); + }); + 
+TVM_REGISTER_API("_BijectiveLayoutBackwardShape") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = args[0].operator BijectiveLayout() + .BackwardShape(args[1]); + }); + TVM_REGISTER_API("_Tensor") .set_body([](TVMArgs args, TVMRetValue* ret) { *ret = TensorNode::make(args[0], diff --git a/src/lang/data_layout.cc b/src/lang/data_layout.cc new file mode 100644 index 000000000000..900a58029901 --- /dev/null +++ b/src/lang/data_layout.cc @@ -0,0 +1,322 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file src/lang/data_layout.cc + * \brief Data Layout expression. + */ +#include +#include + +namespace tvm { + +TVM_REGISTER_NODE_TYPE(LayoutNode); +TVM_REGISTER_NODE_TYPE(BijectiveLayoutNode); + +const LayoutAxis LayoutAxis::UPPER_CASE[] = { + LayoutAxis('A'), LayoutAxis('B'), LayoutAxis('C'), LayoutAxis('D'), LayoutAxis('E'), + LayoutAxis('F'), LayoutAxis('G'), LayoutAxis('H'), LayoutAxis('I'), LayoutAxis('J'), + LayoutAxis('K'), LayoutAxis('L'), LayoutAxis('M'), LayoutAxis('N'), LayoutAxis('O'), + LayoutAxis('P'), LayoutAxis('Q'), LayoutAxis('R'), LayoutAxis('S'), LayoutAxis('T'), + LayoutAxis('U'), LayoutAxis('V'), LayoutAxis('W'), LayoutAxis('X'), LayoutAxis('Y'), + LayoutAxis('Z') +}; + +const LayoutAxis LayoutAxis::LOWER_CASE[] = { + LayoutAxis('a'), LayoutAxis('b'), LayoutAxis('c'), LayoutAxis('d'), LayoutAxis('e'), + LayoutAxis('f'), LayoutAxis('g'), LayoutAxis('h'), LayoutAxis('i'), LayoutAxis('j'), + LayoutAxis('k'), LayoutAxis('l'), LayoutAxis('m'), LayoutAxis('n'), LayoutAxis('o'), + LayoutAxis('p'), LayoutAxis('q'), LayoutAxis('r'), LayoutAxis('s'), LayoutAxis('t'), + LayoutAxis('u'), LayoutAxis('v'), LayoutAxis('w'), LayoutAxis('x'), LayoutAxis('y'), + LayoutAxis('z') +}; + +const LayoutAxis& LayoutAxis::Get(const char name) { + CHECK((name >= 'A' && name <= 'Z') || (name >= 'a' && name <= 'z')) + << "Invalid layout axis name: " << name << ". Has to be A-Z or a-z."; + return (name >= 'A' && name <= 'Z') ? 
+ LayoutAxis::UPPER_CASE[name-'A'] : + LayoutAxis::LOWER_CASE[name-'a']; +} + +const LayoutAxis& LayoutAxis::Get(const IterVar& itvar) { + const std::string axis = itvar->var.get()->name_hint; + CHECK_EQ(axis.size(), 1) << "Invalid layout axis " << axis; + return LayoutAxis::Get(axis[0]); +} + +const LayoutAxis& LayoutAxis::make(const std::string& name) { + CHECK_EQ(name.length(), 1) << "Invalid axis " << name; + return LayoutAxis::Get(name[0]); +} + +Layout::Layout(const Array& axes) { + node_ = make_node(); + LayoutNode *node = operator->(); + node->axes = axes; + std::ostringstream repr; + for (const IterVar& axis : axes) { + if (const auto* factor = axis->dom->extent.as()) { + CHECK_GT(factor->value, 0); + repr << factor->value; + } + CHECK_EQ(axis->var.get()->name_hint.size(), 1) << "Invalid layout axis " + << axis->var.get()->name_hint; + char c = axis->var.get()->name_hint[0]; + CHECK((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) << "Invalid layout axis " << c; + repr << axis->var.get()->name_hint; + } + node->name = repr.str(); +} + +Layout::Layout(const std::string& name) { // NOLINT(*) + if (name.empty() || name == "__undef__") return; + + node_ = make_node(); + LayoutNode *node = operator->(); + node->name = name; + + // parse layout string + int32_t factor = 0; + for (char c : name) { + if (c >= 'A' && c <= 'Z') { + CHECK_EQ(factor, 0) << "Invalid layout " << name + << ": invalid factor size " << factor + << " before dimension " << c; + std::string shape_name("_shape"); + shape_name.insert(0, 1, c); + IterVar axis = IterVarNode::make(Range(Expr(0), Var(shape_name)), + Var(std::string(1, c)), kDataPar); + node->axes.push_back(axis); + } else if (c >= 'a' && c <= 'z') { + CHECK_GT(factor, 0) << "Invalid layout " << name << ": invalid factor size " + << factor << " for dimension " << c; + IterVar axis = IterVarNode::make(Range(Expr(0), Expr(factor)), + Var(std::string(1, c)), kDataPar); + node->axes.push_back(axis); + factor = 0; + } else if (c >= '0' 
&& c <= '9') { + CHECK(factor >= 0) << "Invalid layout " << name << ": _ is adjacent to a number."; + factor = factor * 10 + c - '0'; + } else { + LOG(FATAL) << "Invalid layout " << name; + } + } + + // validate layout + std::vector exist_axis(256, false); + for (const IterVar& v : node->axes) { + auto axis_str = v->var.get()->name_hint; + CHECK_EQ(axis_str.size(), 1); + char axis = axis_str[0]; + CHECK((axis >= 'a' && axis <= 'z') || (axis >= 'A' && axis <= 'Z')); + CHECK(!exist_axis[axis]) << "Invalid layout " << name << ": duplicate axis " << axis; + exist_axis[axis] = true; + } + for (const IterVar& v : node->axes) { + char axis = v->var.get()->name_hint[0]; + if (axis >= 'a' && axis <= 'z') { + CHECK(exist_axis[axis-'a'+'A']) << "Invalid layout " << name << ": missing axis " + << axis - 'a' + 'A'; + } + } +} + +Layout LayoutNode::make(const std::string& layout) { + return Layout(layout); +} + +Layout Layout::SubLayout(size_t pos, size_t len) const { + if (!defined() || pos > ndim()) return Layout::Undef(); + if (pos + len > ndim()) len = ndim() - pos; + Array new_layout; + const auto axes = operator->()->axes; + for (size_t i = pos; i < pos + len; ++i) { + new_layout.push_back(axes[i]); + } + return Layout(new_layout); +} + +Layout Layout::Split(const LayoutAxis &axis, size_t target_pos, int32_t factor) const { + if (!defined()) return Layout::Undef(); + const std::string& name = operator->()->name; + const auto axes = operator->()->axes; + CHECK(target_pos <= this->ndim()) << "Invalid split position " + << target_pos << " for layout " << name; + CHECK(axis.IsPrimal()) << "Cannot split a subordinate axis " << axis; + CHECK(this->Contains(axis)) << "Axis " << axis << " does not exist in " << name; + CHECK(!this->Contains(axis.ToSubordinate())) << "Axis " << axis + << " has already been split in " << name; + CHECK(factor > 0) << "Invalid split size " << factor; + Array new_layout; + for (size_t i = 0; i <= this->ndim(); ++i) { + if (i == target_pos) { + 
new_layout.push_back(IterVarNode::make(Range(Expr(0), Expr(factor)), + Var(axis.ToSubordinate().name()), kDataPar)); + } + if (i == this->ndim()) break; + new_layout.push_back(axes[i]); + } + return Layout(new_layout); +} + +int32_t Layout::FactorOf(const LayoutAxis& axis) const { + if (!defined()) return -1; + const LayoutAxis& sub = axis.ToSubordinate(); + if (!this->defined()) return -1; + for (const IterVar& itvar : operator->()->axes) { + if (sub == LayoutAxis::Get(itvar)) { + const auto* factor = itvar->dom->extent.as(); + CHECK(factor); + return factor->value; + } + } + return -1; +} + +inline bool GetStoreRule(Array* rule, + const Layout& src_layout, + const Layout& dst_layout) { + for (size_t i = 0; i < dst_layout.ndim(); ++i) { + const auto& store_axis = dst_layout[i]; + const IterVar& store_axis_impl = dst_layout->axes[i]; + Expr store(0); + + for (size_t j = 0; j < src_layout.ndim(); ++j) { + const auto& orig_axis = src_layout[j]; + const IterVar& orig_axis_impl = src_layout->axes[j]; + if (store_axis.ToPrimal() == orig_axis.ToPrimal()) { + if (orig_axis.IsPrimal()) { + Expr orig_var = orig_axis_impl->var; + const int32_t factor = src_layout.FactorOf(orig_axis); + if (factor > 0) { + orig_var = orig_var * Expr(factor); + } + store = store + orig_var; + } else { + store = store + orig_axis_impl->var; + } + } + } + if (is_zero(store)) { + // Not convertible + return false; + } + + if (store_axis.IsPrimal()) { + const int32_t factor = dst_layout.FactorOf(store_axis); + if (factor > 0) { + store = store / Expr(factor); + } + } else { + store = store % store_axis_impl->dom->extent; + } + + rule->push_back(store); + } + return true; +} + +inline Array TransformIndex(const Array& src_index, + const Array& src_axis, + const Array& transform_rule) { + Array result; + std::unordered_map bind_map; + for (size_t i = 0; i < src_index.size(); ++i) { + bind_map[src_axis[i]->var.get()] = src_index[i]; + } + for (Expr rule : transform_rule) { + 
result.push_back(ir::Simplify(ir::Substitute(rule, bind_map))); + } + return result; +} + +Array BijectiveLayout::ForwardIndex(const Array& src_index) const { + CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + const BijectiveLayoutNode* self = operator->(); + CHECK_EQ(src_index.size(), self->src_layout->axes.size()) + << "Input mismatch with layout " << self->src_layout; + return TransformIndex(src_index, self->src_layout->axes, self->forward_rule); +} + + +Array BijectiveLayout::BackwardIndex(const Array& dst_index) const { + CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + const BijectiveLayoutNode* self = operator->(); + CHECK_EQ(dst_index.size(), self->dst_layout->axes.size()) + << "Output mismatch with layout " << self->dst_layout; + return TransformIndex(dst_index, self->dst_layout->axes, self->backward_rule); +} + +inline Array TransformShape(const Array& src_shape, + const Array& src_axis, + const Array& target_axis, + const Array& transform_rule) { + CHECK_EQ(src_shape.size(), src_axis.size()); + // bind variables for original axes + // for major-axis, bind the corresponding size + // for minor-axis, simply bind it as 0, so that we can reuse forward/backward_rule, + // e.g., (C * 16 + c) / 32 + std::unordered_map bind_map; + for (size_t i = 0; i < src_shape.size(); ++i) { + Expr orig_shape = src_shape[i]; + IterVar orig_axis = src_axis[i]; + if (!LayoutAxis::Get(orig_axis).IsPrimal()) { + if (orig_shape.defined()) { + const auto* orig_shape_const = orig_shape.as(); + const auto* orig_axis_extent = orig_axis->dom->extent.as(); + CHECK_EQ(orig_shape_const->value, orig_axis_extent->value) + << "Input shape mismatch at index " << i << ". 
Expected " + << orig_axis->dom->extent << ", get " << orig_shape; + } + bind_map[orig_axis->var.get()] = Expr(0); + } else { + bind_map[orig_axis->var.get()] = orig_shape; + } + } + // infer the target shape, + // for major-axis, use the forward/backward_rule directly, + // for minor-axis, simply use the extent. + Array result; + CHECK_EQ(transform_rule.size(), target_axis.size()); + for (size_t i = 0; i < transform_rule.size(); ++i) { + Expr rule = transform_rule[i]; + IterVar axis = target_axis[i]; + if (!LayoutAxis::Get(axis).IsPrimal()) { + result.push_back(axis->dom->extent); + } else { + result.push_back(ir::Simplify(ir::Substitute(rule, bind_map))); + } + } + return result; +} + +Array BijectiveLayout::ForwardShape(const Array& shape) const { + CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + const BijectiveLayoutNode* self = operator->(); + return TransformShape(shape, self->src_layout->axes, + self->dst_layout->axes, self->forward_rule); +} + +Array BijectiveLayout::BackwardShape(const Array& shape) const { + CHECK(defined()) << "Cannot operate on an undefined bijective layout."; + const BijectiveLayoutNode* self = operator->(); + return TransformShape(shape, self->dst_layout->axes, + self->src_layout->axes, self->backward_rule); +} + +BijectiveLayout BijectiveLayoutNode::make(const Layout& src_layout, + const Layout& dst_layout) { + auto n = make_node(); + + n->src_layout = src_layout; + n->dst_layout = dst_layout; + + if (!GetStoreRule(&n->forward_rule, n->src_layout, n->dst_layout)) { + // not convertible + return BijectiveLayout(); + } + CHECK(GetStoreRule(&n->backward_rule, n->dst_layout, n->src_layout)); + + return BijectiveLayout(n); +} + +} // namespace tvm diff --git a/src/relay/op/debug.cc b/src/relay/op/debug.cc index 4c9b0a5ca83e..4a5a7a86f1ea 100644 --- a/src/relay/op/debug.cc +++ b/src/relay/op/debug.cc @@ -4,13 +4,13 @@ * \brief Property def of nn operators. 
*/ +#include #include #include #include #include #include "./type_relations.h" #include "./op_common.h" -#include "./layout.h" namespace tvm { namespace relay { diff --git a/src/relay/op/image/resize.cc b/src/relay/op/image/resize.cc index e6efcb8ce459..d92e380fa9cc 100644 --- a/src/relay/op/image/resize.cc +++ b/src/relay/op/image/resize.cc @@ -3,11 +3,11 @@ * \file resize.cc * \brief Image operators */ +#include #include #include #include #include -#include "../layout.h" #include "../op_common.h" namespace tvm { @@ -28,17 +28,18 @@ bool ResizeRel(const Array& types, const ResizeAttrs* param = attrs.as(); CHECK(param != nullptr); const Layout in_layout(param->layout); - CHECK(in_layout.Convertible(kNCHW)) + auto layout_converter = BijectiveLayoutNode::make(in_layout, kNCHW); + CHECK(layout_converter.defined()) << "Resize only support input layouts that are convertible from NCHW." << " But got " << in_layout; - auto oshape = ConvertLayout(data->shape, in_layout, kNCHW); - oshape[2] = param->size[0]; - oshape[3] = param->size[1]; + auto oshape = layout_converter.ForwardShape(data->shape); + oshape.Set(2, param->size[0]); + oshape.Set(3, param->size[1]); // assign output type reporter->Assign(types[1], - TensorTypeNode::make(ConvertLayout(oshape, kNCHW, in_layout), + TensorTypeNode::make(layout_converter.BackwardShape(oshape), data->dtype)); return true; } diff --git a/src/relay/op/layout.cc b/src/relay/op/layout.cc deleted file mode 100644 index 98fea55aa4c1..000000000000 --- a/src/relay/op/layout.cc +++ /dev/null @@ -1,80 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \file src/relay/op/layout.cc - * \brief Layout expression. 
- */ - -#include "layout.h" - -namespace tvm { -namespace relay { - -TVM_REGISTER_NODE_TYPE(LayoutNode); - -std::vector ConvertLayout( - std::vector src, - const Layout& src_layout, - const Layout& dst_layout) { - CHECK_EQ(src_layout.ndim(), src.size()); - if (src_layout == dst_layout) { - return src; - } else if (!src_layout.defined()) { - LOG(FATAL) << "cannot convert undefined layout to " << dst_layout; - } else if (!dst_layout.defined()) { - LOG(FATAL) << "cannot convert " << src_layout << " to undefined layout"; - } - - CHECK(src_layout.Convertible(dst_layout)) - << "cannot convert from " - << src_layout << " to " << dst_layout; - - std::vector dst(dst_layout.ndim()); - for (size_t i = 0; i < src_layout.ndim(); ++i) { - Layout::LayoutDim src_dim = src_layout[i]; - if (Layout::IsSuperdim(src_dim)) { - int dst_major_pos = dst_layout.Indexof(Layout::ToSuperdim(src_dim)); - int dst_minor_pos = dst_layout.Indexof(Layout::ToSubdim(src_dim)); - int src_minor_pos = src_layout.Indexof(Layout::ToSubdim(src_dim)); - int src_factor = src_layout.Subsizeof(src_dim); - int dst_factor = dst_layout.Subsizeof(src_dim); - IndexExpr src_dim_size = src[i]; - - if (src_minor_pos >= 0) { - CHECK(is_const_int(src[src_minor_pos], src_factor)) - << "src shape " << Array(src) - << " does not agree with layout " - << src_layout; - src_dim_size *= src_factor; - } - dst[dst_major_pos] = src_dim_size; - if (dst_minor_pos >= 0) { - CHECK_GT(dst_factor, 0); - if (const int64_t* const_src_dim_size = as_const_int(src_dim_size)) { - CHECK_LE(dst_factor, const_src_dim_size[0]) - << "Converting " << Array(src) - << " from " << src_layout - << " to " << dst_layout - << ": cannot split dimension size of " - << src_dim_size << " by " << dst_factor; - } - dst[dst_major_pos] /= dst_factor; - dst[dst_minor_pos] = dst_factor; - } - } - } - return dst; -} - -std::vector ConvertLayout( - const Array& src, - const Layout& src_layout, - const Layout& dst_layout) { - std::vector ret(src.size()); - for (size_t 
i = 0; i < src.size(); ++i) { - ret[i] = src[i]; - } - return ConvertLayout(ret, src_layout, dst_layout); -} - -} // namespace relay -} // namespace tvm diff --git a/src/relay/op/layout.h b/src/relay/op/layout.h deleted file mode 100644 index 09cf3a9cf780..000000000000 --- a/src/relay/op/layout.h +++ /dev/null @@ -1,432 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \file relay/op/layout.h - * \brief Layout expression. - * - * This file is adapted from its nnvm counterpart and will keep involving - * to the new layout system - * - * The layout is composed of upper cases, lower cases and numbers, - * where upper case indicates a (super-)dimension and - * the corresponding lower case with factor size indicates the split (sub-)dimension. - * For example, NCHW16c can describe a 5-D tensor of - * [batch_size, channel, height, width, channel_block]. - * Here sub-dimension channel_block=16 is the split of super-dimension C (channel). - */ -#ifndef TVM_RELAY_OP_LAYOUT_H_ -#define TVM_RELAY_OP_LAYOUT_H_ - -#include -#include -#include - -#include -#include -#include -#include -#include - -namespace tvm { -namespace relay { - -class LayoutNode : public Node { - public: - std::string name; - Array superdim_pos; - Array subdim_pos; - Array subdim_size; - Array layout_simplified; - - void VisitAttrs(AttrVisitor* v) final { - v->Visit("name", &name); - v->Visit("superdim_pos", &superdim_pos); - v->Visit("subdim_pos", &subdim_pos); - v->Visit("subdim_size", &subdim_size); - v->Visit("layout_simplified", &layout_simplified); - } - - static constexpr const char* _type_key = "Layout"; - TVM_DECLARE_NODE_TYPE_INFO(LayoutNode, Node); -}; - -class Layout : public NodeRef { - public: - using LayoutDim = char; - static constexpr uint32_t kUniqueDim = 26; - - explicit Layout(NodePtr n) : NodeRef(n) {} - - /*! \brief default constructor */ - Layout() : Layout("__undef__") {} // NOLINT(*) - - /*! 
\brief construct from a string */ - Layout(const char* name) : Layout(std::string(name)) {} // NOLINT(*) - - /*! - * \brief construct from a string. - * \param layout input in layout convention: - * upper case indicates a dimension and - * the corresponding lower case with factor size - * indicates the split dimension. - * return undefined layout if "__undef__" is passed. - */ - Layout(const std::string& name) { // NOLINT(*) - node_ = make_node(); - - std::vector superdim_pos(kUniqueDim, -1); - std::vector subdim_pos(kUniqueDim, -1); - std::vector subdim_size(kUniqueDim, -1); - std::vector layout_simplified; - - if (name != "__undef__") { // parse layout string - int32_t factor = 0; - uint32_t curr = 0; - for (size_t i = 0; i < name.size(); ++i) { - const LayoutDim c = name.at(i); - if (IsSuperdim(c)) { - int pos = c - 'A'; - CHECK_EQ(factor, 0) << "Invalid layout " << name - << ": invalid factor size " << factor - << " before dimension " << c; - CHECK_EQ(superdim_pos[pos], -1) << "Invalid layout " << name - << ": duplicate dimension " << c; - superdim_pos[pos] = curr++; - layout_simplified.push_back(c); - } else if (IsSubdim(c)) { - int pos = c - 'a'; - CHECK_GT(factor, 0) << "Invalid layout " << name << ": invalid factor size " - << factor << " for dimension " << c; - CHECK_EQ(subdim_pos[pos], -1) << "Invalid layout " << name - << ": duplicate dimension " << c; - CHECK_EQ(subdim_size[pos], -1) << "Invalid layout " << name - << ": duplicate dimension " << c; - subdim_pos[pos] = curr++; - subdim_size[pos] = factor; - layout_simplified.push_back(c); - factor = 0; - } else if (c >= '0' && c <= '9') { - CHECK(factor >= 0) << "Invalid layout " << name << ": _ is adjacent to a number."; - factor = factor * 10 + c - '0'; - } else { - LOG(FATAL) << "Invalid layout " << name; - } - } - for (LayoutDim dim : layout_simplified) { - CHECK(IsSuperdim(dim) || superdim_pos[dim-'a'] >= 0) - << "Invalid layout " << name << ": missing axis " - << static_cast(dim - 'a' + 'A'); - } - 
} - - LayoutNode *node = operator->(); - node->name = name; - - for (uint32_t i = 0; i < kUniqueDim; ++i) { - node->superdim_pos.push_back(superdim_pos[i]); - node->subdim_pos.push_back(subdim_pos[i]); - node->subdim_size.push_back(subdim_size[i]); - } - for (LayoutDim dim : layout_simplified) { - node->layout_simplified.push_back(dim); - } - } - - /*! - * \brief access the internal node container - * \return the pointer to the internal node container - */ - const LayoutNode* operator->() const { - return static_cast(node_.get()); - } - - /*! - * \brief access the internal node container - * \return the pointer to the internal node container - */ - LayoutNode* operator->() { - return static_cast(node_.get()); - } - - /*! - * \brief Check whether a given dimension is a super-dimension. - * \param dim input dimension - * \return Whether a given dimension is a super-dimension. - */ - static bool IsSuperdim(LayoutDim dim) { - return dim >= 'A' && dim <= 'Z'; - } - - /*! - * \brief Check whether a given dimension is a sub-dimension. - * \param dim input dimension - * \return Whether a given dimension is a sub-dimension. - */ - static bool IsSubdim(LayoutDim dim) { - return dim >= 'a' && dim <= 'z'; - } - - /*! - * \brief Convert a given dimension to super-dimension. - * \param dim input dimension - * \return The converted description. - */ - static LayoutDim ToSuperdim(LayoutDim dim) { - if (IsSubdim(dim)) { - return dim - 'a' + 'A'; - } - return dim; - } - - /*! - * \brief Convert a given dimension to sub-dimension. - * \param dim input dimension - * \return The converted description. - */ - static LayoutDim ToSubdim(LayoutDim dim) { - if (IsSuperdim(dim)) { - return dim - 'A' + 'a'; - } - return dim; - } - - /*! - * \brief Return an undefined layout. - * \return a (global) undefined layout. - */ - static const Layout& Undef() { - static Layout undef; - return undef; - } - - /*! - * \brief Two layouts are convertible only if - * they have same set of super-dimensions. 
- * e.g., NCHW, NCHW16c, NHWC are convertible between each other, - * but NCHW, CHW, OIHW are not. - * \param dst the target layout - * \return Whether can be converted to dst layout. - */ - bool Convertible(const Layout &dst) const { - const LayoutNode *n = operator->(); - if (!this->defined() || !dst.defined()) return false; - for (size_t i = 0; i < kUniqueDim; ++i) { - if ((n->superdim_pos[i]->value >= 0 && dst->superdim_pos[i]->value < 0) || - (n->superdim_pos[i]->value < 0 && dst->superdim_pos[i]->value >= 0)) { - return false; - } - } - return true; - } - - /*! - * \brief Returns a sublayout which is the portion of the object - * that starts at dimension \p pos and spans \p len dimensions - * (or until the end of the layout, whichever comes first). - * \param pos The start position. - * \param len The length of the sub-layout. - * \return A newly constructed Layout object. - */ - Layout Sublayout(size_t pos, size_t len) const { - const Array& layout_simplified = operator->()->layout_simplified; - if (pos > ndim()) return Layout::Undef(); - if (pos + len > ndim()) len = ndim() - pos; - std::ostringstream new_layout; - for (size_t i = pos; i < pos + len; ++i) { - if (IsSubdim(layout_simplified[i]->value)) { - auto block_size = this->Subsizeof(layout_simplified[i]->value); - CHECK_GT(block_size, 0); - new_layout << block_size; - } - new_layout << static_cast(layout_simplified[i]->value); - } - return Layout(new_layout.str()); - } - - /*! \return A newly constructed reversed Layout object. 
*/ - Layout Reverse() const { - const Array& layout_simplified = operator->()->layout_simplified; - if (!this->defined()) return Layout::Undef(); - std::ostringstream new_layout; - for (int64_t i = this->ndim() - 1; i >= 0; --i) { - if (IsSubdim(layout_simplified[i]->value)) { - auto block_size = this->Subsizeof(layout_simplified[i]->value); - CHECK_GT(block_size, 0); - new_layout << block_size; - } - new_layout << layout_simplified[i]->value; - } - return Layout(new_layout.str()); - } - - /*! - * \brief Split \p dim by \p size and put the sub-dimension to position \p target_pos. - * \param dim The source dimension to be split. It must be a super-dimension. - * \param target_pos The target position of the newly split sub-dimension. - * \param size size of the sub-dimension. - * \return A newly constructed Layout object. - */ - Layout Split(LayoutDim dim, size_t target_pos, uint32_t size) const { - const std::string &name = operator->()->name; - CHECK(target_pos <= this->ndim()) << "Invalid split position " - << target_pos << " for layout " << name; - CHECK(IsSuperdim(dim)) << "Cannot split a sub-dimension " << dim; - CHECK(this->Contains(dim)) << "Axis " << dim << " does not exist in " << name; - CHECK(!this->Contains(ToSubdim(dim))) << "Dimension " << dim - << " has already been split in " - << name; - CHECK(size > 0) << "Invalid split size " << size; - std::ostringstream new_layout; - for (size_t i = 0; i <= this->ndim(); ++i) { - if (i == target_pos) { - new_layout << size << Layout::ToSubdim(dim); - } - if (i == this->ndim()) break; - new_layout << this->at(i); - } - Layout x(new_layout.str()); - return x; - } - - - /*! \return number of dimensions */ - size_t ndim() const { - return operator->()->layout_simplified.size(); - } - - /*! \return number of super dimensions */ - size_t ndim_super() const { - size_t ct = 0; - for (auto x : operator->()->layout_simplified) { - if (IsSuperdim(x)) - ct++; - } - return ct; - } - - /*! 
- * \brief The description of the \p i-th dimension. - * If it is a sub-dimension, the size will be returned as well, - * e.g., 16c. Otherwise a single character is returned, e.g., C. - * \param i The position - * \return the description of the dimension. - */ - std::string at(size_t i) const { - const Array& layout_simplified = operator->()->layout_simplified; - CHECK_LT(i, this->ndim()) << "position " << i - << " exceeds ndim=" << this->ndim(); - std::ostringstream repr; - if (IsSubdim(layout_simplified[i]->value)) { - auto factor = Subsizeof(layout_simplified[i]->value); - CHECK_GT(factor, 0); - repr << factor; - } - repr << static_cast(layout_simplified[i]->value); - return repr.str(); - } - - /*! - * \brief return the index of the input dimension. - * If it is not found in the layout or the layout is undefined, - * return -1. - * \param dim the input dimension. - * \return the index or -1 if not found. - */ - int32_t Indexof(LayoutDim dim) const { - if (!this->defined()) return -1; - else if (IsSuperdim(dim)) return operator->()->superdim_pos[dim - 'A']->value; - else if (IsSubdim(dim)) return operator->()->subdim_pos[dim - 'a']->value; - return -1; - } - - /*! - * \param dim the input super-dimension or sub-dimension. - * \return the size of the sub-dimension of \p dim (if \p dim is a super-dimension), - * or the size of \p dim itself (if \p dim is a sub-dimension). - * Return -1 if \p dim is not in the layout or the layout is undefined. - */ - int64_t Subsizeof(LayoutDim dim) const { - CHECK(IsSuperdim(dim) || IsSubdim(dim)) << "Invalid dim " << dim; - if (!this->defined() || !this->Contains(ToSubdim(dim))) { - return -1; - } - int idx = ToSubdim(dim) - 'a'; - return operator->()->subdim_size[idx]->value; - } - - /*! - * \brief Whether the layout contains a dimension. - * \param dim dimension to be checked. - * \return Whether the layout contains the dimension. 
- */ - bool Contains(LayoutDim dim) const { - if (IsSuperdim(dim)) { - return operator->()->superdim_pos[dim-'A']->value >= 0; - } else if (IsSubdim(dim)) { - return operator->()->subdim_pos[dim-'a']->value >= 0; - } - return false; - } - - LayoutDim operator[](size_t i) const { - return operator->()->layout_simplified[i]; - } - - /*! \return whether the layout is defined */ - bool defined() const { - return operator->()->name != "__undef__"; - } - /*! \return the string description of the layout */ - const std::string& name() const { - return operator->()->name; - } - - /*! - * \brief Whether the two layouts are equal. - * \param rhs Another layout. - * \return whether the two layouts are equal. - */ - bool Equals(const Layout &rhs) const { - return operator->()->name == rhs->name; - } - - /*! - * \brief allow output string of layout to ostream - * \param os the output stream - * \param l the layout - * \return the ostream - */ - friend std::ostream& operator<<(std::ostream& os, const Layout& l) { - os << l.name(); - return os; - } - - using ContainerType = LayoutNode; -}; - -/*! - * \brief Convert shape in src_layout to shape in dst_layout - * \param src original shape - * \param src_layout layout of original shape - * \param dst_layout target layout - * \return shape in target layout - */ -std::vector ConvertLayout( - std::vector src, - const Layout& src_layout, - const Layout& dst_layout); - -/*! 
- * \brief Convert shape in src_layout to shape in dst_layout - * \param src original shape - * \param src_layout layout of original shape - * \param dst_layout target layout - * \return shape in target layout - */ -std::vector ConvertLayout( - const Array& src, - const Layout& src_layout, - const Layout& dst_layout); -} // namespace relay -} // namespace tvm - -#endif // TVM_RELAY_OP_LAYOUT_H_ diff --git a/src/relay/op/nn/convolution.cc b/src/relay/op/nn/convolution.cc index e05b24d967bc..963257a14961 100644 --- a/src/relay/op/nn/convolution.cc +++ b/src/relay/op/nn/convolution.cc @@ -3,12 +3,12 @@ * \file convolution.cc * \brief Convolution operators */ +#include #include #include #include #include "../../pass/alter_op_layout.h" -#include "../layout.h" namespace tvm { namespace relay { @@ -31,32 +31,36 @@ bool Conv2DRel(const Array& types, CHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); - CHECK(in_layout.Convertible(kNCHW)) + + const auto trans_in_layout = BijectiveLayoutNode::make(in_layout, kNCHW); + CHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCHW." << " But got " << in_layout; - CHECK(kernel_layout.Convertible(kOIHW)) + + const auto trans_kernel_layout = BijectiveLayoutNode::make(kernel_layout, kOIHW); + CHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIHW." << " But got "<< kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - CHECK(out_layout.Convertible(kNCHW)) + const auto trans_out_layout = BijectiveLayoutNode::make(out_layout, kNCHW); + CHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCHW." 
<< " But got " << out_layout; - std::vector dshape_nchw = ConvertLayout( - data->shape, in_layout, kNCHW); + Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); IndexExpr channels, dilated_ksize_y, dilated_ksize_x; // infer weight if the kernel_size and channels are defined if (param->kernel_size.defined() && param->channels.defined()) { CHECK_EQ(param->kernel_size.size(), 2); CHECK_EQ(param->dilation.size(), 2); - std::vector wshape( + Array wshape( {param->channels, dshape_nchw[1] / param->groups, param->kernel_size[0], param->kernel_size[1]}); - wshape = ConvertLayout(wshape, kOIHW, kernel_layout); + wshape = trans_kernel_layout.BackwardShape(wshape); channels = param->channels; dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; @@ -65,7 +69,7 @@ bool Conv2DRel(const Array& types, } else { // use weight to infer the conv shape. if (weight == nullptr) return false; - auto wshape = ConvertLayout(weight->shape, kernel_layout, kOIHW); + auto wshape = trans_kernel_layout.ForwardShape(weight->shape); if (param->kernel_size.defined()) { CHECK_EQ(param->kernel_size.size(), 2); // check the size @@ -73,13 +77,13 @@ bool Conv2DRel(const Array& types, reporter->AssertEQ(param->kernel_size[1], wshape[3])) << "Conv2D: shape of weight is inconsistent with kernel_size, " << " kernel_size=" << param->kernel_size - << " wshape=" << Array(wshape); + << " wshape=" << wshape; } if (param->channels.defined()) { CHECK(reporter->AssertEQ(param->channels, wshape[0])) << "Conv2D: shape of weight is inconsistent with channels, " << " channels=" << param->channels - << " wshape=" << Array(wshape); + << " wshape=" << wshape; } CHECK(reporter->AssertEQ(dshape_nchw[1] / param->groups, wshape[1])); channels = wshape[0]; @@ -87,15 +91,15 @@ bool Conv2DRel(const Array& types, dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1]; } // dilation - std::vector oshape({dshape_nchw[0], 
channels, 0, 0}); + Array oshape({dshape_nchw[0], channels, 0, 0}); - oshape[2] = (dshape_nchw[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1; - oshape[3] = (dshape_nchw[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1; + oshape.Set(2, (dshape_nchw[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1); + oshape.Set(3, (dshape_nchw[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1); DataType out_dtype = param->out_dtype; if (out_dtype.bits() == 0) { out_dtype = data->dtype; } - oshape = ConvertLayout(oshape, kNCHW, out_layout); + oshape = trans_out_layout.BackwardShape(oshape); // assign output type reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype)); return true; @@ -193,33 +197,38 @@ bool Conv2DTransposeRel(const Array& types, CHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); - CHECK(in_layout.Convertible(kNCHW)) + + const auto trans_in_layout = BijectiveLayoutNode::make(in_layout, kNCHW); + CHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCHW." << " But got " << in_layout; - CHECK(kernel_layout.Convertible(kOIHW)) + + const auto trans_kernel_layout = BijectiveLayoutNode::make(kernel_layout, kOIHW); + CHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIHW." << " But got "<< kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - CHECK(out_layout.Convertible(kNCHW)) + const auto trans_out_layout = BijectiveLayoutNode::make(out_layout, kNCHW); + CHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCHW." 
<< " But got " << out_layout; IndexExpr channels, dilated_ksize_y, dilated_ksize_x; - auto dshape_nchw = ConvertLayout(data->shape, in_layout, kNCHW); + auto dshape_nchw = trans_in_layout.ForwardShape(data->shape); // infer weight if the kernel_size and channels are defined if (param->kernel_size.defined() && param->channels.defined()) { CHECK_EQ(param->kernel_size.size(), 2); CHECK_EQ(param->dilation.size(), 2); - std::vector wshape({dshape_nchw[1], - param->channels / param->groups, - param->kernel_size[0], - param->kernel_size[1]}); + Array wshape({dshape_nchw[1], + param->channels / param->groups, + param->kernel_size[0], + param->kernel_size[1]}); - wshape = ConvertLayout(wshape, kOIHW, kernel_layout); + wshape = trans_kernel_layout.BackwardShape(wshape); dilated_ksize_y = 1 + (param->kernel_size[0] - 1) * param->dilation[0]; dilated_ksize_x = 1 + (param->kernel_size[1] - 1) * param->dilation[1]; channels = param->channels; @@ -229,7 +238,7 @@ bool Conv2DTransposeRel(const Array& types, } else { // use weight to infer the conv shape. 
if (weight == nullptr) return false; - auto wshape = ConvertLayout(weight->shape, kernel_layout, kOIHW); + auto wshape = trans_kernel_layout.ForwardShape(weight->shape); if (param->kernel_size.defined()) { CHECK_EQ(param->kernel_size.size(), 2); // check the size @@ -251,17 +260,17 @@ bool Conv2DTransposeRel(const Array& types, dilated_ksize_x = 1 + (wshape[3] - 1) * param->dilation[1]; } // dilation - std::vector oshape({dshape_nchw[0], channels, 0, 0}); - oshape[2] = (param->strides[0] * (dshape_nchw[2] - 1) + dilated_ksize_y - - 2 * param->padding[0] + param->output_padding[0]); - oshape[3] = (param->strides[1] * (dshape_nchw[3] - 1) + dilated_ksize_x - - 2 * param->padding[1] + param->output_padding[1]); + Array oshape({dshape_nchw[0], channels, 0, 0}); + oshape.Set(2, (param->strides[0] * (dshape_nchw[2] - 1) + dilated_ksize_y - + 2 * param->padding[0] + param->output_padding[0])); + oshape.Set(3, (param->strides[1] * (dshape_nchw[3] - 1) + dilated_ksize_x - + 2 * param->padding[1] + param->output_padding[1])); DataType out_dtype = param->out_dtype; if (out_dtype.bits() == 0) { out_dtype = data->dtype; } - oshape = ConvertLayout(oshape, kNCHW, out_layout); + oshape = trans_out_layout.BackwardShape(oshape); reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype)); return true; } @@ -349,20 +358,24 @@ bool Conv2DWinogradRel(const Array& types, CHECK(param != nullptr); const Layout in_layout(param->data_layout); const Layout kernel_layout(param->kernel_layout); - CHECK(in_layout.Convertible(kNCHW)) + + const auto trans_in_layout = BijectiveLayoutNode::make(in_layout, kNCHW); + CHECK(trans_in_layout.defined()) << "Conv only support input layouts that are convertible from NCHW." << " But got " << in_layout; - CHECK(kernel_layout.Convertible(kOIHW)) + + const auto trans_kernel_layout = BijectiveLayoutNode::make(kernel_layout, kOIHW); + CHECK(trans_kernel_layout.defined()) << "Conv only support kernel layouts that are convertible from OIHW." 
<< " But got "<< kernel_layout; Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - CHECK(out_layout.Convertible(kNCHW)) + const auto trans_out_layout = BijectiveLayoutNode::make(out_layout, kNCHW); + CHECK(trans_out_layout.defined()) << "Conv only support output layouts that are convertible from NCHW." << " But got " << out_layout; - std::vector dshape_nchw = ConvertLayout( - data->shape, in_layout, kNCHW); + Array dshape_nchw = trans_in_layout.ForwardShape(data->shape); IndexExpr channels, dilated_ksize_y, dilated_ksize_x; @@ -384,15 +397,15 @@ bool Conv2DWinogradRel(const Array& types, // can handle this correctly in alter_op_layout. // dilation - std::vector oshape({dshape_nchw[0], channels, 0, 0}); + Array oshape({dshape_nchw[0], channels, 0, 0}); - oshape[2] = (dshape_nchw[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1; - oshape[3] = (dshape_nchw[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1; + oshape.Set(2, (dshape_nchw[2] + param->padding[0] * 2 - dilated_ksize_y) / param->strides[0] + 1); + oshape.Set(3, (dshape_nchw[3] + param->padding[1] * 2 - dilated_ksize_x) / param->strides[1] + 1); DataType out_dtype = param->out_dtype; if (out_dtype.bits() == 0) { out_dtype = data->dtype; } - oshape = ConvertLayout(oshape, kNCHW, out_layout); + oshape = trans_out_layout.BackwardShape(oshape); // assign output type reporter->Assign(types[2], TensorTypeNode::make(oshape, out_dtype)); return true; diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 7ed43d0df019..9ab841cf4286 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -4,6 +4,7 @@ * \brief Property def of nn operators. 
*/ +#include #include #include #include @@ -14,7 +15,6 @@ #include "../type_relations.h" #include "../../pass/alter_op_layout.h" #include "../op_common.h" -#include "../layout.h" namespace tvm { namespace relay { diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc index dc99f05f4d2d..c24203cebdb3 100644 --- a/src/relay/op/nn/pad.cc +++ b/src/relay/op/nn/pad.cc @@ -3,12 +3,12 @@ * \file pad.cc * \brief Implementation of operator pad */ +#include #include #include #include #include #include -#include "../layout.h" #include "../op_common.h" namespace tvm { diff --git a/src/relay/op/nn/pooling.cc b/src/relay/op/nn/pooling.cc index 8fd33e1f3cdc..23704693732b 100644 --- a/src/relay/op/nn/pooling.cc +++ b/src/relay/op/nn/pooling.cc @@ -3,12 +3,12 @@ * \file pooling.cc * \brief Pooling operators */ +#include #include #include #include #include #include -#include "../layout.h" #include "../../pass/alter_op_layout.h" namespace tvm { @@ -32,14 +32,15 @@ Array > Pool2DInferCorrectLayout( Layout raw_layout(params->layout); Layout input = new_in_layouts[0]; - if (input.Indexof('W') == raw_layout.Indexof('W') && - input.Indexof('H') == raw_layout.Indexof('H') && - !input.Contains('w') && !input.Contains('h')) { + if (input.IndexOf(LayoutAxis::Get('W')) == raw_layout.IndexOf(LayoutAxis::Get('W')) && + input.IndexOf(LayoutAxis::Get('H')) == raw_layout.IndexOf(LayoutAxis::Get('H')) && + !input.Contains(LayoutAxis::Get('w')) && !input.Contains(LayoutAxis::Get('h'))) { params->layout = input.name(); // modify self to follow the input layout } } - return Array >{{params->layout}, {params->layout}}; + Layout inferred_layout(params->layout); + return Array >{{inferred_layout}, {inferred_layout}}; } template @@ -59,13 +60,13 @@ bool Pool2DRel(const Array& types, CHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Contains('H') && layout.Contains('W') && - !layout.Contains('h') && !layout.Contains('w')) + CHECK(layout.Contains(LayoutAxis::Get('H')) && 
layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout " << layout << ". Pool2D layout must have H and W, which cannot be split"; - const auto hidx = layout.Indexof('H'); - const auto widx = layout.Indexof('W'); + const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); + const auto widx = layout.IndexOf(LayoutAxis::Get('W')); IndexExpr pad_h, pad_w; if (param->padding.size() == 1) { @@ -125,6 +126,7 @@ Array Pool2DCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, const Target& target) { + static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); auto pool_size = param->pool_size; @@ -132,10 +134,13 @@ Array Pool2DCompute(const Attrs& attrs, auto padding = param->padding; auto ceil_mode = param->ceil_mode; Layout layout(param->layout); - CHECK(layout.Convertible(Layout("NCHW"))) + + CHECK(BijectiveLayoutNode::make(layout, kNCHW).defined()) << "max_pool2d currently only supports layouts that are convertible from NCHW"; - CHECK_EQ(layout.Indexof('h'), -1) << "max_pool2d does not support input split on height"; - CHECK_EQ(layout.Indexof('w'), -1) << "max_pool2d does not support input split on width"; + CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) + << "max_pool2d does not support input split on height"; + CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) + << "max_pool2d does not support input split on width"; CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) << "Pool2D only support 4-D input (e.g., NCHW)" @@ -271,13 +276,13 @@ bool GlobalPool2DRel(const Array& types, CHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Contains('H') && layout.Contains('W') && - !layout.Contains('h') && !layout.Contains('w')) + CHECK(layout.Contains(LayoutAxis::Get('H')) && layout.Contains(LayoutAxis::Get('W')) && + !layout.Contains(LayoutAxis::Get('h')) && !layout.Contains(LayoutAxis::Get('w'))) << "Invalid layout 
" << layout << ". Pool2D layout must have H and W, which cannot be split"; - const auto hidx = layout.Indexof('H'); - const auto widx = layout.Indexof('W'); + const auto hidx = layout.IndexOf(LayoutAxis::Get('H')); + const auto widx = layout.IndexOf(LayoutAxis::Get('W')); Array oshape(dshape); oshape.Set(hidx, 1); oshape.Set(widx, 1); @@ -293,14 +298,15 @@ Array GlobalPool2DCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, const Target& target) { + static const Layout kNCHW("NCHW"); const auto* param = attrs.as(); CHECK(param != nullptr); Layout layout(param->layout); - CHECK(layout.Convertible(Layout("NCHW"))) + CHECK(BijectiveLayoutNode::make(layout, kNCHW).defined()) << "global_avg_pool2d currently only supports layouts that are convertible from NCHW"; - CHECK_EQ(layout.Indexof('h'), -1) + CHECK_EQ(layout.IndexOf(LayoutAxis::Get('h')), -1) << "global_avg_pool2d does not support input split on height"; - CHECK_EQ(layout.Indexof('w'), -1) + CHECK_EQ(layout.IndexOf(LayoutAxis::Get('w')), -1) << "global_avg_pool2d does not support input split on width"; CHECK(inputs[0].ndim() == 4U || inputs[0].ndim() == 5U) diff --git a/src/relay/op/nn/upsampling.cc b/src/relay/op/nn/upsampling.cc index d386437ae15b..48a7a04ebb8a 100644 --- a/src/relay/op/nn/upsampling.cc +++ b/src/relay/op/nn/upsampling.cc @@ -3,6 +3,7 @@ * \file upsampling.cc * \brief upsampling operator */ +#include #include #include #include @@ -11,7 +12,6 @@ #include #include #include "../op_common.h" -#include "../layout.h" namespace tvm { namespace relay { @@ -31,18 +31,20 @@ bool UpSamplingRel(const Array& types, const UpSamplingAttrs* param = attrs.as(); CHECK(param != nullptr); const Layout in_layout(param->layout); - CHECK(in_layout.Convertible(kNCHW)) + + auto layout_converter = BijectiveLayoutNode::make(in_layout, kNCHW); + CHECK(layout_converter.defined()) << "UpSampling only support input layouts that are convertible from NCHW." 
<< " But got " << in_layout; - auto oshape = ConvertLayout(data->shape, in_layout, kNCHW); + auto oshape = layout_converter.ForwardShape(data->shape); - oshape[2] = oshape[2] * param->scale; - oshape[3] = oshape[3] * param->scale; + oshape.Set(2, oshape[2] * param->scale); + oshape.Set(3, oshape[3] * param->scale); // assign output type reporter->Assign(types[1], - TensorTypeNode::make(ConvertLayout(oshape, kNCHW, in_layout), + TensorTypeNode::make(layout_converter.BackwardShape(oshape), data->dtype)); return true; } diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 48c97b91dfda..df23b22512e3 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -16,7 +17,6 @@ #include "../op_common.h" #include "../../../arithmetic/compute_expr.h" #include "../../pass/alter_op_layout.h" -#include "../layout.h" namespace tvm { namespace relay { @@ -218,7 +218,7 @@ Array> ConcatenateLayout( Layout ret; if (new_in_layouts.defined()) { // this function is called after some operators are alternated. 
- Layout::LayoutDim concate_dim = old_in_layouts[0][axis]; + const auto& concate_dim = old_in_layouts[0][axis]; for (size_t i = 0; i < new_in_layouts.size(); ++i) { if (new_in_layouts[i].ndim() > axis && new_in_layouts[i][axis] == concate_dim) { @@ -234,7 +234,7 @@ Array> ConcatenateLayout( } } - if (ret.ndim() <= axis || Layout::IsSubdim(ret[axis])) { + if (ret.ndim() <= axis || !ret[axis].IsPrimal()) { return Array > {{Layout::Undef()}, {Layout::Undef()}}; } } @@ -1682,46 +1682,10 @@ Array LayoutTransformCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, const Target& target) { - const LayoutTransformAttrs *param = attrs.as(); + const auto* param = attrs.as(); CHECK(param != nullptr); - - Layout src_layout(param->src_layout); - Layout dst_layout(param->dst_layout); - - if (src_layout.Equals(dst_layout)) { - return Array{ inputs[0] }; - } - - CHECK(src_layout.defined() && dst_layout.defined()) - << "cannot convert from/to undefined layout"; - CHECK(src_layout.Convertible(dst_layout)) - << "cannot convert from " << param->src_layout << " to " << param->dst_layout; - - const auto& out_shape = ConvertLayout(inputs[0]->shape, src_layout, dst_layout); - return Array { - topi::layout_transform(inputs[0], out_shape, [&](const Array& dst_indices) { - std::vector dst_to_src_indices; - for (size_t i = 0; i < src_layout.ndim(); ++i) { - Layout::LayoutDim src_axis = src_layout[i]; - int dst_major_pos = dst_layout.Indexof(Layout::ToSuperdim(src_axis)); - int dst_minor_pos = dst_layout.Indexof(Layout::ToSubdim(src_axis)); - int32_t src_factor = static_cast(src_layout.Subsizeof(src_axis)); - int32_t dst_factor = static_cast(dst_layout.Subsizeof(src_axis)); - - tvm::Expr src_index(dst_indices[dst_major_pos]); - if (dst_minor_pos >= 0) { - CHECK_GT(dst_factor, 0); - src_index = src_index * dst_factor + dst_indices[dst_minor_pos]; - } - if (Layout::IsSuperdim(src_axis) && src_factor > 0) { - src_index = src_index / src_factor; - } else if 
(Layout::IsSubdim(src_axis) && src_factor > 0) { - src_index = src_index % src_factor; - } - dst_to_src_indices.push_back(src_index); - } - return Array(dst_to_src_indices); - }) + return Array{ + topi::layout_transform(inputs[0], param->src_layout, param->dst_layout) }; } @@ -1738,10 +1702,12 @@ bool LayoutTransformRel(const Array& types, CHECK(src_layout.defined() && dst_layout.defined()) << "cannot convert from/to undefined layout"; - CHECK(src_layout.Convertible(dst_layout)) + + auto layout_converter = BijectiveLayoutNode::make(src_layout, dst_layout); + CHECK(layout_converter.defined()) << "cannot convert from " << params->src_layout << " to " << params->dst_layout; - const auto& out_shape = ConvertLayout(data->shape, src_layout, dst_layout); + const auto& out_shape = layout_converter.ForwardShape(data->shape); reporter->Assign(types[1], TensorTypeNode::make(out_shape, data->dtype)); return true; } diff --git a/src/relay/pass/alter_op_layout.cc b/src/relay/pass/alter_op_layout.cc index 6d988eb2bcdf..fe624a6489c1 100644 --- a/src/relay/pass/alter_op_layout.cc +++ b/src/relay/pass/alter_op_layout.cc @@ -26,7 +26,7 @@ Expr TransformLayout(Expr raw, Layout src_layout, Layout dst_layout) { if (src_layout.Equals(dst_layout)) { return raw; } CHECK(src_layout.defined() && dst_layout.defined()) << "Cannot insert layout transform because there are undefined layouts"; - CHECK(src_layout.Convertible(dst_layout)) + CHECK(BijectiveLayoutNode::make(src_layout, dst_layout).defined()) << "Cannot insert layout transform because there are inconvertible layouts: " << src_layout << " v.s. 
" << dst_layout; static auto &transform_op = Op::Get("layout_transform"); diff --git a/src/relay/pass/alter_op_layout.h b/src/relay/pass/alter_op_layout.h index fcb7b379a0ec..93d9ee52f687 100644 --- a/src/relay/pass/alter_op_layout.h +++ b/src/relay/pass/alter_op_layout.h @@ -9,10 +9,9 @@ #ifndef TVM_RELAY_PASS_ALTER_OP_LAYOUT_H_ #define TVM_RELAY_PASS_ALTER_OP_LAYOUT_H_ +#include #include -#include "../op/layout.h" - namespace tvm { namespace relay { @@ -78,9 +77,9 @@ inline Array > BinaryBroadcastLayout(const Attrs& attrs, if (old_in_shapes[defined_idx].size() >= old_in_shapes[undef_idx].size()) { layouts.Set(undef_idx, - layouts[defined_idx].Sublayout( - old_in_shapes[defined_idx].size() - old_in_shapes[undef_idx].size(), - old_in_shapes[undef_idx].size())); + layouts[defined_idx].SubLayout( + old_in_shapes[defined_idx].size() - old_in_shapes[undef_idx].size(), + old_in_shapes[undef_idx].size())); return Array > {layouts, {layouts[defined_idx]}}; } else { // only know the tensor with smaller dimensions, @@ -90,21 +89,22 @@ inline Array > BinaryBroadcastLayout(const Attrs& attrs, } } else { // try to broadcast the tensors to the larger dimension - int large_idx = layouts[0].ndim_super() >= layouts[1].ndim_super() ? 0 : 1; + int large_idx = layouts[0].ndim_primal() >= layouts[1].ndim_primal() ? 
0 : 1; int small_idx = 1 - large_idx; Layout ret = layouts[large_idx]; // extract common part size_t i = layouts[large_idx].ndim(); for (; i != 0; --i) { - auto dim = layouts[large_idx][i-1]; - if (!layouts[small_idx].Contains(Layout::ToSuperdim(dim))) { + const auto& axis = layouts[large_idx][i-1]; + if (!layouts[small_idx].Contains(axis.ToPrimal())) { break; } } - Layout common_part = layouts[large_idx].Sublayout(i, layouts[large_idx].ndim() - i); - if (!layouts[small_idx].Convertible(common_part)) { // fail + Layout common_part = layouts[large_idx].SubLayout(i, layouts[large_idx].ndim() - i); + if (!BijectiveLayoutNode::make(layouts[small_idx], common_part).defined()) { + // not convertible return Array > {{Layout::Undef()}, {Layout::Undef()}}; } diff --git a/src/relay/pass/combine_parallel_conv2d.cc b/src/relay/pass/combine_parallel_conv2d.cc index cd2d29e80048..44b239919ce2 100644 --- a/src/relay/pass/combine_parallel_conv2d.cc +++ b/src/relay/pass/combine_parallel_conv2d.cc @@ -91,8 +91,10 @@ class BranchGroupFinder : private ExprVisitor { CHECK(attrs_b); const auto* tweight_a = a->args[1]->type_as(); const auto* tweight_b = b->args[1]->type_as(); - const auto shape_a = ConvertLayout(tweight_a->shape, attrs_a->kernel_layout, kOIHW); - const auto shape_b = ConvertLayout(tweight_b->shape, attrs_b->kernel_layout, kOIHW); + const auto shape_a = BijectiveLayoutNode::make( + Layout(attrs_a->kernel_layout), kOIHW).ForwardShape(tweight_a->shape); + const auto shape_b = BijectiveLayoutNode::make( + Layout(attrs_b->kernel_layout), kOIHW).ForwardShape(tweight_b->shape); return eq(attrs_a->strides, attrs_b->strides) && eq(attrs_a->padding, attrs_b->padding) && eq(attrs_a->dilation, attrs_b->dilation) && eq(attrs_a->groups, attrs_b->groups) && diff --git a/src/relay/pass/fold_scale_axis.cc b/src/relay/pass/fold_scale_axis.cc index 270965886ab9..044cc4e5d9c9 100644 --- a/src/relay/pass/fold_scale_axis.cc +++ b/src/relay/pass/fold_scale_axis.cc @@ -6,12 +6,12 @@ * \brief 
Fold axis scaling into weights of * conv/dense operators. */ +#include #include #include #include #include "pattern_util.h" #include "pass_util.h" -#include "../op/layout.h" namespace tvm { @@ -435,8 +435,8 @@ Array Conv2DForwardPrep(const Call& call, const Message& out_message) { CHECK(param != nullptr); Layout data_layout(param->data_layout); Layout kernel_layout(param->kernel_layout); - int c_big_axis = data_layout.Indexof('C'); - int c_small_axis = data_layout.Indexof('c'); + int c_big_axis = data_layout.IndexOf(LayoutAxis::Get('C')); + int c_small_axis = data_layout.IndexOf(LayoutAxis::Get('c')); CHECK_GE(c_big_axis, 0); Message none = NullValue(); @@ -449,7 +449,7 @@ Array Conv2DForwardPrep(const Call& call, const Message& out_message) { // only handle depthwise or full conv2d. // TODO(tvm-team) handle grouped conv by reshape + bcast bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, kernel_layout); - if (kernel_layout.Indexof('i') < 0 && + if (kernel_layout.IndexOf(LayoutAxis::Get('i')) < 0 && c_small_axis < 0 && (param->groups == 1 || is_depthwise_conv2d)) { data_axes = {c_big_axis}; @@ -473,15 +473,15 @@ Expr Conv2DForwardRewrite(const Call& ref_call, CHECK(param != nullptr); Layout data_layout(param->data_layout); Layout kernel_layout(param->kernel_layout); - int c_big_axis = data_layout.Indexof('C'); + int c_big_axis = data_layout.IndexOf(LayoutAxis::Get('C')); CHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) // TODO(tvm-team) support general data layout - CHECK_EQ(kernel_layout.Indexof('i'), -1); + CHECK_EQ(kernel_layout.IndexOf(LayoutAxis::Get('i')), -1); CHECK(sdata->axes.size() == 1 && c_big_axis == sdata->axes[0]->value); - int big_oc_axis = kernel_layout.Indexof('O'); - int big_ic_axis = kernel_layout.Indexof('I'); + int big_oc_axis = kernel_layout.IndexOf(LayoutAxis::Get('O')); + int big_ic_axis = kernel_layout.IndexOf(LayoutAxis::Get('I')); // Check it must be depthwise or full conv2d. 
bool is_depthwise_conv2d = IsDepthwiseConv2D(ref_call, param, kernel_layout); @@ -857,8 +857,8 @@ Message Conv2DBackwardPrep(const Call& call, const Array& in_messages) CHECK(param != nullptr); Layout kernel_layout(param->kernel_layout); Layout out_layout(param->out_layout == "" ? param->data_layout : param->out_layout); - int c_big_axis = out_layout.Indexof('C'); - int c_small_axis = out_layout.Indexof('c'); + int c_big_axis = out_layout.IndexOf(LayoutAxis::Get('C')); + int c_small_axis = out_layout.IndexOf(LayoutAxis::Get('c')); CHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) @@ -869,8 +869,8 @@ Message Conv2DBackwardPrep(const Call& call, const Array& in_messages) // only handle depthwise or full conv2d. // TODO(tvm-team) handle grouped conv by reshape + bcast bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, kernel_layout); - if (kernel_layout.Indexof('o') < 0 && - kernel_layout.Indexof('i') < 0 && + if (kernel_layout.IndexOf(LayoutAxis::Get('o')) < 0 && + kernel_layout.IndexOf(LayoutAxis::Get('i')) < 0 && c_small_axis < 0 && (param->groups == 1 || is_depthwise_conv2d)) { return MessageNode::make({c_big_axis}, false); @@ -891,16 +891,16 @@ Expr Conv2DBackwardTransform(const Call& call, CHECK(param != nullptr); Layout kernel_layout(param->kernel_layout); Layout out_layout(param->out_layout == "" ? 
param->data_layout : param->out_layout); - int c_big_axis = out_layout.Indexof('C'); + int c_big_axis = out_layout.IndexOf(LayoutAxis::Get('C')); CHECK_GE(c_big_axis, 0); // For now, we only support simple pattern (no folded weight/data) // TODO(tvm-team) support general data layout - CHECK_EQ(kernel_layout.Indexof('o'), -1); - CHECK_EQ(kernel_layout.Indexof('i'), -1); + CHECK_EQ(kernel_layout.IndexOf(LayoutAxis::Get('o')), -1); + CHECK_EQ(kernel_layout.IndexOf(LayoutAxis::Get('i')), -1); CHECK(message->axes.size() == 1 && c_big_axis == message->axes[0]->value); - int big_oc_axis = kernel_layout.Indexof('O'); + int big_oc_axis = kernel_layout.IndexOf(LayoutAxis::Get('O')); // Check it must be depthwise or full conv2d. bool is_depthwise_conv2d = IsDepthwiseConv2D(call, param, kernel_layout); CHECK(param->groups == 1 || is_depthwise_conv2d); diff --git a/src/relay/pass/mac_count.cc b/src/relay/pass/mac_count.cc index 500312117c5b..e801cdc37d12 100644 --- a/src/relay/pass/mac_count.cc +++ b/src/relay/pass/mac_count.cc @@ -11,7 +11,7 @@ #include #include #include -#include "../op/layout.h" +#include namespace tvm { namespace relay { @@ -51,8 +51,8 @@ int64_t ConvMacCount(const Call& call_node) { const auto* data_type = args[0]->checked_type().as(); Array data_shape = data_type->shape; std::string data_layout = conv_2d_attr->data_layout; - int32_t C_ind = Layout(data_layout).Indexof('C'); - int32_t c_ind = Layout(data_layout).Indexof('c'); + int32_t C_ind = Layout(data_layout).IndexOf(LayoutAxis::Get('C')); + int32_t c_ind = Layout(data_layout).IndexOf(LayoutAxis::Get('c')); CHECK(C_ind != -1) << "There is no input channel dimension."; int64_t input_channel = static_cast(data_shape[C_ind].as()->value); diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 08fc017f41eb..0644c26c6bcc 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -8,13 +8,13 @@ #ifndef TVM_RELAY_PASS_PATTERN_UTIL_H_ #define 
TVM_RELAY_PASS_PATTERN_UTIL_H_ +#include #include #include #include #include #include #include -#include "../op/layout.h" namespace tvm { @@ -155,9 +155,8 @@ inline bool IsDepthwiseConv2D(const Call& call, const Conv2DAttrs* param, const Layout& kernel_layout) { static const Layout kOIHW("OIHW"); - auto wshape = ConvertLayout( - call->args[1]->type_as()->shape, - kernel_layout, kOIHW); + const auto bilayout = BijectiveLayoutNode::make(kernel_layout, kOIHW); + auto wshape = bilayout.ForwardShape(call->args[1]->type_as()->shape); return is_const_int(wshape[0], param->groups) && is_const_int(wshape[1], 1); } diff --git a/tests/python/unittest/test_lang_data_layout.py b/tests/python/unittest/test_lang_data_layout.py new file mode 100644 index 000000000000..73d626e32fa7 --- /dev/null +++ b/tests/python/unittest/test_lang_data_layout.py @@ -0,0 +1,65 @@ +"""Test layout and bijective-layout node""" + +import tvm +from topi.util import get_const_tuple + +def test_layout(): + layout = tvm.layout("NCHW16c") + assert layout is not None + assert isinstance(layout, tvm.tensor.Layout) + + assert layout.factor_of("c") == 16 + assert layout.factor_of("C") == 16 + assert layout.factor_of("N") == -1 + + assert layout.index_of("N") == 0 + assert layout.index_of("C") == 1 + assert layout.index_of("H") == 2 + assert layout.index_of("W") == 3 + assert layout.index_of("c") == 4 + assert layout.index_of("O") == -1 + + assert "N" in layout + assert "C" in layout + assert "H" in layout + assert "W" in layout + assert "c" in layout + assert "O" not in layout + + assert layout[0] == "N" + assert layout[1] == "C" + assert layout[2] == "H" + assert layout[3] == "W" + assert layout[4] == "c" + assert layout[-1] == "c" + +def test_bilayout_convertible(): + # not convertible + assert tvm.bijective_layout("NCHW", "ABCD") is None + # convertible + assert tvm.bijective_layout("NCHW", "NCHW16c") is not None + +def test_bilayout_shape(): + bilayout = tvm.bijective_layout("NCHW", "NCHW16c") + assert 
isinstance(bilayout, tvm.tensor.BijectiveLayout) + + dst_shape = bilayout.forward_shape((1, 32, 7, 7)) + assert get_const_tuple(dst_shape) == (1, 2, 7, 7, 16) + + src_shape = bilayout.backward_shape(dst_shape) + assert get_const_tuple(src_shape) == (1, 32, 7, 7) + +def test_bilayout_index(): + bilayout = tvm.bijective_layout("NCHW", "NCHW16c") + + dst_index = bilayout.forward_index([0, 18, 6, 6]) + assert get_const_tuple(dst_index) == (0, 1, 6, 6, 2) + + src_index = bilayout.backward_index([0, 1, 6, 6, 2]) + assert get_const_tuple(src_index) == (0, 18, 6, 6) + +if __name__ == "__main__": + test_layout() + test_bilayout_convertible() + test_bilayout_shape() + test_bilayout_index() diff --git a/topi/include/topi/nn.h b/topi/include/topi/nn.h index 5f0b758c6424..00c3f999853d 100644 --- a/topi/include/topi/nn.h +++ b/topi/include/topi/nn.h @@ -450,28 +450,5 @@ inline tvm::Tensor group_conv2d_ngchw(const tvm::Tensor& I, return tvm::compute(output_shape, l, name, tag); } -using FLayoutIndicesTransform = std::function(const Array& indices)>; - -/*! - * \brief Transform the layout according to the mapping function \p to_src_indices. - * \param src the source input. - * \param dst_shape the output shape. - * \param to_src_indices the mapping function from input index to output index. - * \param name output tensor name. - * \param tag output tensor tag. - * \return A tensor with shape \p dst_shape. 
- */ -inline Tensor layout_transform(const Tensor& src, - const Array& dst_shape, - const FLayoutIndicesTransform& to_src_indices, - const std::string name = "layout_transform", - const std::string tag = kInjective) { - auto src_shape = src->shape; - return compute( - dst_shape, [&](const Array& dst_indices) { - return src(to_src_indices(dst_indices)); - }, name, tag); -} - } // namespace topi #endif // TOPI_NN_H_ diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index e399b8c6978c..24ebe5de4a20 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -16,6 +16,7 @@ #include "topi/detail/ravel_unravel.h" #include "topi/detail/constant_utils.h" #include "tvm/tvm.h" +#include "tvm/data_layout.h" namespace topi { using namespace tvm; @@ -882,5 +883,43 @@ inline Tensor arange(const Expr start, }, name, tag); } +/*! + * \brief Transform the layout according to \p src_layout and \p dst_layout + * \param src the source input. + * \param src_layout the source layout. + * \param dst_layout the destination layout. + * \param name output tensor name. + * \param tag output tensor tag. 
+ * \return A tensor with shape in \p dst_layout + */ +inline Tensor layout_transform(const Tensor& src, + const std::string& src_layout, + const std::string& dst_layout, + const std::string name = "layout_transform", + const std::string tag = kInjective) { + Layout src_layout_struct = LayoutNode::make(src_layout); + Layout dst_layout_struct = LayoutNode::make(dst_layout); + + if (src_layout_struct.Equals(dst_layout_struct)) { + return src; + } + + CHECK(src_layout_struct.defined() && dst_layout_struct.defined()) + << "cannot convert from/to undefined layout"; + + auto layout_converter = BijectiveLayoutNode::make(src_layout_struct, dst_layout_struct); + CHECK(layout_converter.defined()) + << "cannot convert from " << src_layout << " to " << dst_layout; + + Array dst_shape = layout_converter.ForwardShape(src->shape); + + return compute( + dst_shape, [&](const Array& dst_indices) { + Array dst_indices_expr(dst_indices.begin(), dst_indices.end()); + Array src_indices = layout_converter.BackwardIndex(dst_indices_expr); + return src(src_indices); + }, name, tag); +} + } // namespace topi #endif // TOPI_TRANSFORM_H_ diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py index 2fb20162a5a7..e3ab0b364c65 100644 --- a/topi/python/topi/transform.py +++ b/topi/python/topi/transform.py @@ -318,3 +318,20 @@ def arange(start, stop=None, step=1, dtype="float32"): stop = start start = 0 return cpp.arange(start, stop, step, dtype) + + +def layout_transform(array, src_layout, dst_layout): + """Transform the layout according to src_layout and dst_layout + + Parameters + ---------- + array : tvm.Tensor + The source array. + + src_layout : str + the source layout. + + dst_layout : str + the destination layout. 
+ """ + return cpp.layout_transform(array, src_layout, dst_layout) diff --git a/topi/src/topi.cc b/topi/src/topi.cc index e3fec08cb491..aac2d1653c78 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -272,6 +272,11 @@ TVM_REGISTER_GLOBAL("topi.split") } }); +TVM_REGISTER_GLOBAL("topi.layout_transform") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = layout_transform(args[0], args[1], args[2]); +}); + TVM_REGISTER_GLOBAL("topi.take") .set_body([](TVMArgs args, TVMRetValue *rv) { if (args.size() == 2) { diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index dad527e3951f..31e37d4d26f2 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -449,6 +449,34 @@ def test_arange(): verify_arange(20, 1, -1.5) +def test_layout_transform(): + in_shape = (1, 32, 8, 8) + A = tvm.placeholder(shape=in_shape, dtype="float32", name="A") + B = topi.layout_transform(A, "NCHW", "NCHW16c") + + input = np.random.uniform(size=in_shape).astype(A.dtype) + output = np.transpose(input, axes=(0, 2, 3, 1)) + output = np.reshape(output, newshape=(1, 8, 8, 2, 16)) + output = np.transpose(output, axes=(0, 3, 1, 2, 4)) + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + tvm_input = tvm.nd.array(input, ctx) + tvm_output = tvm.nd.empty(output.shape, ctx=ctx, dtype=B.dtype) + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_injective(B) + f = tvm.build(s, [A, B], device, name="layout_transform") + f(tvm_input, tvm_output) + tvm.testing.assert_allclose(tvm_output.asnumpy(), output) + + for backend in get_all_backend(): + check_device(backend) + + if __name__ == "__main__": test_strided_slice() test_concatenate() @@ -462,3 +490,4 @@ def test_arange(): test_take() test_gather_nd() test_arange() + test_layout_transform() From 
86d746ff593bd67ba7679ba88e01e07437f20376 Mon Sep 17 00:00:00 2001 From: Truman TIAN Date: Thu, 28 Feb 2019 21:05:45 +0800 Subject: [PATCH 41/93] [DOC] Using External Libraries in Relay (#2694) * added relay quick start * added relay/using_external_lib.py * update using_external_lib * Update using_external_lib.py * update tvm/make/config.mk -> cmake/config.cmake * Fixed: result mismatched when lowering relay with cudnn support at opt level 2 * setting opt_level=2 and out_channels=16 for consistency of original tutorial * Fixed some typos --- tutorials/relay/using_external_lib.py | 544 ++++++++++++++++++++++++++ tutorials/relay_quick_start.py | 144 +++++++ 2 files changed, 688 insertions(+) create mode 100644 tutorials/relay/using_external_lib.py create mode 100644 tutorials/relay_quick_start.py diff --git a/tutorials/relay/using_external_lib.py b/tutorials/relay/using_external_lib.py new file mode 100644 index 000000000000..fb4b52ea5cf1 --- /dev/null +++ b/tutorials/relay/using_external_lib.py @@ -0,0 +1,544 @@ +""" +Using External Libraries in Relay +================================ +**Author**: `Masahiro Masuda `_, `Truman Tian `_ + +This is a short tutorial on how to use external libraries such as cuDNN, or cuBLAS with Relay. + +Relay uses TVM internally to generate target specific code. For example, with cuda backend TVM generates cuda kernels for all layers in the user provided network. +But sometimes it is also helpful to incorporate external libraries developed by various vendors into Relay. +Luckily, TVM has a mechanism to transparently call into these libraries. +For Relay users, all we need to do is just to set a target string appropriately. + +Before we can use external libraries from Relay, your TVM needs to be built with libraries you want to use. +For example, to use cuDNN, USE_CUDNN option in `cmake/config.cmake` needs to be enabled, and cuDNN include and library directories need to be specified if necessary. + +To begin with, we import Relay and TVM. 
+""" +import tvm +import numpy as np +from tvm.contrib import graph_runtime as runtime +from tvm import relay +from tvm.relay import testing + +###################################################################### +# Create a simple network +# ----------------------- +# Let's create a very simple network for demonstration. +# It consists of convolution, batch normalization, and ReLU activation. + +out_channels = 16 +batch_size = 1 + +data = relay.var("data", relay.TensorType((batch_size, 3, 224, 224), "float32")) +weight = relay.var("weight") +bn_gamma = relay.var("bn_gamma") +bn_beta = relay.var("bn_beta") +bn_mmean = relay.var("bn_mean") +bn_mvar = relay.var("bn_var") + +simple_net = relay.nn.conv2d(data=data, weight=weight, kernel_size=(3,3), channels=out_channels, padding=(1, 1)) +simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, bn_mvar)[0] +simple_net = relay.nn.relu(simple_net) +simple_net = relay.Function(relay.ir_pass.free_vars(simple_net), simple_net) + +data_shape = (batch_size, 3, 224, 224) +net, params = testing.create_workload(simple_net) + +###################################################################### +# Build and run with cuda backend +# ------------------------------- +# We build and run this network with cuda backend, as usual. +# By setting the logging level to DEBUG, the result of Relay graph compilation will be dumped as pseudo code. 
+import logging +logging.basicConfig(level=logging.DEBUG) # to dump TVM IR after fusion + +target = "cuda" +graph, lib, params = relay.build_module.build( + net, target, params=params) + +ctx = tvm.context(target, 0) +data = np.random.uniform(-1, 1, size=data_shape).astype("float32") +module = runtime.create(graph, lib, ctx) +module.set_input(**params) +module.set_input("data", data) +module.run() +out_shape = (batch_size, out_channels, 224, 224) +out = module.get_output(0, tvm.nd.empty(out_shape)) +out_cuda = out.asnumpy() +###################################################################### +# The generated pseudo code should look something like below. +# Note how bias add, batch normalization, and ReLU activation are fused into the convolution kernel. +# TVM generates a single, fused kernel from this representation. +# +# .. code-block:: text +# +# produce tensor { +# // attr [iter_var(blockIdx.z, , blockIdx.z)] thread_extent = 1 +# // attr [compute] storage_scope = "local" +# allocate compute[float32 * 32] +# // attr [pad_temp.shared] storage_scope = "shared" +# allocate pad_temp.shared[float32 * 180] +# // attr [placeholder.shared] storage_scope = "shared" +# allocate placeholder.shared[float32 * 144] +# // attr [iter_var(blockIdx.y, , blockIdx.y)] thread_extent = 28 +# // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 14 +# // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4 +# // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1 +# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16 +# produce compute { +# compute[0] = 0.000000f +# compute[1] = 0.000000f +# compute[2] = 0.000000f +# compute[3] = 0.000000f +# compute[4] = 0.000000f +# compute[5] = 0.000000f +# compute[6] = 0.000000f +# compute[7] = 0.000000f +# compute[8] = 0.000000f +# compute[9] = 0.000000f +# compute[10] = 0.000000f +# compute[11] = 0.000000f +# compute[12] = 0.000000f +# compute[13] = 0.000000f +# compute[14] = 0.000000f +# compute[15] = 
0.000000f +# compute[16] = 0.000000f +# compute[17] = 0.000000f +# compute[18] = 0.000000f +# compute[19] = 0.000000f +# compute[20] = 0.000000f +# compute[21] = 0.000000f +# compute[22] = 0.000000f +# compute[23] = 0.000000f +# compute[24] = 0.000000f +# compute[25] = 0.000000f +# compute[26] = 0.000000f +# compute[27] = 0.000000f +# compute[28] = 0.000000f +# compute[29] = 0.000000f +# compute[30] = 0.000000f +# compute[31] = 0.000000f +# for (rc.outer, 0, 3) { +# produce pad_temp.shared { +# // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4 +# // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1 +# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16 +# if (likely(((threadIdx.z*15) < (60 - threadIdx.x)))) { +# if (likely((threadIdx.x < 15))) { +# pad_temp.shared[(((((threadIdx.z*15) + threadIdx.x)/60)*180) + ((((((threadIdx.z*15) + threadIdx.x)/6) % 10)*18) + ((((threadIdx.z*3) + threadIdx.x)*3) % 18)))] = tvm_if_then_else((((((1 - ((((threadIdx.z*15) + threadIdx.x)/6) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((threadIdx.z*15) + threadIdx.x)/6) % 10)))) && ((1 - ((((threadIdx.z*3) + threadIdx.x)*3) % 18)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - ((((threadIdx.z*3) + threadIdx.x)*3) % 18)))), placeholder[((((((((blockIdx.y*112) + blockIdx.x) + (rc.outer*3136)) + ((((threadIdx.z*15) + threadIdx.x)/60)*9408))*16) + ((((threadIdx.z*3) + threadIdx.x)*3) % 18)) + (((((threadIdx.z*15) + threadIdx.x)/6) % 10)*224)) + -225)], 0.000000f) +# pad_temp.shared[(((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/180)*180) + ((((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)*18) + (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)))] = tvm_if_then_else((((((1 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)))) && ((1 - (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)) <= (blockIdx.x*16))) && ((blockIdx.x*16) 
< (225 - (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)))), placeholder[((((((((blockIdx.y*112) + blockIdx.x) + (rc.outer*3136)) + ((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/180)*9408))*16) + (((((threadIdx.z*3) + threadIdx.x)*3) + 1) % 18)) + (((((((threadIdx.z*15) + threadIdx.x)*3) + 1)/18) % 10)*224)) + -225)], 0.000000f) +# pad_temp.shared[(((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/180)*180) + ((((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)*18) + (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)))] = tvm_if_then_else((((((1 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)) <= (blockIdx.y*8)) && ((blockIdx.y*8) < (225 - ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)))) && ((1 - (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)) <= (blockIdx.x*16))) && ((blockIdx.x*16) < (225 - (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)))), placeholder[((((((((blockIdx.y*112) + blockIdx.x) + (rc.outer*3136)) + ((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/180)*9408))*16) + (((((threadIdx.z*3) + threadIdx.x)*3) + 2) % 18)) + (((((((threadIdx.z*15) + threadIdx.x)*3) + 2)/18) % 10)*224)) + -225)], 0.000000f) +# } +# } +# } +# produce placeholder.shared { +# // attr [iter_var(threadIdx.z, , threadIdx.z)] thread_extent = 4 +# // attr [iter_var(threadIdx.y, , threadIdx.y)] thread_extent = 1 +# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 16 +# if (likely(((threadIdx.z*4) < (16 - (threadIdx.x/3))))) { +# if (likely(((threadIdx.z*12) < (48 - threadIdx.x)))) { +# if (likely((threadIdx.x < 12))) { +# placeholder.shared[(((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3)] = placeholder[(((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3)] +# placeholder.shared[((((((threadIdx.z*4) + (threadIdx.x/3))*3) + (threadIdx.x % 3))*3) + 1)] = placeholder[((((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3) + 1)] +# placeholder.shared[((((((threadIdx.z*4) + 
(threadIdx.x/3))*3) + (threadIdx.x % 3))*3) + 2)] = placeholder[((((((rc.outer + (threadIdx.z*12)) + ((threadIdx.x/3)*3))*3) + (threadIdx.x % 3))*3) + 2)] +# } +# } +# } +# } +# compute[0] = (compute[0] + (pad_temp.shared[threadIdx.x]*placeholder.shared[(threadIdx.z*36)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[(threadIdx.z*36)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[(threadIdx.z*36)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[(threadIdx.z*36)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[(threadIdx.z*36)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[(threadIdx.z*36)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[(threadIdx.z*36)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[(threadIdx.z*36)])) +# compute[8] = (compute[8] + (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 9)])) +# compute[16] = (compute[16] + 
(pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 18)])) +# compute[24] = (compute[24] + (pad_temp.shared[threadIdx.x]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 27)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 
19)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 1)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 10)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[19] 
= (compute[19] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 19)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 1)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 28)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 
74)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 2)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 11)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# 
compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 20)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 2)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 29)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 
144)]*placeholder.shared[((threadIdx.z*36) + 3)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 12)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 21)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 18)]*placeholder.shared[((threadIdx.z*36) 
+ 30)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 30)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 4)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[10] = (compute[10] + 
(pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 13)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 22)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 19)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 
73)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 31)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 5)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[13] 
= (compute[13] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 14)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 23)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 20)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[30] = (compute[30] + 
(pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 32)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 6)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 
162)]*placeholder.shared[((threadIdx.z*36) + 15)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 24)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 36)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 54)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 72)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 90)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 108)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 126)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 144)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 162)]*placeholder.shared[((threadIdx.z*36) + 33)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 
37)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[3] = (compute[3] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 7)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 16)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# 
compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 25)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 37)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 55)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 73)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 91)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 109)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 127)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 145)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 163)]*placeholder.shared[((threadIdx.z*36) + 34)])) +# compute[0] = (compute[0] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[1] = (compute[1] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[2] = (compute[2] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[3] = (compute[3] + 
(pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[4] = (compute[4] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[5] = (compute[5] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[6] = (compute[6] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[7] = (compute[7] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 8)])) +# compute[8] = (compute[8] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[9] = (compute[9] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[10] = (compute[10] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[11] = (compute[11] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[12] = (compute[12] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[13] = (compute[13] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[14] = (compute[14] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[15] = (compute[15] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 17)])) +# compute[16] = (compute[16] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[17] = (compute[17] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[18] = (compute[18] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[19] = (compute[19] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[20] = (compute[20] + (pad_temp.shared[(threadIdx.x + 
110)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[21] = (compute[21] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[22] = (compute[22] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[23] = (compute[23] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 26)])) +# compute[24] = (compute[24] + (pad_temp.shared[(threadIdx.x + 38)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[25] = (compute[25] + (pad_temp.shared[(threadIdx.x + 56)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[26] = (compute[26] + (pad_temp.shared[(threadIdx.x + 74)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[27] = (compute[27] + (pad_temp.shared[(threadIdx.x + 92)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[28] = (compute[28] + (pad_temp.shared[(threadIdx.x + 110)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[29] = (compute[29] + (pad_temp.shared[(threadIdx.x + 128)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[30] = (compute[30] + (pad_temp.shared[(threadIdx.x + 146)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# compute[31] = (compute[31] + (pad_temp.shared[(threadIdx.x + 164)]*placeholder.shared[((threadIdx.z*36) + 35)])) +# } +# } +# tensor[(((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x)] = max(((compute[0]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 224)] = max(((compute[1]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 448)] = max(((compute[2]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 672)] = 
max(((compute[3]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 896)] = max(((compute[4]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1120)] = max(((compute[5]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1344)] = max(((compute[6]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 1568)] = max(((compute[7]*placeholder[(threadIdx.z*4)]) + placeholder[(threadIdx.z*4)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50176)] = max(((compute[8]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50400)] = max(((compute[9]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50624)] = max(((compute[10]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 50848)] = max(((compute[11]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51072)] = max(((compute[12]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51296)] = 
max(((compute[13]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51520)] = max(((compute[14]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 51744)] = max(((compute[15]*placeholder[((threadIdx.z*4) + 1)]) + placeholder[((threadIdx.z*4) + 1)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100352)] = max(((compute[16]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100576)] = max(((compute[17]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 100800)] = max(((compute[18]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101024)] = max(((compute[19]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101248)] = max(((compute[20]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101472)] = max(((compute[21]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 101696)] = max(((compute[22]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + 
(threadIdx.z*12544))*16) + threadIdx.x) + 101920)] = max(((compute[23]*placeholder[((threadIdx.z*4) + 2)]) + placeholder[((threadIdx.z*4) + 2)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150528)] = max(((compute[24]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150752)] = max(((compute[25]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 150976)] = max(((compute[26]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151200)] = max(((compute[27]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151424)] = max(((compute[28]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151648)] = max(((compute[29]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 151872)] = max(((compute[30]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) +# tensor[((((((blockIdx.y*112) + blockIdx.x) + (threadIdx.z*12544))*16) + threadIdx.x) + 152096)] = max(((compute[31]*placeholder[((threadIdx.z*4) + 3)]) + placeholder[((threadIdx.z*4) + 3)]), 0.000000f) +# } + +###################################################################### +# Use cuDNN for a convolutional layer +# ----------------------------------- +# We can use cuDNN to replace convolution 
kernels with cuDNN ones. +# To do that, all we need to do is to append the option " -libs=cudnn" to the target string. +net, params = testing.create_workload(simple_net) +target = "cuda -libs=cudnn" # use cudnn for convolution +graph, lib, params = relay.build_module.build( + net, target, params=params) + +ctx = tvm.context(target, 0) +data = np.random.uniform(-1, 1, size=data_shape).astype("float32") +module = runtime.create(graph, lib, ctx) +module.set_input(**params) +module.set_input("data", data) +module.run() +out_shape = (batch_size, out_channels, 224, 224) +out = module.get_output(0, tvm.nd.empty(out_shape)) +out_cudnn = out.asnumpy() + +###################################################################### +# Note that if you use cuDNN, Relay cannot fuse convolution with layers following it. +# This is because layer fusion happens at the level of TVM internal representation (IR). +# Relay treats external libraries as black boxes, so there is no way to fuse them with TVM IR. +# +# The pseudo code below shows that cuDNN convolution + bias add + batch norm + ReLU turned into two stages of computation, one for cuDNN call and the other for the rest of operations. +# +# ..
code-block:: text +# +# // attr [y] storage_scope = "global" +# allocate y[float32 * 802816] +# produce y { +# // attr [0] extern_scope = 0 +# tvm_call_packed("tvm.contrib.cudnn.conv2d.forward", 1, 0, 1, 1, 1, 1, 1, 1, 1, tvm_stack_make_array(placeholder, tvm_stack_make_shape(1, 3, 224, 224), 0, 4, 0.000000f, 0), tvm_stack_make_array(placeholder, tvm_stack_make_shape(16, 3, 3, 3), 0, 4, 0.000000f, 0), tvm_stack_make_array(y, tvm_stack_make_shape(1, 16, 224, 224), 0, 4, 0.000000f, 0)) +# } +# produce tensor { +# // attr [iter_var(blockIdx.x, , blockIdx.x)] thread_extent = 256 +# // attr [iter_var(threadIdx.x, , threadIdx.x)] thread_extent = 512 +# for (ax0.ax1.fused.ax2.fused.ax3.fused.outer, 0, 7) { +# if (likely(((blockIdx.x*512) < ((802816 - (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072)) - threadIdx.x)))) { +# tensor[(((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/802816)*802816) + (((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/224) % 224)*224) + ((((blockIdx.x*64) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*32)) % 224))) + ((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)*50176))] = max(((y[(((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/802816)*802816) + (((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/224) % 224)*224) + ((((blockIdx.x*64) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*32)) % 224))) + ((((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)*50176))]*placeholder[(((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)]) + placeholder[(((((blockIdx.x*512) + threadIdx.x) + (ax0.ax1.fused.ax2.fused.ax3.fused.outer*131072))/50176) % 16)]), 0.000000f) +# } +# } +# } + + 
+###################################################################### +# Verify the result +# ----------------- +# We can check that the results of two runs match. + +tvm.testing.assert_allclose(out_cuda, out_cudnn, rtol=1e-5) + +##################################################################### +# Conclusion +# ---------- +# This tutorial covered the usage of cuDNN with Relay. +# We also have support for cuBLAS. If cuBLAS is enabled, it will be used inside a fully connected layer (relay.dense). +# To use cuBLAS, set a target string as "cuda -libs=cublas". +# You can use both cuDNN and cuBLAS with "cuda -libs=cudnn,cublas". +# +# For ROCm backend, we have support for MIOpen and rocBLAS. +# They can be enabled with target "rocm -libs=miopen,rocblas". +# +# Being able to use external libraries is great, but we need to keep in mind some cautions. +# +# First, the use of external libraries may restrict your usage of TVM and Relay. +# For example, MIOpen only supports NCHW layout and fp32 data type at the moment, so you cannot use other layouts or data type in TVM. +# +# Second, and more importantly, external libraries restrict the possibility of operator fusion during graph compilation, as shown above. +# TVM and Relay aim to achieve the best performance on a variety of hardwares, with joint operator level and graph level optimization. +# To achieve this goal, we should continue developing better optimizations for TVM and Relay, while using external libraries as a nice way to fall back to existing implementation when necessary. diff --git a/tutorials/relay_quick_start.py b/tutorials/relay_quick_start.py new file mode 100644 index 000000000000..0768458d2cd4 --- /dev/null +++ b/tutorials/relay_quick_start.py @@ -0,0 +1,144 @@ +""" +.. 
_tutorial-relay-quick-start: +Quick Start Tutorial for Compiling Deep Learning Models +======================================================= +**Author**: `Yao Wang `_, `Truman Tian `_ + +This example shows how to build a neural network with Relay python frontend and +generates a runtime library for Nvidia GPU with TVM. +Notice that you need to build TVM with cuda and llvm enabled. +""" + +###################################################################### +# Overview for Supported Hardware Backend of TVM +# ---------------------------------------------- +# The image below shows hardware backend currently supported by TVM: +# +# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/tvm_support_list.png +# :align: center +# :scale: 100% +# +# In this tutorial, we'll choose cuda and llvm as target backends. +# To begin with, let's import Relay and TVM. + +import numpy as np + +from tvm import relay +from tvm.relay import testing +import tvm +from tvm.contrib import graph_runtime + +###################################################################### +# Define Neural Network in Relay +# ------------------------------ +# First, let's define a neural network with relay python frontend. +# For simplicity, we'll use pre-defined resnet-18 network in Relay. +# Parameters are initialized with Xavier initializer. +# Relay also supports other model formats such as MXNet, CoreML, ONNX and +# Tensorflow. +# +# In this tutorial, we assume we will do inference on our device +# and the batch size is set to be 1. Input images are RGB color +# images of size 224 * 224. We can call the :any:`tvm.relay.expr.astext()` +# to show the network structure.
+ +batch_size = 1 +num_class = 1000 +image_shape = (3, 224, 224) +data_shape = (batch_size,) + image_shape +out_shape = (batch_size, num_class) + +net, params = relay.testing.resnet.get_workload( + num_layers=18, batch_size=batch_size, image_shape=image_shape) + +# set show_meta_data=True if you want to show meta data +print(net.astext(show_meta_data=False)) + +###################################################################### +# Compilation +# ----------- +# Next step is to compile the model using the Relay/TVM pipeline. +# Users can specify the optimization level of the compilation. +# Currently this value can be 0 to 3. The optimization passes include +# operator fusion, pre-computation, layout transformation and so on. +# +# :any:`relay.build_module.build` returns three components: the execution graph in +# json format, the TVM module library of compiled functions specifically +# for this graph on the target hardware, and the parameter blobs of +# the model. During the compilation, Relay does the graph-level +# optimization while TVM does the tensor-level optimization, resulting +# in an optimized runtime module for model serving. +# +# We'll first compile for Nvidia GPU. Behind the scene, `relay.build_module.build` +# first does a number of graph-level optimizations, e.g. pruning, fusing, etc., +# then registers the operators (i.e. the nodes of the optimized graphs) to +# TVM implementations to generate a `tvm.module`. +# To generate the module library, TVM will first transfer the high level IR +# into the lower intrinsic IR of the specified target backend, which is CUDA +# in this example. Then the machine code will be generated as the module library. 
+ +opt_level = 3 +target = tvm.target.cuda() +with relay.build_config(opt_level=opt_level): + graph, lib, params = relay.build_module.build( + net, target, params=params) + +##################################################################### +# Run the generated library +# ------------------------- +# Now we can create graph runtime and run the module on Nvidia GPU. + +# create random input +ctx = tvm.gpu() +data = np.random.uniform(-1, 1, size=data_shape).astype("float32") +# create module +module = graph_runtime.create(graph, lib, ctx) +# set input and parameters +module.set_input("data", data) +module.set_input(**params) +# run +module.run() +# get output +out = module.get_output(0, tvm.nd.empty(out_shape)).asnumpy() + +# Print first 10 elements of output +print(out.flatten()[0:10]) + +###################################################################### +# Save and Load Compiled Module +# ----------------------------- +# We can also save the graph, lib and parameters into files and load them +# back in deploy environment. + +#################################################### + +# save the graph, lib and params into separate files +from tvm.contrib import util + +temp = util.tempdir() +path_lib = temp.relpath("deploy_lib.tar") +lib.export_library(path_lib) +with open(temp.relpath("deploy_graph.json"), "w") as fo: + fo.write(graph) +with open(temp.relpath("deploy_param.params"), "wb") as fo: + fo.write(relay.save_param_dict(params)) +print(temp.listdir()) + +#################################################### + +# load the module back.
+loaded_json = open(temp.relpath("deploy_graph.json")).read() +loaded_lib = tvm.module.load(path_lib) +loaded_params = bytearray(open(temp.relpath("deploy_param.params"), "rb").read()) +input_data = tvm.nd.array(np.random.uniform(size=data_shape).astype("float32")) + +module = graph_runtime.create(loaded_json, loaded_lib, ctx) +module.load_params(loaded_params) +module.run(data=input_data) +out_deploy = module.get_output(0).asnumpy() + +# Print first 10 elements of output +print(out_deploy.flatten()[0:10]) + +# check whether the output from deployed module is consistent with original one +tvm.testing.assert_allclose(out_deploy, out, atol=1e-3) From 7cfd579733c1cd04f356bee23dd4c2452a1bf653 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Fri, 1 Mar 2019 01:22:52 +0800 Subject: [PATCH 42/93] [RELAY][PASS] Enable switching CanonicalizeOps in pass_enabled (#2696) --- python/tvm/relay/build_module.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 9ca986907567..7d63513d7dc0 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -21,6 +21,7 @@ "CombineParallelConv2D": 3, "FoldScaleAxis": 3, "AlterOpLayout": 3, + "CanonicalizeOps": 3, } @@ -177,13 +178,15 @@ def optimize(func, target=None, params=None): func = ir_pass.forward_fold_scale_axis(func) func = ir_pass.fold_constant(func) + if cfg.pass_enabled("CanonicalizeOps"): + func = ir_pass.infer_type(func) + func = ir_pass.canonicalize_ops(func) + # FIXME(zhiics) Skip AlterOpLayout pass for heterogeneous compilation for # now. We probably need to pass target to this pass as well. Fix it in # a followup PR. 
if cfg.pass_enabled("AlterOpLayout"): if isinstance(target, _target.Target): - func = ir_pass.infer_type(func) - func = ir_pass.canonicalize_ops(func) func = ir_pass.infer_type(func) with target: func = ir_pass.alter_op_layout(func) From 462c88a319f77dee84697f75bed719daf3d69b0b Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Thu, 28 Feb 2019 18:23:59 +0000 Subject: [PATCH 43/93] Docker updates (#2702) * [DOCKER] Switch from yes|apt-get to apt-get -y The yes | apt-get idom guarantees that the 'yes' process always exists with exit code 141 (pipe broken). This is fine while the script generally ignores failures but won't work when the script behaviour is tightened to robustly catch errors. * [DOCKER] Turn down the wget/curl volume --- docker/install/ubuntu_install_androidsdk.sh | 2 +- docker/install/ubuntu_install_antlr.sh | 2 +- docker/install/ubuntu_install_darknet.sh | 2 +- docker/install/ubuntu_install_emscripten.sh | 4 ++-- docker/install/ubuntu_install_gradle.sh | 2 +- docker/install/ubuntu_install_iverilog.sh | 2 +- docker/install/ubuntu_install_llvm.sh | 2 +- docker/install/ubuntu_install_nodejs.sh | 2 +- docker/install/ubuntu_install_python.sh | 4 ++-- docker/install/ubuntu_install_rust.sh | 2 +- docker/install/ubuntu_install_sgx.sh | 4 ++-- docker/install/ubuntu_install_tflite.sh | 2 +- docker/install/ubuntu_install_vulkan.sh | 2 +- 13 files changed, 16 insertions(+), 16 deletions(-) diff --git a/docker/install/ubuntu_install_androidsdk.sh b/docker/install/ubuntu_install_androidsdk.sh index a5c02e573b43..fa21d57c410f 100644 --- a/docker/install/ubuntu_install_androidsdk.sh +++ b/docker/install/ubuntu_install_androidsdk.sh @@ -7,7 +7,7 @@ ASDKTOOLS_HOME=/opt/android-sdk-tools ASDKTOOLS_VERSION=3859397 ASDKTOOLS_SHA256=444e22ce8ca0f67353bda4b85175ed3731cae3ffa695ca18119cbacef1c1bea0 -wget http://dl.google.com/android/repository/sdk-tools-linux-${ASDKTOOLS_VERSION}.zip -O sdk-tools-linux.zip +wget -q 
http://dl.google.com/android/repository/sdk-tools-linux-${ASDKTOOLS_VERSION}.zip -O sdk-tools-linux.zip echo "${ASDKTOOLS_SHA256} *sdk-tools-linux.zip" | sha256sum --check - unzip sdk-tools-linux.zip rm sdk-tools-linux.zip diff --git a/docker/install/ubuntu_install_antlr.sh b/docker/install/ubuntu_install_antlr.sh index d2f2d6a8c48f..6eb004213778 100644 --- a/docker/install/ubuntu_install_antlr.sh +++ b/docker/install/ubuntu_install_antlr.sh @@ -1,3 +1,3 @@ cd /usr/local/lib -wget https://www.antlr.org/download/antlr-4.7.1-complete.jar +wget -q https://www.antlr.org/download/antlr-4.7.1-complete.jar cd - diff --git a/docker/install/ubuntu_install_darknet.sh b/docker/install/ubuntu_install_darknet.sh index f5e0c2791d80..ecfb19626bc5 100644 --- a/docker/install/ubuntu_install_darknet.sh +++ b/docker/install/ubuntu_install_darknet.sh @@ -1,4 +1,4 @@ #install the necessary dependancies, cffi, opencv -wget 'https://github.com/siju-samuel/darknet/blob/master/lib/libdarknet.so?raw=true' -O libdarknet.so +wget -q 'https://github.com/siju-samuel/darknet/blob/master/lib/libdarknet.so?raw=true' -O libdarknet.so pip2 install opencv-python cffi pip3 install opencv-python cffi diff --git a/docker/install/ubuntu_install_emscripten.sh b/docker/install/ubuntu_install_emscripten.sh index 31470bb69de9..4902538c9cfa 100644 --- a/docker/install/ubuntu_install_emscripten.sh +++ b/docker/install/ubuntu_install_emscripten.sh @@ -1,11 +1,11 @@ alias make="make -j4" # Get latest cmake -wget https://cmake.org/files/v3.8/cmake-3.8.2-Linux-x86_64.tar.gz +wget -q https://cmake.org/files/v3.8/cmake-3.8.2-Linux-x86_64.tar.gz tar xf cmake-3.8.2-Linux-x86_64.tar.gz export PATH=/cmake-3.8.2-Linux-x86_64/bin/:${PATH} -wget https://s3.amazonaws.com/mozilla-games/emscripten/releases/emsdk-portable.tar.gz +wget -q https://s3.amazonaws.com/mozilla-games/emscripten/releases/emsdk-portable.tar.gz tar xf emsdk-portable.tar.gz cd emsdk-portable ./emsdk update diff --git 
a/docker/install/ubuntu_install_gradle.sh b/docker/install/ubuntu_install_gradle.sh index b1535c98cabb..9cc3a170e8ea 100644 --- a/docker/install/ubuntu_install_gradle.sh +++ b/docker/install/ubuntu_install_gradle.sh @@ -7,7 +7,7 @@ GRADLE_VERSION=4.10-rc-2 GRADLE_SHA256=e90d3c32910e259814bcca82b3911172ecca1ff1ab5ed69b4de3c1df8b378b40 echo "Downloading Gradle" -wget --output-document=gradle.zip "https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}-bin.zip" +wget -q --output-document=gradle.zip "https://services.gradle.org/distributions/gradle-${GRADLE_VERSION}-bin.zip" echo "Checking Gradle hash" echo "${GRADLE_SHA256} *gradle.zip" | sha256sum --check - echo "Installing Gradle" diff --git a/docker/install/ubuntu_install_iverilog.sh b/docker/install/ubuntu_install_iverilog.sh index bf7a0001dc70..358bf9dc8376 100644 --- a/docker/install/ubuntu_install_iverilog.sh +++ b/docker/install/ubuntu_install_iverilog.sh @@ -1,5 +1,5 @@ apt-get install -y --no-install-recommends --force-yes make bison flex -wget ftp://icarus.com/pub/eda/verilog/v10/verilog-10.1.tar.gz +wget -q ftp://icarus.com/pub/eda/verilog/v10/verilog-10.1.tar.gz tar xf verilog-10.1.tar.gz cd verilog-10.1 ./configure --prefix=/usr diff --git a/docker/install/ubuntu_install_llvm.sh b/docker/install/ubuntu_install_llvm.sh index 16d0fe150b7e..6a20fb227d38 100644 --- a/docker/install/ubuntu_install_llvm.sh +++ b/docker/install/ubuntu_install_llvm.sh @@ -18,5 +18,5 @@ echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial main\ echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial main\ >> /etc/apt/sources.list.d/llvm.list -wget -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - +wget -q -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - apt-get update && apt-get install -y --force-yes llvm-4.0 llvm-5.0 llvm-6.0 clang-6.0 diff --git a/docker/install/ubuntu_install_nodejs.sh b/docker/install/ubuntu_install_nodejs.sh index 75d367dfa98f..fd43b4149af4 100644 --- 
a/docker/install/ubuntu_install_nodejs.sh +++ b/docker/install/ubuntu_install_nodejs.sh @@ -1,4 +1,4 @@ apt-get update && apt-get install -y curl -curl -sL https://deb.nodesource.com/setup_6.x | bash - +curl -s -S -L https://deb.nodesource.com/setup_6.x | bash - apt-get update && apt-get install -y nodejs npm install eslint jsdoc ws diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh index a34019e1003e..ec30e77fb400 100644 --- a/docker/install/ubuntu_install_python.sh +++ b/docker/install/ubuntu_install_python.sh @@ -2,11 +2,11 @@ apt-get update && apt-get install -y python-dev # python 3.6 -apt-get update && yes | apt-get install software-properties-common +apt-get update && apt-get install -y software-properties-common add-apt-repository ppa:jonathonf/python-3.6 &&\ apt-get update && apt-get install -y python-pip python-dev python3.6 python3.6-dev rm -f /usr/bin/python3 && ln -s /usr/bin/python3.6 /usr/bin/python3 # Install pip -cd /tmp && wget https://bootstrap.pypa.io/get-pip.py && python2 get-pip.py && python3.6 get-pip.py +cd /tmp && wget -q https://bootstrap.pypa.io/get-pip.py && python2 get-pip.py && python3.6 get-pip.py diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh index fed63d58a27b..ab75802c84ed 100644 --- a/docker/install/ubuntu_install_rust.sh +++ b/docker/install/ubuntu_install_rust.sh @@ -3,7 +3,7 @@ apt-get update && apt-get install -y --no-install-recommends --force-yes curl export RUSTUP_HOME=/opt/rust export CARGO_HOME=/opt/rust # this rustc is one supported by the installed version of rust-sgx-sdk -curl https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2019-01-28 +curl -s -S https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2019-01-28 . 
$CARGO_HOME/env rustup toolchain add nightly rustup component add rust-src diff --git a/docker/install/ubuntu_install_sgx.sh b/docker/install/ubuntu_install_sgx.sh index ca5f517849d8..aea93d294d27 100644 --- a/docker/install/ubuntu_install_sgx.sh +++ b/docker/install/ubuntu_install_sgx.sh @@ -8,7 +8,7 @@ apt-get update && apt-get install -y --no-install-recommends --force-yes \ git clone https://github.com/intel/linux-sgx.git cd linux-sgx git checkout sgx_2.2 -curl 'https://gist.githubusercontent.com/nhynes/c770b0e91610f8c020a8d1a803a1e7cb/raw/8f5372d9cb88929b3cc49a384943bb363bc06827/intel-sgx.patch' | git apply +curl -s -S 'https://gist.githubusercontent.com/nhynes/c770b0e91610f8c020a8d1a803a1e7cb/raw/8f5372d9cb88929b3cc49a384943bb363bc06827/intel-sgx.patch' | git apply ./download_prebuilt.sh make -j4 sdk && make -j4 sdk_install_pkg ./linux/installer/bin/sgx_linux_x64_sdk*.bin --prefix /opt @@ -17,5 +17,5 @@ cd - git clone https://github.com/baidu/rust-sgx-sdk.git /opt/rust-sgx-sdk cd /opt/rust-sgx-sdk git checkout 6098af # v1.0.5 -curl 'https://gist.githubusercontent.com/nhynes/37164039c5d3f33aa4f123e4ba720036/raw/b0de575fe937231799930764e76c664b92975163/rust-sgx-sdk.diff' | git apply +curl -s -S 'https://gist.githubusercontent.com/nhynes/37164039c5d3f33aa4f123e4ba720036/raw/b0de575fe937231799930764e76c664b92975163/rust-sgx-sdk.diff' | git apply cd - diff --git a/docker/install/ubuntu_install_tflite.sh b/docker/install/ubuntu_install_tflite.sh index 97235c4644f5..5df01f186c26 100644 --- a/docker/install/ubuntu_install_tflite.sh +++ b/docker/install/ubuntu_install_tflite.sh @@ -13,7 +13,7 @@ pip2 install flatbuffers # Setup tflite from schema mkdir tflite cd tflite -wget https://raw.githubusercontent.com/tensorflow/tensorflow/r1.12/tensorflow/contrib/lite/schema/schema.fbs +wget -q https://raw.githubusercontent.com/tensorflow/tensorflow/r1.12/tensorflow/contrib/lite/schema/schema.fbs flatc --python schema.fbs cat <setup.py diff --git 
a/docker/install/ubuntu_install_vulkan.sh b/docker/install/ubuntu_install_vulkan.sh index a4155da49651..72a6139905e6 100644 --- a/docker/install/ubuntu_install_vulkan.sh +++ b/docker/install/ubuntu_install_vulkan.sh @@ -1,6 +1,6 @@ #/bin/bash -wget https://sdk.lunarg.com/sdk/download/1.0.65.0/linux/vulkansdk-linux-x86_64-1.0.65.0.run +wget -q https://sdk.lunarg.com/sdk/download/1.0.65.0/linux/vulkansdk-linux-x86_64-1.0.65.0.run bash vulkansdk-linux-x86_64-1.0.65.0.run mv VulkanSDK /usr/local/VulkanSDK From 97b35c3bfdd7e914355871afd72e7b1765c1344e Mon Sep 17 00:00:00 2001 From: Wei Chen Date: Thu, 28 Feb 2019 10:25:27 -0800 Subject: [PATCH 44/93] [Relay][Doc] Separate arguments types formatting with comma (#2690) --- src/relay/op/type_relations.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc index 9152f0677616..0ae7ab2f9e33 100644 --- a/src/relay/op/type_relations.cc +++ b/src/relay/op/type_relations.cc @@ -90,8 +90,8 @@ bool BroadcastRel(const Array& types, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); - RELAY_LOG(INFO) << "In1: " << types[0] << "In2: " << types[1] - << "Out: " << types[2] << std::endl; + RELAY_LOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] + << ",Out:" << types[2] << std::endl; if (auto t0 = ToTensorType(types[0])) { if (auto t1 = ToTensorType(types[1])) { CHECK_EQ(t0->dtype, t1->dtype); @@ -108,8 +108,8 @@ bool BroadcastCompRel(const Array& types, const Attrs& attrs, const TypeReporter& reporter) { CHECK_EQ(types.size(), 3); - RELAY_LOG(INFO) << "In1: " << types[0] << "In2: " << types[1] - << "Out: " << types[2] << std::endl; + RELAY_LOG(INFO) << "In1:" << types[0] << ",In2:" << types[1] + << ",Out:" << types[2] << std::endl; if (auto t0 = ToTensorType(types[0])) { if (auto t1 = ToTensorType(types[1])) { CHECK_EQ(t0->dtype, t1->dtype); From d68ab9a623e69affe4a3f7d5688e1d63af844e09 Mon Sep 17 00:00:00 2001 From: 
MORITA Kazutaka Date: Fri, 1 Mar 2019 03:26:27 +0900 Subject: [PATCH 45/93] [DOC] MXNet frontend tutorial (#2688) --- tutorials/frontend/from_mxnet.py | 120 +++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 tutorials/frontend/from_mxnet.py diff --git a/tutorials/frontend/from_mxnet.py b/tutorials/frontend/from_mxnet.py new file mode 100644 index 000000000000..a465350a0df8 --- /dev/null +++ b/tutorials/frontend/from_mxnet.py @@ -0,0 +1,120 @@ +""" +.. _tutorial-from-mxnet: + +Compile MXNet Models +==================== +**Author**: `Joshua Z. Zhang `_, \ + `Kazutaka Morita `_ + +This article is an introductory tutorial to deploy mxnet models with Relay. + +For us to begin with, mxnet module is required to be installed. + +A quick solution is + +.. code-block:: bash + + pip install mxnet --user + +or please refer to offical installation guide. +https://mxnet.incubator.apache.org/versions/master/install/index.html +""" +# some standard imports +import mxnet as mx +import tvm +import tvm.relay as relay +import numpy as np + +###################################################################### +# Download Resnet18 model from Gluon Model Zoo +# --------------------------------------------- +# In this section, we download a pretrained imagenet model and classify an image. 
+from mxnet.gluon.model_zoo.vision import get_model +from mxnet.gluon.utils import download +from PIL import Image +from matplotlib import pyplot as plt +block = get_model('resnet18_v1', pretrained=True) +img_name = 'cat.png' +synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/', + '4d0b62f3d01426887599d4f7ede23ee5/raw/', + '596b27d23537e5a1b5751d2b0481ef172f58b539/', + 'imagenet1000_clsid_to_human.txt']) +synset_name = 'synset.txt' +download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name) +download(synset_url, synset_name) +with open(synset_name) as f: + synset = eval(f.read()) +image = Image.open(img_name).resize((224, 224)) +plt.imshow(image) +plt.show() + +def transform_image(image): + image = np.array(image) - np.array([123., 117., 104.]) + image /= np.array([58.395, 57.12, 57.375]) + image = image.transpose((2, 0, 1)) + image = image[np.newaxis, :] + return image + +x = transform_image(image) +print('x', x.shape) + +###################################################################### +# Compile the Graph +# ----------------- +# Now we would like to port the Gluon model to a portable computational graph. +# It's as easy as several lines. +# We support MXNet static graph(symbol) and HybridBlock in mxnet.gluon +shape_dict = {'data': x.shape} +func, params = relay.frontend.from_mxnet(block, shape_dict) +## we want a probability so add a softmax operator +func = relay.Function(func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs) + +###################################################################### +# now compile the graph +target = 'cuda' +with relay.build_config(opt_level=3): + graph, lib, params = relay.build(func, target, params=params) + +###################################################################### +# Execute the portable graph on TVM +# --------------------------------- +# Now, we would like to reproduce the same forward computation using TVM. 
+from tvm.contrib import graph_runtime +ctx = tvm.gpu(0) +dtype = 'float32' +m = graph_runtime.create(graph, lib, ctx) +# set inputs +m.set_input('data', tvm.nd.array(x.astype(dtype))) +m.set_input(**params) +# execute +m.run() +# get outputs +tvm_output = m.get_output(0) +top1 = np.argmax(tvm_output.asnumpy()[0]) +print('TVM prediction top-1:', top1, synset[top1]) + +###################################################################### +# Use MXNet symbol with pretrained weights +# ---------------------------------------- +# MXNet often use `arg_params` and `aux_params` to store network parameters +# separately, here we show how to use these weights with existing API +def block2symbol(block): + data = mx.sym.Variable('data') + sym = block(data) + args = {} + auxs = {} + for k, v in block.collect_params().items(): + args[k] = mx.nd.array(v.data().asnumpy()) + return sym, args, auxs +mx_sym, args, auxs = block2symbol(block) +# usually we would save/load it as checkpoint +mx.model.save_checkpoint('resnet18_v1', 0, mx_sym, args, auxs) +# there are 'resnet18_v1-0000.params' and 'resnet18_v1-symbol.json' on disk + +###################################################################### +# for a normal mxnet model, we start from here +mx_sym, args, auxs = mx.model.load_checkpoint('resnet18_v1', 0) +# now we use the same API to get Relay computation graph +relay_func, relay_params = relay.frontend.from_mxnet(mx_sym, shape_dict, + arg_params=args, aux_params=auxs) +# repeat the same steps to run this model using TVM From 6f193a9f464aea6c933805c0b24c07f6cab7b452 Mon Sep 17 00:00:00 2001 From: Ruslan Baratov Date: Thu, 28 Feb 2019 21:45:02 +0300 Subject: [PATCH 46/93] Few docs fixes (#2703) --- docs/install/from_source.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 81d06f1dc27f..7c0f5432ec94 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -5,7 +5,7 
@@ Install from Source This page gives instructions on how to build and install the tvm package from scratch on various systems. It consists of two steps: -1. First build the shared library from the C++ codes (`libtvm.so` for linux/osx and `libtvm.dll` for windows). +1. First build the shared library from the C++ codes (`libtvm.so` for linux, `libtvm.dylib` for macOS and `libtvm.dll` for windows). 2. Setup for the language packages (e.g. Python Package). To get started, clone tvm repo from github. It is important to clone the submodules along, with ``--recursive`` option. @@ -28,7 +28,7 @@ Build the Shared Library Our goal is to build the shared libraries: - On Linux the target library are `libtvm.so, libtvm_topi.so` -- On OSX the target library are `libtvm.dylib, libtvm_topi.dylib` +- On macOS the target library are `libtvm.dylib, libtvm_topi.dylib` - On Windows the target library are `libtvm.dll, libtvm_topi.dll` @@ -60,7 +60,7 @@ The configuration of tvm can be modified by `config.cmake`. - Edit ``build/config.cmake`` to customize the compilation options - - On macOS, for some versions of XCode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors. + - On macOS, for some versions of Xcode, you need to add ``-lc++abi`` in the LDFLAGS or you'll get link errors. - Change ``set(USE_CUDA OFF)`` to ``set(USE_CUDA ON)`` to enable CUDA backend. So do other backends and libraries (OpenCL, RCOM, METAL, VULKAN, ...). 
From 27475e769f023d9fe0c630cdec9476c80ebf2d67 Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Fri, 1 Mar 2019 02:35:01 +0000 Subject: [PATCH 47/93] Pin pylint version 2.2.2 (#2698) --- docker/Dockerfile.ci_lint | 2 +- docker/install/ubuntu_install_python_package.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.ci_lint b/docker/Dockerfile.ci_lint index 132e8ebb7df9..461a5f1f1135 100644 --- a/docker/Dockerfile.ci_lint +++ b/docker/Dockerfile.ci_lint @@ -6,4 +6,4 @@ RUN apt-get update && apt-get install -y sudo wget COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh RUN bash /install/ubuntu_install_python.sh RUN apt-get install -y doxygen graphviz -RUN pip3 install cpplint pylint mypy +RUN pip3 install cpplint pylint==2.2.2 mypy diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index da8ade668619..3e54271afa48 100644 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -1,3 +1,3 @@ # install libraries for python package on ubuntu -pip2 install nose pylint six numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime attrs -pip3 install nose pylint six numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime attrs +pip2 install nose pylint==2.2.2 six numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime attrs +pip3 install nose pylint==2.2.2 six numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime attrs From e8d54a056b4aae8d47390ae6c78fd1786628d383 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Fri, 1 Mar 2019 09:23:43 -0800 Subject: [PATCH 48/93] [Relay] fix checkwellform (#2705) * do * address comment --- src/relay/pass/well_formed.cc | 11 ++++++++- tests/python/relay/test_ir_well_formed.py | 27 
+++++++++++++++++++---- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/relay/pass/well_formed.cc b/src/relay/pass/well_formed.cc index d9c6b617ca5f..159e073673da 100644 --- a/src/relay/pass/well_formed.cc +++ b/src/relay/pass/well_formed.cc @@ -5,6 +5,7 @@ */ #include #include +#include #include namespace tvm { @@ -12,7 +13,7 @@ namespace relay { //! brief make sure each Var is bind at most once. -class WellFormedChecker : private ExprVisitor { +class WellFormedChecker : private ExprVisitor, PatternVisitor { bool well_formed = true; std::unordered_set s; @@ -39,6 +40,14 @@ class WellFormedChecker : private ExprVisitor { CheckWellFormed(f->body); } + void VisitPattern(const Pattern& p) final { + PatternVisitor::VisitPattern(p); + } + + void VisitVar(const Var& v) final { + Check(v); + } + public: bool CheckWellFormed(const Expr& e) { this->VisitExpr(e); diff --git a/tests/python/relay/test_ir_well_formed.py b/tests/python/relay/test_ir_well_formed.py index 725b2fbd3c3d..b9e907144785 100644 --- a/tests/python/relay/test_ir_well_formed.py +++ b/tests/python/relay/test_ir_well_formed.py @@ -1,9 +1,10 @@ import tvm from tvm import relay from tvm.relay.ir_pass import well_formed +from tvm.relay.prelude import Prelude -def test_well_formed(): - x = relay.Var('x') +def test_let(): + x = relay.Var("x") assert well_formed(x) v = relay.Constant(tvm.nd.array(10)) ty = None @@ -18,7 +19,7 @@ def test_well_formed(): def test_tuple(): - x = relay.Var('x') + x = relay.Var("x") assert well_formed(x) v = relay.Constant(tvm.nd.array(10)) let = relay.Let(x, v, x) @@ -28,5 +29,23 @@ def test_tuple(): def test_tuple_get_item(): - t = relay.Var('t') + t = relay.Var("t") assert well_formed(relay.TupleGetItem(t, 2)) + + +def test_adt(): + mod = relay.Module() + p = Prelude(mod) + x = relay.Var("x") + s_case = relay.Clause(relay.PatternConstructor(p.s, [relay.PatternVar(x)]), x) + default_case = relay.Clause(relay.PatternVar(x), x) + m0 = relay.Match(p.z(), 
[default_case]) + m1 = relay.Match(p.z(), [s_case, default_case]) + assert well_formed(m0) + assert not well_formed(m1) + +if __name__ == "__main__": + test_let() + test_tuple() + test_tuple_get_item() + test_adt() From cc10159218bb38a0321c0b0b068e65437c07a4f2 Mon Sep 17 00:00:00 2001 From: Hao Jin Date: Fri, 1 Mar 2019 09:26:45 -0800 Subject: [PATCH 49/93] support MXNet _minimum and _maximum (#2709) --- nnvm/python/nnvm/frontend/mxnet.py | 9 ++- .../python/frontend/mxnet/test_forward.py | 64 +++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index d8855693e7d5..0b994861deef 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -294,10 +294,15 @@ def _symbol_ring_buffer(inputs, attrs): def _copy(inputs, _): return _get_nnvm_op('copy')(inputs[0], **{}) - def _argmax(inputs, attrs): return _get_nnvm_op('argmax')(*inputs, **attrs) +def _minimum(inputs, attrs): + return _get_nnvm_op('broadcast_min')(*inputs, **attrs) + +def _maximum(inputs, attrs): + return _get_nnvm_op('broadcast_max')(*inputs, **attrs) + def _ones(_, attrs): op_name = 'ones' return _get_nnvm_op(op_name)(**attrs) @@ -342,6 +347,8 @@ def _argmin(inputs, attrs): '_rminus_scalar': _rename('__rsub_scalar__'), '_contrib_MultiBoxPrior' : _rename('multibox_prior'), '_contrib_MultiBoxDetection' : _contrib_multibox_detection, + '_minimum' : _minimum, + '_maximum' : _maximum, '_ones' : _ones, '_zeros' : _zeros, 'argmax' : _argmax, diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index 97ffa20b3edc..e046f39f02ca 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -227,6 +227,68 @@ def test_forward_slice(): mx_sym = mx.sym.slice(data, begin=(-1, 1), end=(-3, 4), step=(-1, 2)) verify_mxnet_frontend_impl(mx_sym, (3, 4), (2, 2)) +def 
test_forward_maximum(): + a = mx.sym.var('a') + b = mx.sym.var('b') + dshape = (10, 20) + dtype = 'float32' + mx_sym = mx.sym._internal._maximum(a, b) + np_a = np.random.uniform(size=dshape).astype(dtype) + np_b = np.random.uniform(size=dshape).astype(dtype) + mx_a = mx.nd.array(np_a) + mx_b = mx.nd.array(np_b) + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['a', 'b']) + mod.bind(data_shapes=[('a', dshape), ('b', dshape)], for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = mx.nd._internal._maximum(mx_a, mx_b).asnumpy() + out_shape = dshape + new_sym, params = frontend.from_mxnet(mx_sym, args, auxs) + shape_dict = {'a': dshape, 'b': dshape} + for target, ctx in ctx_list(): + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = nnvm.compiler.build(new_sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("a", tvm.nd.array(np_a)) + m.set_input("b", tvm.nd.array(np_b)) + m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + +def test_forward_minimum(): + a = mx.sym.var('a') + b = mx.sym.var('b') + dshape = (10, 20) + dtype = 'float32' + mx_sym = mx.sym._internal._minimum(a, b) + np_a = np.random.uniform(size=dshape).astype(dtype) + np_b = np.random.uniform(size=dshape).astype(dtype) + mx_a = mx.nd.array(np_a) + mx_b = mx.nd.array(np_b) + mod = mx.mod.Module(mx_sym, label_names=None, data_names=['a', 'b']) + mod.bind(data_shapes=[('a', dshape), ('b', dshape)], for_training=False) + mod.init_params() + args, auxs = mod.get_params() + mx_out = mx.nd._internal._minimum(mx_a, mx_b).asnumpy() + out_shape = dshape + new_sym, params = frontend.from_mxnet(mx_sym, args, auxs) + shape_dict = {'a': dshape, 'b': dshape} + for target, ctx in ctx_list(): + with nnvm.compiler.build_config(opt_level=3): + graph, lib, params = 
nnvm.compiler.build(new_sym, target, shape_dict, params=params) + m = graph_runtime.create(graph, lib, ctx) + # set inputs + m.set_input("a", tvm.nd.array(np_a)) + m.set_input("b", tvm.nd.array(np_b)) + m.set_input(**params) + m.run() + # get outputs + tvm_out = m.get_output(0, tvm.nd.empty(out_shape, dtype)).asnumpy() + tvm.testing.assert_allclose(mx_out, tvm_out, rtol=1e-5, atol=1e-5) + if __name__ == '__main__': test_forward_mlp() @@ -251,4 +313,6 @@ def test_forward_slice(): test_forward_argmin() test_forward_where() test_forward_slice() + test_forward_maximum() + test_forward_minimum() From 851fbbd1094ecc146e703d39f7c5784baf2f1d91 Mon Sep 17 00:00:00 2001 From: eqy Date: Fri, 1 Mar 2019 09:44:27 -0800 Subject: [PATCH 50/93] [TOPI][Relay] Fix default `out_dtype` for `conv2d_NCHWc` and Relay (#2707) --- topi/python/topi/x86/conv2d.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py index e8ccee8bd818..7bad04ddcd46 100644 --- a/topi/python/topi/x86/conv2d.py +++ b/topi/python/topi/x86/conv2d.py @@ -294,6 +294,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F): padding = attrs.get_int_tuple("padding") strides = attrs.get_int_tuple("strides") dilation = attrs.get_int_tuple("dilation") + out_dtype = attrs["out_dtype"] layout_name = 'layout' if F == sym else 'data_layout' @@ -301,7 +302,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F): kh, kw = attrs.get_int_tuple("kernel_size") dtype = data.dtype - out_dtype = dtype if attrs["out_dtype"] == "same" else attrs["out_dtype"] + out_dtype = dtype if out_dtype in ("same", "") else out_dtype is_depthwise = groups == in_channel and groups == out_channel # only optimize for NCHW From 240910cb3b8cce611ef841ab233471a60fc9dffb Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Fri, 1 Mar 2019 18:09:35 +0000 Subject: [PATCH 51/93] Improve task_lint.sh robustness (#2711) * [SCRIPT] Refactor grep for multiple patterns Tidy up the use of 
grep. Use -E rather than run multiple grep instances. * [SCRIPT] Refactor grep use in pipeline. Prefer to use stdin redirection rather than create a pipeline. * [SCRIPT] Refactor placement and cleanup of temporary files. Place temporary files in the conventional /tmp location. Avoid poisoning file name space by using $$. Ensure the temporary files get cleaned up, even when the script fails / exits early. * [SCRIPT] Improve robustness of task_lint.sh error handling. Ensure script failures are caught and propagated. Rather than trying to explicitly catch and propagate failures with explicit "|| exit" annotations, use the "set -e" idom from docker/install scripts and have the shell catch and propagate errors in the general case and special case the grep instances where non zero exit is permitted and should be ignored. --- tests/scripts/task_lint.sh | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/tests/scripts/task_lint.sh b/tests/scripts/task_lint.sh index 95f700172ed5..318671b082e6 100755 --- a/tests/scripts/task_lint.sh +++ b/tests/scripts/task_lint.sh @@ -1,17 +1,26 @@ #!/bin/bash + +set -e +set -u +set -o pipefail + +cleanup() +{ + rm -rf /tmp/$$.* +} +trap cleanup 0 + echo "Check codestyle of c++ code..." -make cpplint || exit -1 +make cpplint echo "Check codestyle of python code..." -make pylint || exit -1 +make pylint echo "Check codestyle of jni code..." -make jnilint || exit -1 +make jnilint echo "Check documentations of c++ code..." 
-make doc 2>log.txt -(cat log.txt| grep -v ENABLE_PREPROCESSING |grep -v "unsupported tag") > logclean.txt +make doc 2>/tmp/$$.log.txt + +grep -v -E "ENABLE_PREPROCESSING|unsupported tag" < /tmp/$$.log.txt > /tmp/$$.logclean.txt || true echo "---------Error Log----------" -cat logclean.txt +cat /tmp/$$.logclean.txt echo "----------------------------" -(cat logclean.txt|grep warning) && exit -1 -(cat logclean.txt|grep error) && exit -1 -rm logclean.txt -rm log.txt +grep -E "warning|error" < /tmp/$$.logclean.txt || true From 548dcd97fc99896c8ce533cc01aa2eff3a568cf4 Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Fri, 1 Mar 2019 18:12:24 +0000 Subject: [PATCH 52/93] Docker build script robustness (#2710) * [DOCKER] Make all install .sh scripts directly executable. * [DOCKER] Use curl -L consistently. Make the use of the curl -L option in docker build scripts consistent. * [DOCKER] Drop use of --force-yes The --force-yes option is generally not recommend, it can leave systems in an undefined state. The use of --allow-* options is preferred. In this particular case the --force-yes option appears to serve no purpose. Dropping it. * [DOCKER] Drop superflous repeated apt-get update. The "apt-get update && apt-get install" idiom is necessary and specific to Dockerfile. In shell the repeated apt-get update is superflous. Drop the duplicates. * [DOCKER] Robustness -e -u -o pipefail The install scripts used to construct docker environments do not, in general, propagate errors. Some of the scripts use adhoc && directives to chain together short sequences of commands but there are numerous failure modes which are silently ignored. This patch puts in place some consistent, basic, shell error trapping across all of the install scripts. Note this is a step forward towards more robust scripts but it is not a complete solution. * [DOCKER] Shallow clone. Use shallow clone to reduce bandwidth requirements of repeated docker (re)-builds. 
* [DOCKER] Use clone --branch rather than clone then checkout Use the git clone --branch idiom rather than git clone && git checkout. This paves the way for using --depth=1 --- docker/Dockerfile.ci_gpu | 2 +- docker/Dockerfile.demo_opencl | 2 +- docker/install/install_tvm_cpu.sh | 8 +++++++- docker/install/install_tvm_gpu.sh | 8 +++++++- docker/install/ubuntu_install_androidsdk.sh | 5 ++++- docker/install/ubuntu_install_antlr.sh | 6 ++++++ docker/install/ubuntu_install_caffe2.sh | 6 ++++++ docker/install/ubuntu_install_core.sh | 8 +++++++- docker/install/ubuntu_install_coreml.sh | 6 ++++++ docker/install/ubuntu_install_darknet.sh | 6 ++++++ docker/install/ubuntu_install_emscripten.sh | 6 ++++++ docker/install/ubuntu_install_gluoncv.sh | 6 ++++++ docker/install/ubuntu_install_golang.sh | 13 ++++++++++--- docker/install/ubuntu_install_gradle.sh | 3 +++ docker/install/ubuntu_install_iverilog.sh | 8 +++++++- docker/install/ubuntu_install_java.sh | 4 ++++ docker/install/ubuntu_install_keras.sh | 6 ++++++ docker/install/ubuntu_install_llvm.sh | 8 +++++++- docker/install/ubuntu_install_mxnet.sh | 6 ++++++ docker/install/ubuntu_install_nnpack.sh | 12 +++++++----- docker/install/ubuntu_install_nodejs.sh | 16 ++++++++++++++-- docker/install/ubuntu_install_onnx.sh | 6 ++++++ docker/install/ubuntu_install_opencl.sh | 8 +++++++- docker/install/ubuntu_install_opengl.sh | 10 ++++++++-- docker/install/ubuntu_install_python.sh | 17 +++++++++++++---- .../install/ubuntu_install_python_package.sh | 6 ++++++ docker/install/ubuntu_install_redis.sh | 6 ++++++ docker/install/ubuntu_install_rocm.sh | 8 +++++++- docker/install/ubuntu_install_rust.sh | 10 ++++++++-- docker/install/ubuntu_install_sgx.sh | 19 ++++++++++++------- docker/install/ubuntu_install_sphinx.sh | 6 ++++++ docker/install/ubuntu_install_tensorflow.sh | 6 ++++++ docker/install/ubuntu_install_tflite.sh | 8 +++++++- docker/install/ubuntu_install_vulkan.sh | 6 +++++- 34 files changed, 224 insertions(+), 37 deletions(-) mode 
change 100644 => 100755 docker/install/install_tvm_cpu.sh mode change 100644 => 100755 docker/install/install_tvm_gpu.sh mode change 100644 => 100755 docker/install/ubuntu_install_androidsdk.sh mode change 100644 => 100755 docker/install/ubuntu_install_antlr.sh mode change 100644 => 100755 docker/install/ubuntu_install_caffe2.sh mode change 100644 => 100755 docker/install/ubuntu_install_core.sh mode change 100644 => 100755 docker/install/ubuntu_install_coreml.sh mode change 100644 => 100755 docker/install/ubuntu_install_darknet.sh mode change 100644 => 100755 docker/install/ubuntu_install_emscripten.sh mode change 100644 => 100755 docker/install/ubuntu_install_gluoncv.sh mode change 100644 => 100755 docker/install/ubuntu_install_golang.sh mode change 100644 => 100755 docker/install/ubuntu_install_gradle.sh mode change 100644 => 100755 docker/install/ubuntu_install_iverilog.sh mode change 100644 => 100755 docker/install/ubuntu_install_java.sh mode change 100644 => 100755 docker/install/ubuntu_install_keras.sh mode change 100644 => 100755 docker/install/ubuntu_install_llvm.sh mode change 100644 => 100755 docker/install/ubuntu_install_mxnet.sh mode change 100644 => 100755 docker/install/ubuntu_install_nnpack.sh mode change 100644 => 100755 docker/install/ubuntu_install_nodejs.sh mode change 100644 => 100755 docker/install/ubuntu_install_onnx.sh mode change 100644 => 100755 docker/install/ubuntu_install_opencl.sh mode change 100644 => 100755 docker/install/ubuntu_install_opengl.sh mode change 100644 => 100755 docker/install/ubuntu_install_python.sh mode change 100644 => 100755 docker/install/ubuntu_install_python_package.sh mode change 100644 => 100755 docker/install/ubuntu_install_redis.sh mode change 100644 => 100755 docker/install/ubuntu_install_rocm.sh mode change 100644 => 100755 docker/install/ubuntu_install_rust.sh mode change 100644 => 100755 docker/install/ubuntu_install_sgx.sh mode change 100644 => 100755 docker/install/ubuntu_install_sphinx.sh mode change 
100644 => 100755 docker/install/ubuntu_install_tensorflow.sh mode change 100644 => 100755 docker/install/ubuntu_install_tflite.sh mode change 100644 => 100755 docker/install/ubuntu_install_vulkan.sh diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index 6a599b1e3917..a83d7000d0fe 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -24,7 +24,7 @@ COPY install/ubuntu_install_sphinx.sh /install/ubuntu_install_sphinx.sh RUN bash /install/ubuntu_install_sphinx.sh # Fix recommonmark to latest version -RUN git clone https://github.com/rtfd/recommonmark +RUN git clone --depth=1 https://github.com/rtfd/recommonmark RUN cd recommonmark; python3 setup.py install # Enable doxygen for c++ doc build diff --git a/docker/Dockerfile.demo_opencl b/docker/Dockerfile.demo_opencl index 460b901bf08f..2d0b45983902 100644 --- a/docker/Dockerfile.demo_opencl +++ b/docker/Dockerfile.demo_opencl @@ -45,7 +45,7 @@ RUN echo "Cloning TVM source & submodules" ENV TVM_PAR_DIR="/usr" RUN mkdir -p TVM_PAR_DIR && \ cd ${TVM_PAR_DIR} && \ - git clone https://github.com/dmlc/tvm --recursive + git clone --depth=1 https://github.com/dmlc/tvm --recursive #RUN git submodule update --init --recursive diff --git a/docker/install/install_tvm_cpu.sh b/docker/install/install_tvm_cpu.sh old mode 100644 new mode 100755 index 461ad244d37c..04153559d27e --- a/docker/install/install_tvm_cpu.sh +++ b/docker/install/install_tvm_cpu.sh @@ -1,5 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + cd /usr -git clone https://github.com/dmlc/tvm --recursive +git clone --depth=1 https://github.com/dmlc/tvm --recursive cd /usr/tvm echo set\(USE_LLVM llvm-config-6.0\) >> config.cmake echo set\(USE_RPC ON\) >> config.cmake diff --git a/docker/install/install_tvm_gpu.sh b/docker/install/install_tvm_gpu.sh old mode 100644 new mode 100755 index 8a1324646fd5..d31e10ce9ab9 --- a/docker/install/install_tvm_gpu.sh +++ b/docker/install/install_tvm_gpu.sh @@ -1,5 +1,11 @@ +#!/bin/bash + +set 
-e +set -u +set -o pipefail + cd /usr -git clone https://github.com/dmlc/tvm --recursive +git clone --depth=1 https://github.com/dmlc/tvm --recursive cd /usr/tvm echo set\(USE_LLVM llvm-config-6.0\) >> config.cmake echo set\(USE_CUDA ON\) >> config.cmake diff --git a/docker/install/ubuntu_install_androidsdk.sh b/docker/install/ubuntu_install_androidsdk.sh old mode 100644 new mode 100755 index fa21d57c410f..96fdbe168d6d --- a/docker/install/ubuntu_install_androidsdk.sh +++ b/docker/install/ubuntu_install_androidsdk.sh @@ -1,6 +1,9 @@ +#!/bin/bash + . /etc/profile set -o errexit -o nounset +set -o pipefail ANDROID_HOME=/opt/android-sdk-linux ASDKTOOLS_HOME=/opt/android-sdk-tools @@ -58,7 +61,7 @@ EOF mkdir /root/.android 2>/dev/null || true touch /root/.android/repositories.cfg -yes | sdkmanager --licenses --sdk_root="$ANDROID_HOME" +(yes || true) | sdkmanager --licenses --sdk_root="$ANDROID_HOME" sdkmanager --verbose --package_file=/install/package-list-minimal.txt --sdk_root="$ANDROID_HOME" test -d "${ANDROID_HOME}/build-tools/27.0.3" test -d "${ANDROID_HOME}/ndk-bundle" diff --git a/docker/install/ubuntu_install_antlr.sh b/docker/install/ubuntu_install_antlr.sh old mode 100644 new mode 100755 index 6eb004213778..6dae3ae12d56 --- a/docker/install/ubuntu_install_antlr.sh +++ b/docker/install/ubuntu_install_antlr.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + cd /usr/local/lib wget -q https://www.antlr.org/download/antlr-4.7.1-complete.jar cd - diff --git a/docker/install/ubuntu_install_caffe2.sh b/docker/install/ubuntu_install_caffe2.sh old mode 100644 new mode 100755 index 5fe827927e87..bb9322704918 --- a/docker/install/ubuntu_install_caffe2.sh +++ b/docker/install/ubuntu_install_caffe2.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + python3 -m caffe2.python.models.download -i -f squeezenet python3 -m caffe2.python.models.download -i -f resnet50 python3 -m caffe2.python.models.download -i -f vgg19 diff --git 
a/docker/install/ubuntu_install_core.sh b/docker/install/ubuntu_install_core.sh old mode 100644 new mode 100755 index efc69c946b97..c7e2918971fd --- a/docker/install/ubuntu_install_core.sh +++ b/docker/install/ubuntu_install_core.sh @@ -1,5 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # install libraries for building c++ core on ubuntu -apt-get update && apt-get install -y --no-install-recommends --force-yes \ +apt-get update && apt-get install -y --no-install-recommends \ git make libgtest-dev cmake wget unzip libtinfo-dev libz-dev\ libcurl4-openssl-dev libopenblas-dev g++ sudo diff --git a/docker/install/ubuntu_install_coreml.sh b/docker/install/ubuntu_install_coreml.sh old mode 100644 new mode 100755 index 4b0fd126c61d..51afc1423961 --- a/docker/install/ubuntu_install_coreml.sh +++ b/docker/install/ubuntu_install_coreml.sh @@ -1 +1,7 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip3 install coremltools diff --git a/docker/install/ubuntu_install_darknet.sh b/docker/install/ubuntu_install_darknet.sh old mode 100644 new mode 100755 index ecfb19626bc5..5c350b848bf7 --- a/docker/install/ubuntu_install_darknet.sh +++ b/docker/install/ubuntu_install_darknet.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + #install the necessary dependancies, cffi, opencv wget -q 'https://github.com/siju-samuel/darknet/blob/master/lib/libdarknet.so?raw=true' -O libdarknet.so pip2 install opencv-python cffi diff --git a/docker/install/ubuntu_install_emscripten.sh b/docker/install/ubuntu_install_emscripten.sh old mode 100644 new mode 100755 index 4902538c9cfa..4671c898438a --- a/docker/install/ubuntu_install_emscripten.sh +++ b/docker/install/ubuntu_install_emscripten.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + alias make="make -j4" # Get latest cmake diff --git a/docker/install/ubuntu_install_gluoncv.sh b/docker/install/ubuntu_install_gluoncv.sh old mode 100644 new mode 100755 index 0ca1a34cbc24..adfbdce7c7b1 --- 
a/docker/install/ubuntu_install_gluoncv.sh +++ b/docker/install/ubuntu_install_gluoncv.sh @@ -1 +1,7 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip3 install gluoncv diff --git a/docker/install/ubuntu_install_golang.sh b/docker/install/ubuntu_install_golang.sh old mode 100644 new mode 100755 index 2361ccfbd2e4..c29e764cbb3a --- a/docker/install/ubuntu_install_golang.sh +++ b/docker/install/ubuntu_install_golang.sh @@ -1,4 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + #install the necessary dependancies for golang build -apt-get update && apt-get install -y golang-1.10-go -apt-get update && apt-get install -y golang-1.10-doc -apt-get update && apt-get install -y golint +apt-get update +apt-get install -y golang-1.10-go +apt-get install -y golang-1.10-doc +apt-get install -y golint diff --git a/docker/install/ubuntu_install_gradle.sh b/docker/install/ubuntu_install_gradle.sh old mode 100644 new mode 100755 index 9cc3a170e8ea..7f62406ca710 --- a/docker/install/ubuntu_install_gradle.sh +++ b/docker/install/ubuntu_install_gradle.sh @@ -1,6 +1,9 @@ +#!/bin/bash + . 
/etc/profile set -o errexit -o nounset +set -o pipefail GRADLE_HOME=/opt/gradle GRADLE_VERSION=4.10-rc-2 diff --git a/docker/install/ubuntu_install_iverilog.sh b/docker/install/ubuntu_install_iverilog.sh old mode 100644 new mode 100755 index 358bf9dc8376..2304f697affd --- a/docker/install/ubuntu_install_iverilog.sh +++ b/docker/install/ubuntu_install_iverilog.sh @@ -1,4 +1,10 @@ -apt-get install -y --no-install-recommends --force-yes make bison flex +#!/bin/bash + +set -e +set -u +set -o pipefail + +apt-get install -y --no-install-recommends make bison flex wget -q ftp://icarus.com/pub/eda/verilog/v10/verilog-10.1.tar.gz tar xf verilog-10.1.tar.gz cd verilog-10.1 diff --git a/docker/install/ubuntu_install_java.sh b/docker/install/ubuntu_install_java.sh old mode 100644 new mode 100755 index 462edc491627..e1f431bee845 --- a/docker/install/ubuntu_install_java.sh +++ b/docker/install/ubuntu_install_java.sh @@ -1,4 +1,8 @@ +#!/bin/bash + set -o errexit -o nounset +set -o pipefail + apt-get update && apt-get install -y openjdk-8-jdk maven test -d "/usr/lib/jvm/java-8-openjdk-amd64/jre" echo "export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre" >> /etc/profile diff --git a/docker/install/ubuntu_install_keras.sh b/docker/install/ubuntu_install_keras.sh old mode 100644 new mode 100755 index 33bc38c80972..b689949d0dff --- a/docker/install/ubuntu_install_keras.sh +++ b/docker/install/ubuntu_install_keras.sh @@ -1,2 +1,8 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip2 install keras tensorflow h5py pip3 install keras tensorflow h5py diff --git a/docker/install/ubuntu_install_llvm.sh b/docker/install/ubuntu_install_llvm.sh old mode 100644 new mode 100755 index 6a20fb227d38..a562c3258628 --- a/docker/install/ubuntu_install_llvm.sh +++ b/docker/install/ubuntu_install_llvm.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + echo deb http://apt.llvm.org/xenial/ llvm-toolchain-xenial-4.0 main\ >> /etc/apt/sources.list.d/llvm.list echo deb-src 
http://apt.llvm.org/xenial/ llvm-toolchain-xenial-4.0 main\ @@ -19,4 +25,4 @@ echo deb-src http://apt.llvm.org/xenial/ llvm-toolchain-xenial main\ >> /etc/apt/sources.list.d/llvm.list wget -q -O - http://apt.llvm.org/llvm-snapshot.gpg.key|sudo apt-key add - -apt-get update && apt-get install -y --force-yes llvm-4.0 llvm-5.0 llvm-6.0 clang-6.0 +apt-get update && apt-get install -y llvm-4.0 llvm-5.0 llvm-6.0 clang-6.0 diff --git a/docker/install/ubuntu_install_mxnet.sh b/docker/install/ubuntu_install_mxnet.sh old mode 100644 new mode 100755 index 0e7e9e3939a8..a15dca7def07 --- a/docker/install/ubuntu_install_mxnet.sh +++ b/docker/install/ubuntu_install_mxnet.sh @@ -1 +1,7 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip3 install mxnet diff --git a/docker/install/ubuntu_install_nnpack.sh b/docker/install/ubuntu_install_nnpack.sh old mode 100644 new mode 100755 index 83225d4aa820..1cf044a9b257 --- a/docker/install/ubuntu_install_nnpack.sh +++ b/docker/install/ubuntu_install_nnpack.sh @@ -1,11 +1,13 @@ -apt-get update && apt-get install -y --no-install-recommends --force-yes git cmake +#!/bin/bash +set -e +set -u +set -o pipefail + +apt-get update && apt-get install -y --no-install-recommends git cmake -git clone https://github.com/Maratyszcza/NNPACK NNPACK -cd NNPACK # TODO: specific tag? 
-git checkout 1e005b0c2 -cd - +git clone --branch=1e005b0c2 --depth=1 https://github.com/Maratyszcza/NNPACK NNPACK mkdir -p NNPACK/build cd NNPACK/build diff --git a/docker/install/ubuntu_install_nodejs.sh b/docker/install/ubuntu_install_nodejs.sh old mode 100644 new mode 100755 index fd43b4149af4..dfdd0432e4db --- a/docker/install/ubuntu_install_nodejs.sh +++ b/docker/install/ubuntu_install_nodejs.sh @@ -1,4 +1,16 @@ -apt-get update && apt-get install -y curl +#!/bin/bash + +set -e +set -u +set -o pipefail + +apt-get update +apt-get install -y curl + +# The node install script fetched and executed here will update the +# apt source list, hence the second apt-get update is necessary. curl -s -S -L https://deb.nodesource.com/setup_6.x | bash - -apt-get update && apt-get install -y nodejs +apt-get update +apt-get install -y nodejs + npm install eslint jsdoc ws diff --git a/docker/install/ubuntu_install_onnx.sh b/docker/install/ubuntu_install_onnx.sh old mode 100644 new mode 100755 index 517ea77ab81e..2778a2489667 --- a/docker/install/ubuntu_install_onnx.sh +++ b/docker/install/ubuntu_install_onnx.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # fix to certain version for now pip2 install onnx>=1.1.0 pip3 install onnx>=1.1.0 diff --git a/docker/install/ubuntu_install_opencl.sh b/docker/install/ubuntu_install_opencl.sh old mode 100644 new mode 100755 index ca4d1d04fd5c..f16de615c4b1 --- a/docker/install/ubuntu_install_opencl.sh +++ b/docker/install/ubuntu_install_opencl.sh @@ -1,5 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # Install OpenCL runtime in nvidia docker. 
-apt-get update && apt-get install -y --no-install-recommends --force-yes \ +apt-get update && apt-get install -y --no-install-recommends \ ocl-icd-opencl-dev \ clinfo && \ rm -rf /var/lib/apt/lists/* diff --git a/docker/install/ubuntu_install_opengl.sh b/docker/install/ubuntu_install_opengl.sh old mode 100644 new mode 100755 index f8be6e351581..82050c14f307 --- a/docker/install/ubuntu_install_opengl.sh +++ b/docker/install/ubuntu_install_opengl.sh @@ -1,4 +1,10 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + apt-get update --fix-missing -apt-get install -y --no-install-recommends --force-yes \ - libgl1-mesa-dev libglfw3-dev \ No newline at end of file +apt-get install -y --no-install-recommends \ + libgl1-mesa-dev libglfw3-dev diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh old mode 100644 new mode 100755 index ec30e77fb400..43c27b1b2def --- a/docker/install/ubuntu_install_python.sh +++ b/docker/install/ubuntu_install_python.sh @@ -1,10 +1,19 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # install python and pip, don't modify this, modify install_python_package.sh -apt-get update && apt-get install -y python-dev +apt-get update +apt-get install -y python-dev # python 3.6 -apt-get update && apt-get install -y software-properties-common -add-apt-repository ppa:jonathonf/python-3.6 &&\ - apt-get update && apt-get install -y python-pip python-dev python3.6 python3.6-dev +apt-get install -y software-properties-common + +add-apt-repository ppa:jonathonf/python-3.6 +apt-get update +apt-get install -y python-pip python-dev python3.6 python3.6-dev rm -f /usr/bin/python3 && ln -s /usr/bin/python3.6 /usr/bin/python3 diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh old mode 100644 new mode 100755 index 3e54271afa48..c15ff75f260e --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -1,3 +1,9 @@ 
+#!/bin/bash + +set -e +set -u +set -o pipefail + # install libraries for python package on ubuntu pip2 install nose pylint==2.2.2 six numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime attrs pip3 install nose pylint==2.2.2 six numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime attrs diff --git a/docker/install/ubuntu_install_redis.sh b/docker/install/ubuntu_install_redis.sh old mode 100644 new mode 100755 index dfc9a3c381b6..d079170b0536 --- a/docker/install/ubuntu_install_redis.sh +++ b/docker/install/ubuntu_install_redis.sh @@ -1,3 +1,9 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + apt-get update && apt-get install -y redis-server pip2 install xgboost psutil pip3 install xgboost psutil diff --git a/docker/install/ubuntu_install_rocm.sh b/docker/install/ubuntu_install_rocm.sh old mode 100644 new mode 100755 index d050c20078b8..be7f2364bf63 --- a/docker/install/ubuntu_install_rocm.sh +++ b/docker/install/ubuntu_install_rocm.sh @@ -1,4 +1,10 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # Install ROCm cross compilation toolchain. 
wget -qO - http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | sudo apt-key add - echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main > /etc/apt/sources.list.d/rocm.list -apt-get update && apt-get install -y --force-yes rocm-dev +apt-get update && apt-get install -y rocm-dev diff --git a/docker/install/ubuntu_install_rust.sh b/docker/install/ubuntu_install_rust.sh old mode 100644 new mode 100755 index ab75802c84ed..67bcd15cbc84 --- a/docker/install/ubuntu_install_rust.sh +++ b/docker/install/ubuntu_install_rust.sh @@ -1,9 +1,15 @@ -apt-get update && apt-get install -y --no-install-recommends --force-yes curl +#!/bin/bash + +set -e +set -u +set -o pipefail + +apt-get update && apt-get install -y --no-install-recommends curl export RUSTUP_HOME=/opt/rust export CARGO_HOME=/opt/rust # this rustc is one supported by the installed version of rust-sgx-sdk -curl -s -S https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2019-01-28 +curl -s -S -L https://sh.rustup.rs -sSf | sh -s -- -y --no-modify-path --default-toolchain nightly-2019-01-28 . 
$CARGO_HOME/env rustup toolchain add nightly rustup component add rust-src diff --git a/docker/install/ubuntu_install_sgx.sh b/docker/install/ubuntu_install_sgx.sh old mode 100644 new mode 100755 index aea93d294d27..d2958e5d0893 --- a/docker/install/ubuntu_install_sgx.sh +++ b/docker/install/ubuntu_install_sgx.sh @@ -1,21 +1,26 @@ -apt-get update && apt-get install -y --no-install-recommends --force-yes \ +#!/bin/bash + +set -e +set -u +set -o pipefail + +apt-get update && apt-get install -y --no-install-recommends \ build-essential git cmake \ wget python pkg-config software-properties-common \ autoconf automake libtool ocaml \ protobuf-compiler libprotobuf-dev \ libssl-dev libcurl4-openssl-dev curl -git clone https://github.com/intel/linux-sgx.git +git clone --branch=sgx_2.2 --depth=1 https://github.com/intel/linux-sgx.git cd linux-sgx -git checkout sgx_2.2 -curl -s -S 'https://gist.githubusercontent.com/nhynes/c770b0e91610f8c020a8d1a803a1e7cb/raw/8f5372d9cb88929b3cc49a384943bb363bc06827/intel-sgx.patch' | git apply +curl -s -S -L 'https://gist.githubusercontent.com/nhynes/c770b0e91610f8c020a8d1a803a1e7cb/raw/8f5372d9cb88929b3cc49a384943bb363bc06827/intel-sgx.patch' | git apply ./download_prebuilt.sh make -j4 sdk && make -j4 sdk_install_pkg ./linux/installer/bin/sgx_linux_x64_sdk*.bin --prefix /opt cd - -git clone https://github.com/baidu/rust-sgx-sdk.git /opt/rust-sgx-sdk +tag=6098af # v1.0.5 +git clone --branch=$tag --depth=1 https://github.com/baidu/rust-sgx-sdk.git /opt/rust-sgx-sdk cd /opt/rust-sgx-sdk -git checkout 6098af # v1.0.5 -curl -s -S 'https://gist.githubusercontent.com/nhynes/37164039c5d3f33aa4f123e4ba720036/raw/b0de575fe937231799930764e76c664b92975163/rust-sgx-sdk.diff' | git apply +curl -s -S -L 'https://gist.githubusercontent.com/nhynes/37164039c5d3f33aa4f123e4ba720036/raw/b0de575fe937231799930764e76c664b92975163/rust-sgx-sdk.diff' | git apply cd - diff --git a/docker/install/ubuntu_install_sphinx.sh b/docker/install/ubuntu_install_sphinx.sh old 
mode 100644 new mode 100755 index ba04c2e25e6f..50e1e92796c3 --- a/docker/install/ubuntu_install_sphinx.sh +++ b/docker/install/ubuntu_install_sphinx.sh @@ -1 +1,7 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip3 install sphinx sphinx-gallery sphinx_rtd_theme sphinx_autodoc_annotation matplotlib Image commonmark>=0.7.3 docutils>=0.11 diff --git a/docker/install/ubuntu_install_tensorflow.sh b/docker/install/ubuntu_install_tensorflow.sh old mode 100644 new mode 100755 index b773fcfb027b..4fdf9c0d46ab --- a/docker/install/ubuntu_install_tensorflow.sh +++ b/docker/install/ubuntu_install_tensorflow.sh @@ -1 +1,7 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + pip3 install tensorflow diff --git a/docker/install/ubuntu_install_tflite.sh b/docker/install/ubuntu_install_tflite.sh old mode 100644 new mode 100755 index 5df01f186c26..ed8ea1deff3f --- a/docker/install/ubuntu_install_tflite.sh +++ b/docker/install/ubuntu_install_tflite.sh @@ -1,5 +1,11 @@ +#!/bin/bash + +set -e +set -u +set -o pipefail + # Download, build and install flatbuffers -git clone --recursive https://github.com/google/flatbuffers.git +git clone --depth=1 --recursive https://github.com/google/flatbuffers.git cd flatbuffers cmake -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release make install -j8 diff --git a/docker/install/ubuntu_install_vulkan.sh b/docker/install/ubuntu_install_vulkan.sh old mode 100644 new mode 100755 index 72a6139905e6..6772b029cc90 --- a/docker/install/ubuntu_install_vulkan.sh +++ b/docker/install/ubuntu_install_vulkan.sh @@ -1,4 +1,8 @@ -#/bin/bash +#!/bin/bash + +set -e +set -u +set -o pipefail wget -q https://sdk.lunarg.com/sdk/download/1.0.65.0/linux/vulkansdk-linux-x86_64-1.0.65.0.run From e2ec7bdaab5a93ad5279d9afc10aec11857177d4 Mon Sep 17 00:00:00 2001 From: Hiroyuki Makino Date: Sat, 2 Mar 2019 03:19:49 +0900 Subject: [PATCH 53/93] [Doc] Relay tutorial - Deploy the Pretrained Model on Raspberry Pi (#2693) --- tutorials/relay/deploy_model_on_rasp.py | 207 
++++++++++++++++++++++++ 1 file changed, 207 insertions(+) create mode 100644 tutorials/relay/deploy_model_on_rasp.py diff --git a/tutorials/relay/deploy_model_on_rasp.py b/tutorials/relay/deploy_model_on_rasp.py new file mode 100644 index 000000000000..b90127b3858e --- /dev/null +++ b/tutorials/relay/deploy_model_on_rasp.py @@ -0,0 +1,207 @@ +""" +.. _tutorial-deploy-model-on-rasp: + +Deploy the Pretrained Model on Raspberry Pi +=========================================== +**Author**: `Ziheng Jiang `_, \ + `Hiroyuki Makino `_ + +This is an example of using Relay to compile a ResNet model and deploy +it on Raspberry Pi. +""" + +import tvm +import tvm.relay as relay +from tvm import rpc +from tvm.contrib import util, graph_runtime as runtime + +###################################################################### +# .. _build-tvm-runtime-on-device: +# +# Build TVM Runtime on Device +# --------------------------- +# +# The first step is to build tvm runtime on the remote device. +# +# .. note:: +# +# All instructions in both this section and next section should be +# executed on the target device, e.g. Raspberry Pi. And we assume it +# has Linux running. +# +# Since we do compilation on local machine, the remote device is only used +# for running the generated code. We only need to build tvm runtime on +# the remote device. +# +# .. code-block:: bash +# +# git clone --recursive https://github.com/dmlc/tvm +# cd tvm +# mkdir build +# cp cmake/config.cmake build +# cd build +# cmake .. +# make runtime -j4 +# +# After building runtime successfully, we need to set environment varibles +# in :code:`~/.bashrc` file. We can edit :code:`~/.bashrc` +# using :code:`vi ~/.bashrc` and add the line below (Assuming your TVM +# directory is in :code:`~/tvm`): +# +# .. code-block:: bash +# +# export PYTHONPATH=$PYTHONPATH:~/tvm/python +# +# To update the environment variables, execute :code:`source ~/.bashrc`. 
+ +###################################################################### +# Set Up RPC Server on Device +# --------------------------- +# To start an RPC server, run the following command on your remote device +# (Which is Raspberry Pi in our example). +# +# .. code-block:: bash +# +# python -m tvm.exec.rpc_server --host 0.0.0.0 --port=9090 +# +# If you see the line below, it means the RPC server started +# successfully on your device. +# +# .. code-block:: bash +# +# INFO:root:RPCServer: bind to 0.0.0.0:9090 +# + +###################################################################### +# Prepare the Pre-trained Model +# ----------------------------- +# Back to the host machine, which should have a full TVM installed (with LLVM). +# +# We will use pre-trained model from +# `MXNet Gluon model zoo `_. +# You can found more details about this part at tutorial :ref:`tutorial-from-mxnet`. + +from mxnet.gluon.model_zoo.vision import get_model +from mxnet.gluon.utils import download +from PIL import Image +import numpy as np + +# one line to get the model +block = get_model('resnet18_v1', pretrained=True) + +###################################################################### +# In order to test our model, here we download an image of cat and +# transform its format. +img_name = 'cat.png' +download('https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true', img_name) +image = Image.open(img_name).resize((224, 224)) + +def transform_image(image): + image = np.array(image) - np.array([123., 117., 104.]) + image /= np.array([58.395, 57.12, 57.375]) + image = image.transpose((2, 0, 1)) + image = image[np.newaxis, :] + return image + +x = transform_image(image) + +###################################################################### +# synset is used to transform the label from number of ImageNet class to +# the word human can understand. 
+synset_url = ''.join(['https://gist.githubusercontent.com/zhreshold/', + '4d0b62f3d01426887599d4f7ede23ee5/raw/', + '596b27d23537e5a1b5751d2b0481ef172f58b539/', + 'imagenet1000_clsid_to_human.txt']) +synset_name = 'synset.txt' +download(synset_url, synset_name) +with open(synset_name) as f: + synset = eval(f.read()) + +###################################################################### +# Now we would like to port the Gluon model to a portable computational graph. +# It's as easy as several lines. + +# We support MXNet static graph(symbol) and HybridBlock in mxnet.gluon +shape_dict = {'data': x.shape} +func, params = relay.frontend.from_mxnet(block, shape_dict) +# we want a probability so add a softmax operator +func = relay.Function(func.params, relay.nn.softmax(func.body), None, func.type_params, func.attrs) + +###################################################################### +# Here are some basic data workload configurations. +batch_size = 1 +num_classes = 1000 +image_shape = (3, 224, 224) +data_shape = (batch_size,) + image_shape + +###################################################################### +# Compile The Graph +# ----------------- +# To compile the graph, we call the :any:`relay.build` function +# with the graph configuration and parameters. However, You cannot to +# deploy a x86 program on a device with ARM instruction set. It means +# Relay also needs to know the compilation option of target device, +# apart from arguments :code:`net` and :code:`params` to specify the +# deep learning workload. Actually, the option matters, different option +# will lead to very different performance. + +###################################################################### +# If we run the example on our x86 server for demonstration, we can simply +# set it as :code:`llvm`. If running it on the Raspberry Pi, we need to +# specify its instruction set. Set :code:`local_demo` to False if you want +# to run this tutorial with a real device. 
+ +local_demo = True + +if local_demo: + target = tvm.target.create('llvm') +else: + target = tvm.target.arm_cpu('rasp3b') + # The above line is a simple form of + # target = tvm.target.create('llvm -device=arm_cpu -model=bcm2837 -target=armv7l-linux-gnueabihf -mattr=+neon') + +with relay.build_config(opt_level=3): + graph, lib, params = relay.build(func, target, params=params) + +# After `relay.build`, you will get three return values: graph, +# library and the new parameter, since we do some optimization that will +# change the parameters but keep the result of model as the same. + +# Save the library at local temporary directory. +tmp = util.tempdir() +lib_fname = tmp.relpath('net.tar') +lib.export_library(lib_fname) + +###################################################################### +# Deploy the Model Remotely by RPC +# -------------------------------- +# With RPC, you can deploy the model remotely from your host machine +# to the remote device. + +# obtain an RPC session from remote device. +if local_demo: + remote = rpc.LocalSession() +else: + # The following is my environment, change this to the IP address of your target device + host = '10.77.1.162' + port = 9090 + remote = rpc.connect(host, port) + +# upload the library to remote device and load it +remote.upload(lib_fname) +rlib = remote.load_module('net.tar') + +# create the remote runtime module +ctx = remote.cpu(0) +module = runtime.create(graph, rlib, ctx) +# set parameter (upload params to the remote device. 
This may take a while) +module.set_input(**params) +# set input data +module.set_input('data', tvm.nd.array(x.astype('float32'))) +# run +module.run() +# get output +out = module.get_output(0) +# get top1 result +top1 = np.argmax(out.asnumpy()) +print('TVM prediction top-1: {}'.format(synset[top1])) From 6fae462ada85b20ab535f6995e04b5d9905a731a Mon Sep 17 00:00:00 2001 From: Salem Derisavi Date: Fri, 1 Mar 2019 13:46:21 -0500 Subject: [PATCH 54/93] Defined a common base class for TensorComputeOp and ComputeOp (#2587) * Defined a common base class for TensorComputeOp and ComputeOp * Made changes requested by @ZihengJiang * added a testcase to assert that `tensorize` does not have any effect on TensorComputeOp ops. --- include/tvm/operation.h | 59 ++++---- python/tvm/tensor.py | 10 +- src/op/compute_op.cc | 34 +++-- src/op/compute_op.h | 2 +- src/op/tensor_compute_op.cc | 136 +----------------- .../unittest/test_schedule_tensorize.py | 78 ++++++++++ 6 files changed, 140 insertions(+), 179 deletions(-) diff --git a/include/tvm/operation.h b/include/tvm/operation.h index 3509b133cfc3..5e1f1fc73917 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -184,22 +184,45 @@ class PlaceholderOpNode : public OperationNode { /*! * \brief A Compute op that compute a tensor on certain domain. + * This is the base class for ComputeOp (operating on a scalar at a time) and + * TensorComputeOp (operating on a TensorSlice at a time) */ -class TVM_DLL ComputeOpNode : public OperationNode { +class TVM_DLL BaseComputeOpNode : public OperationNode { public: /*! \brief IterVar on each axis */ Array axis; /*! 
\brief IterVar on each reduction axis, if the body is a Reduce */ Array reduce_axis; + // override functions + Array root_iter_vars() const final; + Array output_shape(size_t idx) const final; + void GatherBound( + const Operation& self, + const std::unordered_map& tensor_dom, + std::unordered_map* out_dom_map) const final; + Stmt BuildRealize( + const Stage& stage, + const std::unordered_map& realize_map, + const Stmt& body) const final; + virtual size_t num_schedulable_dims() const = 0; + + static constexpr const char* _type_key = "BaseComputeOp"; + TVM_DECLARE_BASE_NODE_INFO(BaseComputeOpNode, OperationNode); +}; + + +/*! + * \brief A Compute op that compute a tensor on certain domain. + */ +class TVM_DLL ComputeOpNode : public BaseComputeOpNode { + public: /*! \brief the compute expression */ Array body; /*! \brief constructor */ ComputeOpNode() {} // override functions int num_outputs() const final; - Array root_iter_vars() const final; Type output_dtype(size_t i) const final; - Array output_shape(size_t i) const final; Array InputTensors() const final; Operation ReplaceInputs( const Operation& self, @@ -208,18 +231,11 @@ class TVM_DLL ComputeOpNode : public OperationNode { const Operation& self, const std::unordered_map& dom_map, std::unordered_map* out_dom_map) const final; - void GatherBound( - const Operation& self, - const std::unordered_map& tensor_dom, - std::unordered_map* out_dom_map) const final; - Stmt BuildRealize( - const Stage& stage, - const std::unordered_map& realize_map, - const Stmt& body) const final; Stmt BuildProvide( const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) const final; + size_t num_schedulable_dims() const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); @@ -236,18 +252,14 @@ class TVM_DLL ComputeOpNode : public OperationNode { Array body); static constexpr const char* _type_key = "ComputeOp"; - TVM_DECLARE_NODE_TYPE_INFO(ComputeOpNode, OperationNode); + 
TVM_DECLARE_NODE_TYPE_INFO(ComputeOpNode, BaseComputeOpNode); }; /*! * \brief A TenorCompute op that compute a tensor with an tensor intrinsic. */ -class TensorComputeOpNode : public OperationNode { +class TensorComputeOpNode : public BaseComputeOpNode { public: - /*! \brief IterVar on each axis */ - Array axis; - /*! \brief IterVar on each reduction axis, if the intrin will use the reduce axis */ - Array reduce_axis; /*! \brief number of axes that can be scheduled */ int schedulable_ndim; /*! \brief TensorIntrin used to compute */ @@ -260,9 +272,7 @@ class TensorComputeOpNode : public OperationNode { TensorComputeOpNode() {} // override functions int num_outputs() const final; - Array root_iter_vars() const final; Type output_dtype(size_t i) const final; - Array output_shape(size_t i) const final; Array InputTensors() const final; Operation ReplaceInputs( const Operation& self, @@ -271,18 +281,11 @@ class TensorComputeOpNode : public OperationNode { const Operation& self, const std::unordered_map& dom_map, std::unordered_map* out_dom_map) const final; - void GatherBound( - const Operation& self, - const std::unordered_map& tensor_dom, - std::unordered_map* out_dom_map) const final; - Stmt BuildRealize( - const Stage& stage, - const std::unordered_map& realize_map, - const Stmt& body) const final; Stmt BuildProvide( const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) const final; + size_t num_schedulable_dims() const final; void VisitAttrs(AttrVisitor* v) final { v->Visit("name", &name); @@ -304,7 +307,7 @@ class TensorComputeOpNode : public OperationNode { Array regions); static constexpr const char* _type_key = "TensorComputeOp"; - TVM_DECLARE_NODE_TYPE_INFO(TensorComputeOpNode, OperationNode); + TVM_DECLARE_NODE_TYPE_INFO(TensorComputeOpNode, BaseComputeOpNode); }; /*! 
diff --git a/python/tvm/tensor.py b/python/tvm/tensor.py index ce8f16d6a309..a9c862a268cf 100644 --- a/python/tvm/tensor.py +++ b/python/tvm/tensor.py @@ -146,7 +146,7 @@ class PlaceholderOp(Operation): @register_node -class ComputeOp(Operation): +class BaseComputeOp(Operation): """Compute operation.""" @property def axis(self): @@ -160,7 +160,13 @@ def reduce_axis(self): @register_node -class TensorComputeOp(Operation): +class ComputeOp(BaseComputeOp): + """Scalar operation.""" + pass + + +@register_node +class TensorComputeOp(BaseComputeOp): """Tensor operation.""" diff --git a/src/op/compute_op.cc b/src/op/compute_op.cc index a6dd39f79b1f..d5fc32ca0ff4 100644 --- a/src/op/compute_op.cc +++ b/src/op/compute_op.cc @@ -40,7 +40,7 @@ int ComputeOpNode::num_outputs() const { return body.size(); } -Array ComputeOpNode::root_iter_vars() const { +Array BaseComputeOpNode::root_iter_vars() const { if (reduce_axis.size() == 0) return axis; Array ret = axis; for (IterVar iv : reduce_axis) { @@ -54,15 +54,15 @@ Type ComputeOpNode::output_dtype(size_t idx) const { return body[idx].type(); } -Array ComputeOpNode::output_shape(size_t idx) const { +Array BaseComputeOpNode::output_shape(size_t idx) const { CHECK_LT(idx, num_outputs()); - // for now, all outputs of ComputeOp have the same shape - std::vector shape; - for (size_t i = 0; i < axis.size(); ++i) { - const Range& r = axis[i]->dom; + // for now, all outputs of a BaseComputeOp have the same shape + Array shape; + for (const auto& ivar : this->axis) { + const Range& r = ivar->dom; shape.push_back(r->extent); } - return Array(shape); + return shape; } Tensor compute(Array shape, @@ -208,7 +208,7 @@ void ComputeOpNode::PropBoundToInputs( for (auto& e : body) ir::PostOrderVisit(e, fvisit); } -void ComputeOpNode::GatherBound( +void BaseComputeOpNode::GatherBound( const Operation& self, const std::unordered_map& tensor_dom, std::unordered_map* out_dom_map) const { @@ -225,22 +225,22 @@ void ComputeOpNode::GatherBound( } } -Stmt 
ComputeOpNode::BuildRealize( +Stmt BaseComputeOpNode::BuildRealize( const Stage& stage, const std::unordered_map& realize_map, - const Stmt& realize_body) const { + const Stmt& body) const { CHECK_EQ(stage->op.get(), this); HalideIR::Internal::Region bounds; for (IterVar iv : this->axis) { bounds.push_back(realize_map.at(iv)); } - Stmt realize = realize_body; + Stmt realize = body; for (int i = this->num_outputs(); i > 0; --i) { Tensor t = stage->op.output(i-1); realize = ir::Realize::make(t->op, t->value_index, t->dtype, bounds, const_true(), realize); // alignment requirement, only useful for compute - for (size_t i = 0; i < this->axis.size(); ++i) { + for (size_t i = 0; i < num_schedulable_dims(); ++i) { auto it = stage->iter_var_attrs.find(this->axis[i]); if (it != stage->iter_var_attrs.end()) { IterVarAttr attr = (*it).second; @@ -259,6 +259,10 @@ Stmt ComputeOpNode::BuildRealize( return realize; } +size_t ComputeOpNode::num_schedulable_dims() const { + return axis.size(); +} + // Build a reduction body. void MakeReduction(const ComputeOpNode* op, const Array& tensors, @@ -414,7 +418,7 @@ Stmt ComputeOpNode::BuildProvide( } ComputeLoopNest ComputeLoopNest::make( - const ComputeOpNode* self, + const BaseComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop) { @@ -440,8 +444,8 @@ ComputeLoopNest ComputeLoopNest::make( for (IterVar iv : self->reduce_axis) { update_state[iv] = 2; } - for (IterVar iv : self->axis) { - update_state[iv] = 1; + for (size_t i = 0; i < self->num_schedulable_dims(); ++i) { + update_state[self->axis[i]] = 1; } // find which iter var is related to reduction and which is related to axis. 
schedule::PassDownBitMaskOr(stage, &update_state); diff --git a/src/op/compute_op.h b/src/op/compute_op.h index 87b0814c1ad9..b0264835da5f 100644 --- a/src/op/compute_op.h +++ b/src/op/compute_op.h @@ -41,7 +41,7 @@ struct ComputeLoopNest { * \return The constructed loop nest */ static ComputeLoopNest make( - const ComputeOpNode* self, + const BaseComputeOpNode* self, const Stage& stage, const std::unordered_map& dom_map, bool debug_keep_trivial_loop); diff --git a/src/op/tensor_compute_op.cc b/src/op/tensor_compute_op.cc index 0262db7d8fc5..3ccce0c5d38a 100644 --- a/src/op/tensor_compute_op.cc +++ b/src/op/tensor_compute_op.cc @@ -28,27 +28,10 @@ int TensorComputeOpNode::num_outputs() const { return static_cast(this->intrin->buffers.size() - this->inputs.size()); } -Array TensorComputeOpNode::root_iter_vars() const { - Array ret = axis; - for (IterVar iv : reduce_axis) { - ret.push_back(iv); - } - return ret; -} - Type TensorComputeOpNode::output_dtype(size_t i) const { return this->intrin->buffers[this->inputs.size() + i]->dtype; } -Array TensorComputeOpNode::output_shape(size_t i) const { - Array shape; - for (const auto& ivar : this->axis) { - shape.push_back(ivar->dom->extent); - } - return shape; -} - - Operation TensorComputeOpNode::make(std::string name, std::string tag, Array axis, @@ -121,123 +104,10 @@ void TensorComputeOpNode::PropBoundToInputs( } } -void TensorComputeOpNode::GatherBound( - const Operation& self, - const std::unordered_map& tensor_dom, - std::unordered_map* out_dom_map) const { - const TensorDom& tdom = tensor_dom.at(self.output(0)); - for (size_t i = 0; i < this->axis.size(); ++i) { - Range r = arith::Union(tdom.data.at(i)).cover_range(this->axis[i]->dom); - CHECK(!out_dom_map->count(this->axis[i])); - (*out_dom_map)[this->axis[i]] = r; - } - for (size_t i = 0; i < this->reduce_axis.size(); ++i) { - CHECK(!out_dom_map->count(this->reduce_axis[i])); - (*out_dom_map)[this->reduce_axis[i]] = this->reduce_axis[i]->dom; - } -} - -Stmt 
TensorComputeOpNode::BuildRealize( - const Stage& stage, - const std::unordered_map& realize_map, - const Stmt& body) const { - CHECK_EQ(stage->op.get(), this); - HalideIR::Internal::Region bounds; - for (IterVar iv : this->axis) { - bounds.push_back(realize_map.at(iv)); - } - Stmt realize = body; - for (int i = this->num_outputs(); i > 0; --i) { - Tensor t = stage->op.output(i-1); - realize = ir::Realize::make(t->op, t->value_index, - t->dtype, bounds, const_true(), realize); - // alignment requirement, only useful for compute - for (int i = 0; i < schedulable_ndim; ++i) { - auto it = stage->iter_var_attrs.find(this->axis[i]); - if (it != stage->iter_var_attrs.end()) { - IterVarAttr attr = (*it).second; - if (attr->dim_align_factor != 0) { - Array tuple = {static_cast(i), - attr->dim_align_factor, - attr->dim_align_offset}; - realize = ir::AttrStmt::make( - t, ir::attr::buffer_dim_align, - Call::make(Handle(), ir::intrinsic::tvm_tuple, tuple, Call::Intrinsic), - realize); - } - } - } - } - return realize; -} - -ComputeLoopNest MakeLoopNest( - const TensorComputeOpNode* self, - const Stage& stage, - const std::unordered_map& dom_map, - bool debug_keep_trivial_loop) { - CHECK_EQ(stage->op.operator->(), self); - ComputeLoopNest ret; - // make main loop nest - ret.main_nest = op::MakeLoopNest( - stage, dom_map, 0, false, std::unordered_set(), &ret.main_vmap, - debug_keep_trivial_loop); - ret.main_predicates = schedule::MakeBoundCheck( - stage, dom_map, ret.main_vmap, false, - std::unordered_set()); - for (auto& e : ret.main_predicates) { - e = likely(e); - } - if (stage->store_predicate.defined()) { - ret.main_predicates.push_back(stage->store_predicate); - } - if (self->reduce_axis.size() != 0) { - // try to find the location to insert the initialization. - // Fuse the initialization and provide loop when possible. 
- std::unordered_map update_state; - for (IterVar iv : self->reduce_axis) { - update_state[iv] = 2; - } - for (int i = 0; i < self->schedulable_ndim; ++i) { - update_state[self->axis[i]] = 1; - } - // find which iter var is related to reduction and which is related to axis. - schedule::PassDownBitMaskOr(stage, &update_state); - auto leaf_iter_vars = stage->leaf_iter_vars; - // first first loop that is related to reduction. - size_t begin_loop = leaf_iter_vars.size(); - for (size_t i = 0; i < leaf_iter_vars.size(); ++i) { - auto iv = leaf_iter_vars[i]; - int flag = update_state.at(iv); - if ((flag & 2) != 0) { - begin_loop = i; break; - } - ret.init_vmap[iv] = ret.main_vmap.at(iv); - } - ret.num_common_loop = begin_loop; - // skip loops that are related to reduction and are unrelated to axis. - std::unordered_set skip_iter; - for (auto kv : update_state) { - int flag = kv.second; - if (flag == 2) skip_iter.insert(kv.first); - } - ret.init_nest = op::MakeLoopNest( - stage, dom_map, begin_loop, true, - skip_iter, &(ret.init_vmap), debug_keep_trivial_loop); - ret.init_predicates = schedule::MakeBoundCheck( - stage, dom_map, ret.init_vmap, true, skip_iter); - for (auto& e : ret.init_predicates) { - e = likely(e); - } - } else { - CHECK_EQ(ret.main_nest.size(), stage->leaf_iter_vars.size() + 1); - ret.num_common_loop = stage->leaf_iter_vars.size(); - } - // copy elison here. 
- return ret; +size_t TensorComputeOpNode::num_schedulable_dims() const { + return schedulable_ndim; } - Stmt TensorComputeOpNode::BuildProvide( const Stage& stage, const std::unordered_map& dom_map, @@ -296,7 +166,7 @@ Stmt TensorComputeOpNode::BuildProvide( ir::ArgBinder binder(&vmap); size_t tloc = stage->leaf_iter_vars.size(); - ComputeLoopNest n = MakeLoopNest(this, stage, dom_map, debug_keep_trivial_loop); + ComputeLoopNest n = ComputeLoopNest::make(this, stage, dom_map, debug_keep_trivial_loop); if (this->reduce_axis.size() == 0) { std::vector > nest( diff --git a/tests/python/unittest/test_schedule_tensorize.py b/tests/python/unittest/test_schedule_tensorize.py index ca5836143ef3..259c302eddd8 100644 --- a/tests/python/unittest/test_schedule_tensorize.py +++ b/tests/python/unittest/test_schedule_tensorize.py @@ -229,7 +229,85 @@ def intrin_func(ins, outs): s = s.normalize() tvm.lower(s, [A, B]) +# This test asserts that tensorize does not have any effect on +# TensorComputeOp operations +def test_tensorize_tensor_compute_op(): + # an intrinsic called "multivadd" whose definition (pattern) + # is a loop of another intrinsic called "vadd" + def intrin_multivadd(n): + n_a = tvm.var("n_a") + Ab = tvm.decl_buffer((n, ), tvm.float32, strides=[n_a]) + + n_b = tvm.var("n_b") + Bb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_b]) + + n_c = tvm.var("n_c") + Cb = tvm.decl_buffer((n, ), tvm.float32, strides=[n_c]) + + z = tvm.compute((n,), lambda i: tvm.call_extern("float32", 'vadd', + Ab.access_ptr("w", offset=n_a*i), + Bb.access_ptr("r", offset=n_b*i), + Cb.access_ptr("r", offset=n_c*i))) + + # replace the pattern with the multivadd call. I need to figure out + # how to pass it the right parameters. 
+ def intrin_func(ins, outs): + return tvm.call_packed("multivadd") + + with tvm.build_config(): + return tvm.decl_tensor_intrin(z.op, intrin_func, name="multivadd") + + def intrin_vadd(n): + dtype = 'float32' + x = tvm.placeholder((n,), dtype=dtype, name='vx') + y = tvm.placeholder((n,), dtype=dtype, name='vy') + z = tvm.compute(x.shape, lambda i: x[i] + y[i], name='z') + s = tvm.create_schedule(z.op) + + def create_buffer(t): + return tvm.decl_buffer(t.shape, t.dtype, + name='W'+t.name, + offset_factor=16) + + def intrin_func(ins, outs): + ib = tvm.ir_builder.create() + ib.emit(tvm.call_extern("float32", 'vadd', + ins[0].access_ptr("r"), ins[1].access_ptr('r'), + outs[0].access_ptr('wr'))) + return ib.get() + + with tvm.build_config(offset_factor=16): + return tvm.decl_tensor_intrin(z.op, intrin_func, binds={x: create_buffer(x), + y: create_buffer(y), + z: create_buffer(z)}) + + # cache_read, cache_write + M = 1024 + factor = 16 + dtype = 'float32' + + A = tvm.placeholder((M//factor, factor), name="A", dtype=dtype) + B = tvm.placeholder((M//factor, factor), name="B", dtype=dtype) + + vadd = intrin_vadd(factor) + C = tvm.compute((M//factor, factor), + lambda i: vadd(A[i, 0:factor], B[i, 0:factor]), name='C') + + s = tvm.create_schedule(C.op) + multivadd = intrin_multivadd(64) + s[C].tensorize(C.op.axis[0], multivadd) + s = s.normalize() + dom_map = tvm.schedule.InferBound(s) + stmt = tvm.schedule.ScheduleOps(s, dom_map) + # The loop that we tried to tensorize still exists in the code + # That means tensorize didn't work as expected + assert isinstance(stmt.body.body.body, tvm.stmt.For) + assert stmt.body.body.body.loop_var.name == C.op.axis[0].var.name + + + if __name__ == "__main__": test_tensorize_vadd() test_tensorize_matmul() test_tensorize_op() + test_tensorize_tensor_compute_op() From 70b6687164f221dc965981ba019f89d17d515878 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Fri, 1 Mar 2019 14:53:46 -0800 Subject: [PATCH 55/93] [Relay/TOPI][Op] Add 
batch_matmul in relay and TOPI (#2561) * Add batch_dot and cpu schedule * Add relay support for batch_dot * Rename batch_dot to batch_matmul * nits * Add missing file * Put batch_matmul and dense x86 schedule in separate files * Fix pylint * Remove unused import * Add cuda schedule for batch_matmul * Add test case with larger batch size * Add batch_matmul in api doc * Fix quantize pass rounding error * Fix pylint and minor change * bug fix --- docs/api/python/topi.rst | 2 + docs/langref/relay_op.rst | 2 + python/tvm/relay/frontend/mxnet.py | 14 +- python/tvm/relay/op/nn/_nn.py | 15 ++ python/tvm/relay/op/nn/nn.py | 25 +++ src/relay/op/nn/nn.cc | 63 ++++++ tests/python/relay/test_op_level1.py | 1 - tests/python/relay/test_op_level10.py | 36 +++- tests/python/relay/test_pass_quantize.py | 2 +- topi/include/topi/nn/batch_matmul.h | 49 +++++ topi/python/topi/cuda/__init__.py | 1 + topi/python/topi/cuda/batch_matmul.py | 89 +++++++++ topi/python/topi/generic/nn.py | 6 + topi/python/topi/nn/__init__.py | 1 + topi/python/topi/nn/batch_matmul.py | 35 ++++ topi/python/topi/testing/__init__.py | 1 + topi/python/topi/testing/batch_matmul.py | 26 +++ topi/python/topi/util.py | 26 +++ topi/python/topi/x86/batch_matmul.py | 53 +++++ topi/python/topi/x86/dense.py | 208 +++++++++++++++++++ topi/python/topi/x86/nn.py | 209 +------------------- topi/src/topi.cc | 10 + topi/tests/python/test_topi_batch_matmul.py | 53 +++++ 23 files changed, 715 insertions(+), 212 deletions(-) create mode 100644 topi/include/topi/nn/batch_matmul.h create mode 100644 topi/python/topi/cuda/batch_matmul.py create mode 100644 topi/python/topi/nn/batch_matmul.py create mode 100644 topi/python/topi/testing/batch_matmul.py create mode 100644 topi/python/topi/x86/batch_matmul.py create mode 100644 topi/python/topi/x86/dense.py create mode 100644 topi/tests/python/test_topi_batch_matmul.py diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index 9680adc1231b..e8b63637ffb5 100644 --- 
a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -41,6 +41,7 @@ List of operators topi.nn.upsampling topi.nn.softmax topi.nn.dense + topi.nn.batch_matmul topi.nn.log_softmax topi.nn.conv2d_nchw topi.nn.conv2d_hwcn @@ -138,6 +139,7 @@ topi.nn .. autofunction:: topi.nn.upsampling .. autofunction:: topi.nn.softmax .. autofunction:: topi.nn.dense +.. autofunction:: topi.nn.batch_matmul .. autofunction:: topi.nn.log_softmax .. autofunction:: topi.nn.conv2d_nchw .. autofunction:: topi.nn.conv2d_hwcn diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index e2da42b6ab32..7958d6cbe553 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -152,6 +152,7 @@ This level support backpropagation of broadcast operators. It is temporary. tvm.relay.device_copy tvm.relay.annotation.on_device tvm.relay.reverse_reshape + tvm.relay.nn.batch_matmul Level 1 Definitions @@ -264,3 +265,4 @@ Level 10 Definitions .. autofunction:: tvm.relay.device_copy .. autofunction:: tvm.relay.annotation.on_device .. autofunction:: tvm.relay.reverse_reshape +.. 
autofunction:: tvm.relay.nn.batch_matmul diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 9ef5f626393a..3d3bb8e4fd84 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -283,6 +283,18 @@ def _mx_multibox_detection(inputs, attrs): return _op.vision.nms(ret[0], ret[1], **new_attrs1) +def _mx_batch_dot(inputs, attrs): + assert len(inputs) == 2 + a, b = inputs + transpose_a = attrs.get_bool("transpose_a", False) + transpose_b = attrs.get_bool("transpose_b", False) + if transpose_a is True: + raise RuntimeError("batch_dot: only support transpose_a=False") + if transpose_b is False: + b = _op.transpose(b, axes=[0, 2, 1]) + return _op.batch_matmul(a, b) + + def _mx_arange(inputs, attrs): assert len(inputs) == 0 if attrs.get_int("repeat", 1) != 1: @@ -389,6 +401,7 @@ def _mx_roi_align(inputs, attrs): "expand_dims" : _mx_expand_dims, "Concat" : _mx_concat, "concat" : _mx_concat, + "batch_dot" : _mx_batch_dot, "LeakyReLU" : _mx_leaky_relu, "_arange" : _mx_arange, "SoftmaxOutput" : _mx_softmax_output, @@ -403,7 +416,6 @@ def _mx_roi_align(inputs, attrs): # "broadcast_to", # "gather_nd", # "Crop" : _crop_like, - } # set identity list diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index a4b41d92371e..0c2733ecae92 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -46,6 +46,21 @@ def schedule_dense(attrs, outputs, target): reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) +# batch_matmul +@reg.register_compute("nn.batch_matmul") +def compute_batch_matmul(attrs, inputs, out_type, target): + """Compute definition of batch_matmul""" + return [topi.nn.batch_matmul(inputs[0], inputs[1])] + +@reg.register_schedule("nn.batch_matmul") +def schedule_batch_matmul(attrs, outputs, target): + """Schedule definition of batch_matmul""" + with target: + return topi.generic.schedule_batch_matmul(outputs) + 
+reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + + # conv2d @reg.register_compute("nn.conv2d") def compute_conv2d(attrs, inputs, out_type, target): diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index 06cd79a8ff8b..41b2148ec390 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -767,6 +767,31 @@ def batch_norm(data, return TupleWrapper(result, 3) +def batch_matmul(x, y): + r""" + Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data + in batch. + + .. math:: + + \mbox{batch_matmul}(x, y)[i, :, :] = \mbox{matmul}(x[i, :, :], y[i, :, :]^T) + + Parameters + ---------- + x : tvm.relay.Expr + The first input. + + y : tvm.relay.Expr + The second input. + + Returns + ------- + result: tvm.relay.Expr + The computed result. + """ + return _make.batch_matmul(x, y) + + def contrib_conv2d_winograd_without_weight_transform(data, weight, tile_size, diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index 9ab841cf4286..59f68d9d8880 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -654,5 +654,68 @@ axis to be the last item in the input shape. 
.set_support_level(1) .add_type_rel("BatchNorm", BatchNormRel); + +// relay.nn.batch_matmul +bool BatchMatmulRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 3); + const auto* x = types[0].as(); + const auto* y = types[1].as(); + if (x == nullptr || y == nullptr) return false; + if (x->shape.size() != 3 || y->shape.size() != 3) return false; + CHECK(reporter->AssertEQ(x->shape[0], y->shape[0])) + << "BatchDot: batch dimension doesn't match, " + << " x shape=" << x->shape + << ", y shape=" << y->shape; + CHECK(reporter->AssertEQ(x->shape[2], y->shape[2])) + << "BatchDot: shapes of x and y is inconsistent, " + << " x shape=" << x->shape + << ", y shape=" << y->shape; + + Array oshape = x->shape; + oshape.Set(2, y->shape[1]); + + // assign output type + reporter->Assign(types[2], TensorTypeNode::make(oshape, x->dtype)); + return true; +} + + +// Positional relay function to create batch_matmul operator used by frontend FFI. +Expr MakeBatchMatmul(Expr x, + Expr y) { + static const Op& op = Op::Get("nn.batch_matmul"); + return CallNode::make(op, {x, y}, Attrs(), {}); +} + + +TVM_REGISTER_API("relay.op.nn._make.batch_matmul") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeBatchMatmul, args, rv); + }); + + +RELAY_REGISTER_OP("nn.batch_matmul") +.describe(R"code(Computes matrix multiplication of `x` and `y` when `x` and `y` +are data in batch. + +.. math:: + + batch\_matmul(x, y)[i, :, :] = matmul(x[i, :, :], y[i, :, :]^T) + +- **x**: `(b, m, k)` +- **y**: `(b, n, k)` +- **out**: `(b, m, n)`. 
+ +)code" TVM_ADD_FILELINE) +.set_num_inputs(2) +.add_argument("x", "3D Tensor", "First input.") +.add_argument("y", "3D Tensor", "Second input.") +.set_support_level(10) +.add_type_rel("BatchMatmul", BatchMatmulRel); + + } // namespace relay } // namespace tvm diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index d29b808be0d1..b954e42bf1ab 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -306,7 +306,6 @@ def test_dense(): tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - if __name__ == "__main__": test_concatenate() test_bias_add() diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index a6e169e23a6c..34285d2b18dd 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -4,6 +4,8 @@ import tvm from tvm import relay from tvm.relay.testing import ctx_list +import topi +import topi.testing def test_collapse_sum_like(): shape = (3, 4, 5, 6) @@ -126,7 +128,6 @@ def verify_reverse_reshape(shape, newshape, oshape): x = relay.var("x", relay.TensorType(shape, "float32")) z = relay.reverse_reshape(x, newshape=newshape) zz = relay.ir_pass.infer_type(z) - print(zz.checked_type) assert "newshape=" in z.astext() assert zz.checked_type == relay.ty.TensorType(oshape, "float32") @@ -144,8 +145,41 @@ def verify_reverse_reshape(shape, newshape, oshape): verify_reverse_reshape((2, 3, 4), (-1, 0), (6, 4)) verify_reverse_reshape((2, 3, 4), (0, -3), (2, 12)) +def verify_batch_matmul(x_shape, y_shape, out_shape, dtype="float32"): + x = relay.var("x", relay.TensorType(x_shape, dtype)) + y = relay.var("y", relay.TensorType(y_shape, dtype)) + z = relay.nn.batch_matmul(x, y) + zz = relay.ir_pass.infer_type(z) + assert zz.checked_type == relay.ty.TensorType(out_shape, dtype) + + func = relay.Function([x, y], z) + x_np = np.random.uniform(size=x_shape).astype(dtype) + y_np = 
np.random.uniform(size=y_shape).astype(dtype) + z_np = topi.testing.batch_matmul(x_np, y_np) + + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + z = intrp.evaluate(func)(x_np, y_np) + tvm.testing.assert_allclose(z.asnumpy(), z_np, rtol=1e-5) + +def test_batch_matmul(): + b, m, n, k = tvm.var("b"), tvm.var("m"), tvm.var("n"), tvm.var("k") + x = relay.var("x", relay.TensorType((b, m, k), "float32")) + y = relay.var("y", relay.TensorType((b, n, k), "float32")) + z = relay.nn.batch_matmul(x, y) + zz = relay.ir_pass.infer_type(z) + assert zz.checked_type == relay.TensorType((b, m, n), "float32") + + verify_batch_matmul((1, 16, 32), (1, 16, 32), (1, 16, 16)) + verify_batch_matmul((5, 16, 32), (5, 16, 32), (5, 16, 16)) + verify_batch_matmul((5, 16, 32), (5, 20, 32), (5, 16, 20)) + verify_batch_matmul((30, 16, 32), (30, 20, 32), (30, 16, 20)) + + if __name__ == "__main__": test_collapse_sum_like() test_broadcast_to_like() test_slice_like() test_reverse_reshape() + test_batch_matmul() diff --git a/tests/python/relay/test_pass_quantize.py b/tests/python/relay/test_pass_quantize.py index 6d65d7b2d9ee..2e2389d16244 100644 --- a/tests/python/relay/test_pass_quantize.py +++ b/tests/python/relay/test_pass_quantize.py @@ -75,7 +75,7 @@ def make_qgraph(data, weight): graph = relay.create_executor('graph') res0 = graph.evaluate(qgraph0)(dataset[0]['data']) res1 = graph.evaluate(qgraph1)(dataset[0]['data']) - tvm.testing.assert_allclose(res0.asnumpy(), res1.asnumpy()) + tvm.testing.assert_allclose(res0.asnumpy(), res1.asnumpy(), rtol=1e-3) if __name__ == "__main__": diff --git a/topi/include/topi/nn/batch_matmul.h b/topi/include/topi/nn/batch_matmul.h new file mode 100644 index 000000000000..968e1b0c697c --- /dev/null +++ b/topi/include/topi/nn/batch_matmul.h @@ -0,0 +1,49 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * \brief Batch matmul op constructions + * \file nn/batch_matmul.h + */ +#ifndef TOPI_NN_BATCH_MATMUL_H_ +#define TOPI_NN_BATCH_MATMUL_H_ + +#include + +#include "topi/tags.h" +#include "tvm/tvm.h" + +namespace topi { +namespace nn { +using namespace tvm; + +/*! +* \brief Creates an operation that calculates matrix multiplication in batch. +* +* \param x Tensor with shape [batch, M, K] +* \param y Tensor with shape [batch, N, K] +* +* \return Tensor with shape [batch, M, N] +*/ +inline tvm::Tensor batch_matmul(const tvm::Tensor& x, + const tvm::Tensor& y) { + CHECK_EQ(x->shape.size(), 3) << "batch_matmul requires 3-D data"; + CHECK_EQ(y->shape.size(), 3) << "batch_matmul requires 3-D data"; + + auto batch = x->shape[0]; + auto M = x->shape[1]; + auto K = x->shape[2]; + auto N = y->shape[1]; + + auto k = tvm::reduce_axis(Range(0, K), "k"); + auto result = tvm::compute( + { batch, M, N }, + [&](Var b, Var i, Var j) { + return tvm::sum(x(b, i, k) * y(b, j, k), { k }); + }, "tensor", "batch_matmul"); + + return result; +} + +} // namespace nn +} // namespace topi + +#endif // TOPI_NN_BATCH_MATMUL_H_ diff --git a/topi/python/topi/cuda/__init__.py b/topi/python/topi/cuda/__init__.py index 91c2235fcf70..ba577cd944f0 100644 --- a/topi/python/topi/cuda/__init__.py +++ b/topi/python/topi/cuda/__init__.py @@ -14,6 +14,7 @@ from .pooling import schedule_pool, schedule_global_pool from .extern import schedule_extern from .nn import schedule_lrn, schedule_l2_normalize +from .batch_matmul import schedule_batch_matmul from .vision import * from . 
import ssd from .ssd import * diff --git a/topi/python/topi/cuda/batch_matmul.py b/topi/python/topi/cuda/batch_matmul.py new file mode 100644 index 000000000000..a1fa256028da --- /dev/null +++ b/topi/python/topi/cuda/batch_matmul.py @@ -0,0 +1,89 @@ +# pylint: disable=invalid-name,too-many-locals,unused-variable +"""cuda batch_matmul operators""" +from __future__ import absolute_import as _abs +import tvm + +from .. import generic +from ..util import traverse_inline, get_const_tuple, get_max_power2_factor + + +@generic.schedule_batch_matmul.register(["cuda", "gpu"]) +def schedule_batch_matmul(outs): + """Schedule for batch_matmul + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of batch_matmul + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + s = tvm.create_schedule([x.op for x in outs]) + + def _schedule(op): + C = op.output(0) + A, B = s[C].op.input_tensors + _, M, N = get_const_tuple(C.shape) + AA = s.cache_read(A, "shared", [C]) + AL = s.cache_read(AA, "local", [C]) + BB = s.cache_read(B, "shared", [C]) + BL = s.cache_read(BB, "local", [C]) + CC = s.cache_write(C, "local") + + b, y, x = s[C].op.axis + y_bn = get_max_power2_factor(M, 64) + x_bn = get_max_power2_factor(N, 64) + by, y = s[C].split(y, y_bn) + bx, x = s[C].split(x, x_bn) + y_nthreads = min(y_bn, 8) + x_nthreads = min(x_bn, 8) + ty, yi = s[C].split(y, nparts=y_nthreads) + tx, xi = s[C].split(x, nparts=x_nthreads) + thread_x = tvm.thread_axis((0, x_nthreads), "threadIdx.x") + thread_y = tvm.thread_axis((0, y_nthreads), "threadIdx.y") + + s[C].reorder(b, by, bx, ty, tx, yi, xi) + s[C].bind(b, tvm.thread_axis("blockIdx.z")) + s[C].bind(by, tvm.thread_axis("blockIdx.y")) + s[C].bind(bx, tvm.thread_axis("blockIdx.x")) + s[C].bind(ty, thread_y) + s[C].bind(tx, thread_x) + s[C].pragma(yi, "auto_unroll_max_step", 16) + + s[CC].compute_at(s[C], tx) + _, yi, xi = s[CC].op.axis + k, = 
s[CC].op.reduce_axis + ko, ki = s[CC].split(k, 8) + s[CC].reorder(ko, ki, yi, xi) + s[CC].pragma(ki, "auto_unroll_max_step", 16) + + s[AA].compute_at(s[CC], ko) + s[AL].compute_at(s[CC], ki) + s[BB].compute_at(s[CC], ko) + s[BL].compute_at(s[CC], ki) + _, y, k = s[AA].op.axis + ty, yi = s[AA].split(y, nparts=y_nthreads) + tx, ki = s[AA].split(k, nparts=x_nthreads) + s[AA].reorder(ty, tx, yi, ki) + s[AA].bind(ty, thread_y) + s[AA].bind(tx, thread_x) + s[AA].pragma(yi, "auto_unroll_max_step", 16) + + _, x, k = s[BB].op.axis + ty, xi = s[BB].split(x, nparts=y_nthreads) + tx, ki = s[BB].split(k, nparts=x_nthreads) + s[BB].bind(ty, thread_y) + s[BB].bind(tx, thread_x) + s[BB].reorder(ty, tx, xi, ki) + s[BB].pragma(xi, "auto_unroll_max_step", 16) + + def _callback(op): + if "batch_matmul" in op.tag: + _schedule(op) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 8c303e5be182..00b742f24e64 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -410,3 +410,9 @@ def schedule_l2_normalize(outs): target = tvm.target.current_target(allow_none=False) cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) + +@tvm.target.generic_func +def schedule_batch_matmul(outs): + target = tvm.target.current_target(allow_none=False) + cpp_target = cpp.TEST_create_target(target.target_name) + return cpp.generic.default_schedule(cpp_target, outs, False) diff --git a/topi/python/topi/nn/__init__.py b/topi/python/topi/nn/__init__.py index cfb9e566279a..941fec91a6bd 100644 --- a/topi/python/topi/nn/__init__.py +++ b/topi/python/topi/nn/__init__.py @@ -17,3 +17,4 @@ from .local_response_norm import * from .bitserial_conv2d import * from .l2_normalize import * +from .batch_matmul import * diff --git a/topi/python/topi/nn/batch_matmul.py b/topi/python/topi/nn/batch_matmul.py new file mode 100644 index 
000000000000..07e363868b05 --- /dev/null +++ b/topi/python/topi/nn/batch_matmul.py @@ -0,0 +1,35 @@ +"""Binary Neural Network (BNN) Operators""" +# pylint: disable=invalid-name +from __future__ import absolute_import as _abs +import tvm +from ..util import get_const_tuple + + +def batch_matmul(x, y): + """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are + data in batch. + + Parameters + ---------- + x : tvm.Tensor + 3-D with shape [batch, M, K] + + y : tvm.TEnsor + 3-D with shape [batch, N, K] + + Returns + ------- + output : tvm.Tensor + 3-D with shape [batch, M, N] + """ + assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul" + x_shape = get_const_tuple(x.shape) + y_shape = get_const_tuple(y.shape) + assert x_shape[0] == y_shape[0], "batch dimension doesn't match" + assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistant" + batch, M, K = x.shape + N = y.shape[1] + k = tvm.reduce_axis((0, K), name='k') + return tvm.compute((batch, M, N), + lambda b, i, j: tvm.sum(x[b, i, k] * y[b, j, k], axis=k), + tag='batch_matmul') diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 81dd379257e0..0ccc422010c1 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -19,3 +19,4 @@ from .l2_normalize_python import l2_normalize_python from .gather_nd_python import gather_nd_python from .strided_slice_python import strided_slice_python +from .batch_matmul import batch_matmul diff --git a/topi/python/topi/testing/batch_matmul.py b/topi/python/topi/testing/batch_matmul.py new file mode 100644 index 000000000000..a7b2f9344f29 --- /dev/null +++ b/topi/python/topi/testing/batch_matmul.py @@ -0,0 +1,26 @@ +# pylint: disable=invalid-name +"""Batch matmul in python""" +import numpy as np + +def batch_matmul(x, y): + """batch_matmul operator implemented in numpy. 
+ + Parameters + ---------- + x : numpy.ndarray + 3-D with shape [batch, M, K] + + y : numpy.ndarray + 3-D with shape [batch, N, K] + + Returns + ------- + out : numpy.ndarray + 3-D with shape [batch, M, N] + """ + batch, M, _ = x.shape + N = y.shape[1] + out = np.zeros((batch, M, N)).astype(x.dtype) + for i in range(batch): + out[i] = np.dot(x[i], y[i].T) + return out diff --git a/topi/python/topi/util.py b/topi/python/topi/util.py index 6d7326580f6d..d630628b4379 100644 --- a/topi/python/topi/util.py +++ b/topi/python/topi/util.py @@ -255,3 +255,29 @@ def select_array(i, j): return now return tvm.compute(matrix.shape, select_array, name=name) + + +def get_max_power2_factor(n, max_value=None): + """Get max factor of n in power of 2. If max_value is specificed, max factor + value will be no more max_value, + + Parameter + --------- + n : int + The input value + + max_value : int, optional + The max value for the factor + + Returns + ------- + factor : int + The max factor in power of 2. + """ + x = 1 + while n % 2 == 0: + if max_value is not None and max_value < x * 2: + break + x *= 2 + n /= 2 + return x diff --git a/topi/python/topi/x86/batch_matmul.py b/topi/python/topi/x86/batch_matmul.py new file mode 100644 index 000000000000..37890e389366 --- /dev/null +++ b/topi/python/topi/x86/batch_matmul.py @@ -0,0 +1,53 @@ +# pylint: disable=invalid-name,too-many-locals,unused-variable +"""x86 batch_matmul operators""" +from __future__ import absolute_import as _abs +import tvm + +from .. import generic +from ..util import traverse_inline, get_const_tuple, get_max_power2_factor + + +@generic.schedule_batch_matmul.register(["cpu"]) +def schedule_batch_matmul(outs): + """Schedule for batch_matmul + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of batch_matmul + in the format of an array of tensors. + + Returns + ------- + sch: Schedule + The computation schedule for the op. 
+ """ + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "batch_matmul" in op.tag: + C = op.output(0) + A, B = s[C].op.input_tensors + _, M, N = get_const_tuple(C.shape) + k, = s[C].op.reduce_axis + ko, ki = s[C].split(k, 16) + CC = s.rfactor(C, ki) + + b, y, x = s[C].op.axis + y_bn = get_max_power2_factor(M, 8) + x_bn = get_max_power2_factor(N, 8) + yo, yi = s[C].split(y, y_bn) + xo, xi = s[C].split(x, x_bn) + s[C].reorder(b, yo, xo, yi, xi) + bxyo = s[C].fuse(b, yo, xo) + s[C].parallel(bxyo) + s[C].fuse(yi, xi) + + s[CC].compute_at(s[C], bxyo) + _, _, y, x = s[CC].op.axis + s[CC].fuse(y, x) + s[CC].vectorize(s[CC].op.axis[0]) + s[C].pragma(bxyo, 'auto_unroll_max_step', 16) + + traverse_inline(s, outs[0].op, _callback) + return s diff --git a/topi/python/topi/x86/dense.py b/topi/python/topi/x86/dense.py new file mode 100644 index 000000000000..33575b4c399d --- /dev/null +++ b/topi/python/topi/x86/dense.py @@ -0,0 +1,208 @@ +# pylint: disable=invalid-name,too-many-locals,unused-variable +"""x86 dense operators""" +from __future__ import absolute_import as _abs +import tvm +from tvm import autotvm +from tvm.autotvm.task.space import SplitEntity + +from .util import get_fp32_len +from .. 
import generic, tag, nn +from ..util import traverse_inline, get_const_tuple + +@autotvm.register_topi_compute(nn.dense, "cpu", "direct") +def _declaration_dense(cfg, data, weight, bias=None): + batch, _ = get_const_tuple(data.shape) + + # For small batch sizes, don't pack weight into cache-friendly layout + # because of overhead in packing and limited reuse from batch dimension + # TODO(icemelon9): use a more systematic way to determine which schedule to use + if batch <= 16: + return _declaration_dense_nopack(cfg, data, weight, bias) + return _declaration_dense_pack(cfg, data, weight, bias) + + +# Declare dense compute with packing weight into cache-friendly layout +@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") +def _declaration_dense_pack(cfg, data, weight, bias=None): + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) + # create tuning space + cfg.define_split("tile_y", batch, num_outputs=3) + cfg.define_split("tile_x", out_dim, num_outputs=3) + cfg.define_split("tile_k", in_dim, num_outputs=2) + if cfg.is_fallback: + _default_dense_pack_config(cfg, batch, out_dim, in_dim) + + packw_bn = cfg["tile_x"].size[-1] + packw_shape = (out_dim // packw_bn, in_dim, packw_bn) + packw = tvm.compute(packw_shape, + lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") + + k = tvm.reduce_axis((0, in_dim), name="k") + C = tvm.compute((batch, out_dim), + lambda y, x: tvm.sum( + data[y, k] * packw[x // packw_bn, k, x % packw_bn], + axis=k), + tag="dense_pack") + if bias is not None: + C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], + tag=tag.BROADCAST) + return C + + +# Declare dense compute without packing weight +@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") +def _declaration_dense_nopack(cfg, data, weight, bias=None): + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) + # create tuning space + cfg.define_split("tile_x", out_dim, 
num_outputs=2) + cfg.define_split("tile_y", batch, num_outputs=2) + cfg.define_split("tile_k", in_dim, num_outputs=2) + if cfg.is_fallback: + _default_dense_nopack_config(cfg, batch, out_dim, in_dim) + + vec = cfg["tile_k"].size[-1] + k = tvm.reduce_axis((0, in_dim // vec), "k") + CC = tvm.compute((batch, out_dim, vec), + lambda z, y, x: tvm.sum( + data[z, k * vec + x] * weight[y, k * vec + x], axis=k)) + + kk = tvm.reduce_axis((0, vec), "kk") + C = tvm.compute((batch, out_dim), + lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), + tag="dense_nopack") + if bias is not None: + C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], + tag=tag.BROADCAST) + + return C + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") +def _schedule_dense(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense_pack" in op.tag: + _schedule_dense_pack_template(cfg, s, op.output(0)) + elif 'dense_nopack' in op.tag: + _schedule_dense_nopack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") +def _schedule_dense_pack(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense_pack" in op.tag: + _schedule_dense_pack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack") +def _schedule_dense_nopack(cfg, outs): + s = tvm.create_schedule([x.op for x in outs]) + + def _callback(op): + if 'dense_nopack' in op.tag: + _schedule_dense_nopack_template(cfg, s, op.output(0)) + traverse_inline(s, outs[0].op, _callback) + return s + + +def _schedule_dense_pack_template(cfg, s, C): + A, packedB = s[C].op.input_tensors + + CC = s.cache_write(C, "global") + y, x = s[C].op.axis + k, = s[CC].op.reduce_axis + + yt, yo, yi = cfg["tile_y"].apply(s, C, y) + xt, xo, xi = 
cfg["tile_x"].apply(s, C, x) + s[C].reorder(yt, xt, yo, xo, yi, xi) + xyt = s[C].fuse(yt, xt) + s[C].parallel(xyt) + xyo = s[C].fuse(yo, xo) + s[C].unroll(yi) + s[C].vectorize(xi) + + s[CC].compute_at(s[C], xyo) + y, x = s[CC].op.axis + ko, ki = cfg["tile_k"].apply(s, CC, k) + s[CC].reorder(ko, ki, y, x) + s[CC].vectorize(x) + s[CC].unroll(y) + s[CC].unroll(ki) + + z, y, x = s[packedB].op.axis + s[packedB].reorder(z, x, y) + s[packedB].parallel(z) + s[packedB].vectorize(y) + return s + + +def _schedule_dense_nopack_template(cfg, s, C): + y, x = s[C].op.axis + kk, = s[C].op.reduce_axis + yo, yi = cfg["tile_y"].apply(s, C, y) + xo, xi = cfg["tile_x"].apply(s, C, x) + s[C].reorder(yo, xo, yi, xi) + xyo = s[C].fuse(yo, xo) + s[C].parallel(xyo) + s[C].unroll(kk) + + CC, = s[C].op.input_tensors + s[CC].compute_at(s[C], xyo) + z, y, x = s[CC].op.axis + k, = s[CC].op.reduce_axis + yz = s[CC].fuse(z, y) + s[CC].reorder(k, yz, x) + s[CC].unroll(yz) + s[CC].vectorize(x) + return s + + +def _default_dense_pack_config(cfg, M, N, K): + vec_width = get_fp32_len() + + tilex_ii = 1 + for bn in range(vec_width*2, 0, -1): + if N % bn == 0: + tilex_ii = bn + break + NN = N // tilex_ii + tilex_oi = 1 + while NN // tilex_oi > 4: + if (NN // tilex_oi) % 2 == 1: + break + tilex_oi *= 2 + + tiley_ii = 8 + while M % tiley_ii != 0: + tiley_ii //= 2 + MM = M // tiley_ii + tiley_oi = 1 + while MM // tiley_oi > 4: + if (MM // tiley_oi) % 2 == 1: + break + tiley_oi *= 2 + + cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii]) + cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii]) + cfg["tile_k"] = SplitEntity([K, 1]) + + +def _default_dense_nopack_config(cfg, M, N, K): + vec_width = get_fp32_len() + tilek_bn = 1 + for bn in range(vec_width*2, 0, -1): + if K % bn == 0: + tilek_bn = bn + break + cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn]) + cfg["tile_x"] = SplitEntity([N, 1]) + cfg["tile_y"] = SplitEntity([1, M]) diff --git a/topi/python/topi/x86/nn.py 
b/topi/python/topi/x86/nn.py index ab6dda40cc9d..73463242e96d 100644 --- a/topi/python/topi/x86/nn.py +++ b/topi/python/topi/x86/nn.py @@ -2,12 +2,7 @@ """x86 nn operators""" from __future__ import absolute_import as _abs import tvm -from tvm import autotvm -from tvm.autotvm.task.space import SplitEntity - -from .util import get_fp32_len -from .. import generic, tag, nn -from ..util import traverse_inline, get_const_tuple +from .. import generic @generic.schedule_softmax.register(["cpu"]) def schedule_softmax(outs): @@ -37,205 +32,3 @@ def schedule_softmax(outs): else: s[x].parallel(s[x].op.axis[0]) return s - - -@autotvm.register_topi_compute(nn.dense, "cpu", "direct") -def _declaration_dense(cfg, data, weight, bias=None): - batch, _ = get_const_tuple(data.shape) - - # For small batch sizes, don't pack weight into cache-friendly layout - # because of overhead in packing and limited reuse from batch dimension - # TODO(icemelon9): use a more systematic way to determine which schedule to use - if batch <= 16: - return _declaration_dense_nopack(cfg, data, weight, bias) - return _declaration_dense_pack(cfg, data, weight, bias) - - -# Declare dense compute with packing weight into cache-friendly layout -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_pack") -def _declaration_dense_pack(cfg, data, weight, bias=None): - batch, in_dim = get_const_tuple(data.shape) - out_dim, _ = get_const_tuple(weight.shape) - # create tuning space - cfg.define_split("tile_y", batch, num_outputs=3) - cfg.define_split("tile_x", out_dim, num_outputs=3) - cfg.define_split("tile_k", in_dim, num_outputs=2) - if cfg.is_fallback: - _default_dense_pack_config(cfg, batch, out_dim, in_dim) - - packw_bn = cfg["tile_x"].size[-1] - packw_shape = (out_dim // packw_bn, in_dim, packw_bn) - packw = tvm.compute(packw_shape, - lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight") - - k = tvm.reduce_axis((0, in_dim), name="k") - C = tvm.compute((batch, out_dim), - lambda y, x: tvm.sum( - 
data[y, k] * packw[x // packw_bn, k, x % packw_bn], - axis=k), - tag="dense_pack") - if bias is not None: - C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], - tag=tag.BROADCAST) - return C - - -# Declare dense compute without packing weight -@autotvm.register_topi_compute(nn.dense, "cpu", "direct_nopack") -def _declaration_dense_nopack(cfg, data, weight, bias=None): - batch, in_dim = get_const_tuple(data.shape) - out_dim, _ = get_const_tuple(weight.shape) - # create tuning space - cfg.define_split("tile_x", out_dim, num_outputs=2) - cfg.define_split("tile_y", batch, num_outputs=2) - cfg.define_split("tile_k", in_dim, num_outputs=2) - if cfg.is_fallback: - _default_dense_nopack_config(cfg, batch, out_dim, in_dim) - - vec = cfg["tile_k"].size[-1] - k = tvm.reduce_axis((0, in_dim // vec), "k") - CC = tvm.compute((batch, out_dim, vec), - lambda z, y, x: tvm.sum( - data[z, k * vec + x] * weight[y, k * vec + x], axis=k)) - - kk = tvm.reduce_axis((0, vec), "kk") - C = tvm.compute((batch, out_dim), - lambda y, x: tvm.sum(CC[y, x, kk], axis=kk), - tag="dense_nopack") - if bias is not None: - C = tvm.compute((batch, out_dim), lambda i, j: C[i, j] + bias[j], - tag=tag.BROADCAST) - - return C - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct") -def _schedule_dense(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - elif 'dense_nopack' in op.tag: - _schedule_dense_nopack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_pack") -def _schedule_dense_pack(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def _callback(op): - if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - 
- -@autotvm.register_topi_schedule(generic.schedule_dense, "cpu", "direct_nopack") -def _schedule_dense_nopack(cfg, outs): - s = tvm.create_schedule([x.op for x in outs]) - scheduled_ops = [] - - def _callback(op): - if 'dense_nopack' in op.tag: - _schedule_dense_nopack_template(cfg, s, op.output(0)) - traverse_inline(s, outs[0].op, _callback) - return s - - -def _schedule_dense_pack_template(cfg, s, C): - A, packedB = s[C].op.input_tensors - - CC = s.cache_write(C, "global") - y, x = s[C].op.axis - k, = s[CC].op.reduce_axis - - yt, yo, yi = cfg["tile_y"].apply(s, C, y) - xt, xo, xi = cfg["tile_x"].apply(s, C, x) - s[C].reorder(yt, xt, yo, xo, yi, xi) - xyt = s[C].fuse(yt, xt) - s[C].parallel(xyt) - xyo = s[C].fuse(yo, xo) - s[C].unroll(yi) - s[C].vectorize(xi) - - s[CC].compute_at(s[C], xyo) - y, x = s[CC].op.axis - ko, ki = cfg["tile_k"].apply(s, CC, k) - s[CC].reorder(ko, ki, y, x) - s[CC].vectorize(x) - s[CC].unroll(y) - s[CC].unroll(ki) - - z, y, x = s[packedB].op.axis - s[packedB].reorder(z, x, y) - s[packedB].parallel(z) - s[packedB].vectorize(y) - return s - - -def _schedule_dense_nopack_template(cfg, s, C): - y, x = s[C].op.axis - kk, = s[C].op.reduce_axis - yo, yi = cfg["tile_y"].apply(s, C, y) - xo, xi = cfg["tile_x"].apply(s, C, x) - s[C].reorder(yo, xo, yi, xi) - xyo = s[C].fuse(yo, xo) - s[C].parallel(xyo) - s[C].unroll(kk) - - CC, = s[C].op.input_tensors - s[CC].compute_at(s[C], xyo) - z, y, x = s[CC].op.axis - k, = s[CC].op.reduce_axis - yz = s[CC].fuse(z, y) - s[CC].reorder(k, yz, x) - s[CC].unroll(yz) - s[CC].vectorize(x) - return s - - -def _default_dense_pack_config(cfg, M, N, K): - vec_width = get_fp32_len() - - tilex_ii = 1 - for bn in range(vec_width*2, 0, -1): - if N % bn == 0: - tilex_ii = bn - break - NN = N // tilex_ii - tilex_oi = 1 - while NN // tilex_oi > 4: - if (NN // tilex_oi) % 2 == 1: - break - tilex_oi *= 2 - - tiley_ii = 8 - while M % tiley_ii != 0: - tiley_ii //= 2 - MM = M // tiley_ii - tiley_oi = 1 - while MM // tiley_oi > 4: 
- if (MM // tiley_oi) % 2 == 1: - break - tiley_oi *= 2 - - cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii]) - cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii]) - cfg["tile_k"] = SplitEntity([K, 1]) - - -def _default_dense_nopack_config(cfg, M, N, K): - vec_width = get_fp32_len() - tilek_bn = 1 - for bn in range(vec_width*2, 0, -1): - if K % bn == 0: - tilek_bn = bn - break - cfg["tile_k"] = SplitEntity([K // tilek_bn, tilek_bn]) - cfg["tile_x"] = SplitEntity([N, 1]) - cfg["tile_y"] = SplitEntity([1, M]) diff --git a/topi/src/topi.cc b/topi/src/topi.cc index aac2d1653c78..6fa748547cd9 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -351,6 +352,12 @@ TVM_REGISTER_GLOBAL("topi.nn.dense") *rv = nn::dense(args[0], args[1], args[2]); }); +/* Ops from nn/batch_matmul.h */ +TVM_REGISTER_GLOBAL("topi.nn.batch_matmul") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = nn::batch_matmul(args[0], args[1]); + }); + /* Ops from nn/dilate.h */ TVM_REGISTER_GLOBAL("topi.nn.dilate") .set_body([](TVMArgs args, TVMRetValue *rv) { @@ -589,6 +596,9 @@ TVM_REGISTER_GENERIC_FUNC(schedule_dense) .register_func({ "cuda", "gpu" }, WrapSchedule(topi::cuda::schedule_dense)) .register_func({ "rocm" }, WrapSchedule(topi::rocm::schedule_dense)); +TVM_REGISTER_GENERIC_FUNC(schedule_batch_matmul) +.set_default(WrapSchedule(topi::generic::default_schedule)); + TVM_REGISTER_GENERIC_FUNC(schedule_pool) .set_default(WrapSchedule(topi::generic::default_schedule)) .register_func({ "cpu" }, WrapSchedule(topi::x86::default_schedule)) diff --git a/topi/tests/python/test_topi_batch_matmul.py b/topi/tests/python/test_topi_batch_matmul.py new file mode 100644 index 000000000000..f699d6aa8dcb --- /dev/null +++ b/topi/tests/python/test_topi_batch_matmul.py @@ -0,0 +1,53 @@ +"""Test code for batch_matmul operator""" +import numpy as np +import tvm +import topi +import topi.testing +from 
topi.util import get_const_tuple +from tvm.contrib.pickle_memoize import memoize + +from common import get_all_backend + +def verify_batch_matmul(batch, M, N, K): + x = tvm.placeholder((batch, M, K), name='x') + y = tvm.placeholder((batch, N, K), name='y') + dtype = x.dtype + + # use memoize to pickle the test data for next time use + @memoize("topi.tests.test_topi_batch_matmul") + def get_ref_data(): + a_np = np.random.uniform(size=(batch, M, K)).astype(dtype) + b_np = np.random.uniform(size=(batch, N, K)).astype(dtype) + c_np = topi.testing.batch_matmul(a_np, b_np) + return (a_np, b_np, c_np) + # get the test data + a_np, b_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + out = topi.nn.batch_matmul(x, y) + s = topi.generic.schedule_batch_matmul([out]) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx) + f = tvm.build(s, [x, y, out], device, name="dense") + f(a, b, c) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + + for device in get_all_backend(): + check_device(device) + +def test_batch_matmul(): + verify_batch_matmul(1, 16, 16, 32) + verify_batch_matmul(5, 16, 16, 32) + verify_batch_matmul(5, 16, 20, 32) + verify_batch_matmul(30, 16, 20, 32) + + +if __name__ == "__main__": + test_batch_matmul() From a681e0677cc352219d7d5b6bf074faa475dcb490 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 2 Mar 2019 14:35:23 -0800 Subject: [PATCH 56/93] [ARITH] Analyzer Infra, ConstIntBound, Modular (#2668) --- include/tvm/arithmetic.h | 326 ++++++++++++--- include/tvm/ir.h | 1 + python/tvm/arith.py | 157 ++++++- src/api/api_arith.cc | 56 ++- src/arithmetic/analyzer.cc | 44 ++ src/arithmetic/const_int_bound.cc | 393 ++++++++++++++++++ src/arithmetic/int_op_overflow.h | 78 ++++ 
src/arithmetic/int_set_internal.h | 17 - src/arithmetic/modular.cc | 168 -------- src/arithmetic/modular_set.cc | 344 +++++++++++++++ src/arithmetic/pattern_match.h | 43 +- src/codegen/codegen_common.h | 59 --- src/codegen/llvm/codegen_llvm.cc | 27 +- src/codegen/llvm/codegen_llvm.h | 5 +- src/codegen/spirv/codegen_spirv.cc | 28 +- src/codegen/spirv/codegen_spirv.h | 4 +- src/pass/storage_rewrite.cc | 8 +- tests/cpp/pattern_match_test.cc | 17 + tests/cpp/unittest.mk | 12 - .../unittest/test_arith_const_int_bound.py | 219 ++++++++++ tests/python/unittest/test_arith_modular.py | 32 -- .../python/unittest/test_arith_modular_set.py | 128 ++++++ 22 files changed, 1777 insertions(+), 389 deletions(-) create mode 100644 src/arithmetic/analyzer.cc create mode 100644 src/arithmetic/const_int_bound.cc create mode 100644 src/arithmetic/int_op_overflow.h delete mode 100644 src/arithmetic/modular.cc create mode 100644 src/arithmetic/modular_set.cc delete mode 100644 src/codegen/codegen_common.h delete mode 100644 tests/cpp/unittest.mk create mode 100644 tests/python/unittest/test_arith_const_int_bound.py delete mode 100644 tests/python/unittest/test_arith_modular.py create mode 100644 tests/python/unittest/test_arith_modular_set.py diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h index cc9e5374b888..44b00b5d89fa 100644 --- a/include/tvm/arithmetic.h +++ b/include/tvm/arithmetic.h @@ -9,14 +9,282 @@ #include #include #include +#include #include "expr.h" namespace tvm { - +// forward delcare Tensor class Tensor; - /*! \brief namespace of arithmetic */ namespace arith { +//------------------------------------------------------- +// Base integer analysis API. +// +// We have multiple type of analyzers to do relaxed +// integer set analysis(bound analysis, modulo) and +// equivalence checking and simplification. +// +// Importantly, each analyzer may need result from +// another analyzer. 
+//------------------------------------------------------- + +// Forward declare Analyzer +class Analyzer; +/*! + * \brief reference class to ConstIntBoundNode + * \sa ConstIntBoundNode + */ +class ConstIntBound; +/*! + * \brief Constant integer up and lower bound(inclusive). + * Useful for value bound analysis. + * + * set = [min_value, max_value] + */ +class ConstIntBoundNode : public Node { + public: + int64_t min_value; + int64_t max_value; + + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("min_value", &min_value); + v->Visit("max_value", &max_value); + } + + TVM_DLL static ConstIntBound make(int64_t min_value, int64_t max_value); + + /*! \brief Number to represent +inf */ + static const constexpr int64_t kPosInf = std::numeric_limits::max(); + /*! + * \brief Number to represent -inf + * \note We can make use the of fact that -kPosInf == kNegInf in the project. + */ + static const constexpr int64_t kNegInf = -kPosInf; + + static constexpr const char* _type_key = "arith.ConstIntBound"; + TVM_DECLARE_NODE_TYPE_INFO(ConstIntBoundNode, Node); +}; + +TVM_DEFINE_NODE_REF(ConstIntBound, ConstIntBoundNode); + +/*! + * \brief Analyzer to get constant integer bound over expression. + */ +class ConstIntBoundAnalyzer { + public: + /*! + * \brief analyze the expr + * \param expr The expression of interest. + * \return the result of the analysis. + */ + ConstIntBound operator()(const Expr& expr); + + /*! + * \brief Update constant int bound information of var. + * + * \param var The variable of interest. + * \param info The bound information. + * \param override Whether do we allow override of existing information. + */ + void Update(const Var& var, + const ConstIntBound& info, + bool override = false); + /*! + * \brief Bind variable to a range. + * + * \param var The variable. + * \param range The range we bind to. 
+ */ + void Bind(const Var& var, const Range& range); + + private: + friend class Analyzer; + friend class ConstraintContext; + explicit ConstIntBoundAnalyzer(Analyzer* parent); + ~ConstIntBoundAnalyzer(); + /*! + * \brief Update the internal state to enter constraint. + * \param constraint A constraint expression. + * + * \return an exit function that must be called to cleanup the constraint can be nullptr. + */ + std::function EnterConstraint(const Expr& constraint); + struct Entry; + class Impl; + /*! \brief Internal impl */ + Impl* impl_; +}; + +/*! + * \brief reference of ModularSetNode + * \sa ModularSetNode + */ +class ModularSet; +/*! + * \brief Range of a linear integer function. + * Use to do specify the possible index values. + * + * set = { coeff * x + base | x in Z } + * + * When coeff != 0, it can also be written as + * set = { n | n % coeff == base } + * + * This is useful to decide if the index is dividable by certain value. + * For example, if index = 0 + 4 x, then we know it can be divided by 4. + */ +class ModularSetNode : public Node { + public: + /*! \brief linear co-efficient */ + int64_t coeff; + /*! \brief The base */ + int64_t base; + + void VisitAttrs(tvm::AttrVisitor* v) final { + v->Visit("coeff", &coeff); + v->Visit("base", &base); + } + + TVM_DLL static ModularSet make(int64_t coeff, int64_t base); + + static constexpr const char* _type_key = "arith.ModularSet"; + TVM_DECLARE_NODE_TYPE_INFO(ModularSetNode, Node); +}; + +TVM_DEFINE_NODE_REF(ModularSet, ModularSetNode); + +/*! + * \brief Analyzer to get modular information over expression. + */ +class ModularSetAnalyzer { + public: + /*! + * \brief analyze the expr + * \param expr The expression of interest. + * \return the result of the analysis. + */ + ModularSet operator()(const Expr& expr); + /*! + * \brief Update constant int bound information of var. + * + * \param var The variable of interest. + * \param info The bound information. 
+ * \param override Whether do we allow override of existing information. + */ + void Update(const Var& var, + const ModularSet& info, + bool override = false); + + private: + friend class Analyzer; + friend class ConstraintContext; + explicit ModularSetAnalyzer(Analyzer* parent); + ~ModularSetAnalyzer(); + /*! + * \brief Update the internal state to enter constraint. + * \param constraint A constraint expression. + * + * \return an exit function that must be called to cleanup the constraint can be nullptr. + */ + std::function EnterConstraint(const Expr& constraint); + struct Entry; + class Impl; + /*! \brief Internal impl */ + Impl* impl_; +}; + +/*! + * \brief A RAII constraint context. + * + * \code + * + * Var("x"); + * arith::Analyzer analyzer; + * { + * arith::ConstraintContext cctx(&analyzer, x % 3 == 0); + * CHECK_EQ(analyzer.modular_set(x)->coeff, 3); + * } + * // constraint no longer in effect. + * CHECK_NE(analyzer.modular_set(x)->coeff, 3); + * + * \endcode + */ +class ConstraintContext { + public: + /*! + * \brief Construct a constraint context. + * \param analyzer The analyzer. + * \param constraint The constraint to be applied. + */ + ConstraintContext(Analyzer* analyzer, const Expr& constraint) DMLC_THROW_EXCEPTION; + /*! \brief destructor */ + ~ConstraintContext() DMLC_THROW_EXCEPTION { + exit_(); + } + + private: + /*! \brief function to be called in recovery */ + std::function exit_; +}; + +/*! + * \brief Analyzer that contains bunch of sub-analyzers. + * + * Each sub-analyzer can make use of another sub-analyzer + * by weak reference of this. + * + * NOTE for sub-analyzer developers: + * If the analyzer uses memoization, we need to clear the internal + * cache when information about a Var has been overrideen. + */ +class Analyzer { + public: + /*! \brief sub-analyzer: const integer bound */ + ConstIntBoundAnalyzer const_int_bound; + /*! \brief sub-analyzer: modular set */ + ModularSetAnalyzer modular_set; + /*! 
\brief constructor */ + Analyzer(); + /*! + * \brief Notify all the sub-analyzers that var + * is created and binded to expr. + * + * Each var can only be binded once. + * + * \param var The variable. + * \param expr The expression we bind to. + */ + void Bind(const VarExpr& var, const Expr& expr); + /*! + * \brief Notify all the sub-analyzers that var + * is created and binded to a range. + * + * Each var can only be binded once. + * + * \param var The variable. + * \param range The range we bind to. + */ + void Bind(const VarExpr& var, const Range& range); + /*! + * \brief Whether can we proof expr >= val. + + * Non-negative proof is very useful in integer analysis + * to lower divisions and mods given difference in trunc and ceil mode. + * + * \param expr The expression. + * \param lower_bound The lower bound. + * \return Whether we can proof it. + * + * \note Analyzer will call into sub-analyzers to get the result. + */ + bool CanProveGreaterEqual(const Expr& expr, int64_t lower_bound); +}; + +//----------------------------------------------- +// Integer set abstraction API. +// +// This is a API build on top of the base +// integer analysis API to provide set analysis. +//------------------------------------------------ /*! * \brief Sign of an expression or set. */ @@ -118,42 +386,6 @@ class IntSet : public NodeRef { static IntSet interval(Expr min, Expr max); }; -/*! - * \brief Range of a linear integer function. - * Use to do specify the possible index values. - * - * set = { coeff * x + base | x in Z } - * - * When coeff != 0, it can also be written as - * set = { n | n % coeff == base } - * - * This is useful to decide if the index is dividable by certain value. - * For example, if index = 0 + 4 x, then we know it can be divided by 4. - */ -struct ModularEntry { - /*! \brief linear co-efficient */ - int coeff{1}; - /*! \brief The base */ - int base{0}; - - /*! 
\return entry represent everything */ - static ModularEntry everything() { - // always safe to set 0 + x, so it can be everything. - ModularEntry e; - e.coeff = 1; - e.base = 0; - return e; - } - /*! - * \brief Add two modular entries together to get a new modular entry. - * \param a The left operand. - * \param b The right operand. - * \return The combined modular entry. - */ - static ModularEntry Add(const ModularEntry& a, - const ModularEntry& b); -}; - /*! * \brief Base class of all IntSet containers. */ @@ -300,24 +532,6 @@ IntSet DeduceBound(Expr v, Expr cond, */ Domain DomainTouched(Stmt body, const Tensor &tensor, bool consider_calls, bool consider_provides); -/*! - * \brief Evaluate the expression with modular analysis - * \param e The expression to be evaluated. - * \param mod_map Map of modular statistics of known variables. - * \return The ModularEntry covering all possible value of e. - */ -ModularEntry EvalModular( - const Expr& e, - const std::unordered_map& mod_map); - -/*! - * \brief Same as EvalModular, used by front-end. - * \param e The expression to be evaluated. - * \param mod_map Map of modular statistics of known variables. - * \return A ModularSet covering all possible value of e. 
- */ -IntSet EvalModular(const Expr& e, - const Map& mod_map); // implementation inline const IntSetNode* IntSet::operator->() const { return static_cast(node_.get()); diff --git a/include/tvm/ir.h b/include/tvm/ir.h index 3ef955e834d0..0f05c98e0722 100644 --- a/include/tvm/ir.h +++ b/include/tvm/ir.h @@ -17,6 +17,7 @@ namespace tvm { namespace ir { +using HalideIR::Internal::BaseExprNode; using HalideIR::Internal::ExprNode; using HalideIR::Internal::StmtNode; using HalideIR::Internal::IRNodeType; diff --git a/python/tvm/arith.py b/python/tvm/arith.py index 778d761c659e..92aaa36aa10f 100644 --- a/python/tvm/arith.py +++ b/python/tvm/arith.py @@ -33,9 +33,162 @@ class StrideSet(IntSet): """Represent set of strided integers""" -@register_node -class ModularSet(IntSet): +@register_node("arith.ModularSet") +class ModularSet(NodeBase): """Represent range of (coeff * x + base) for x in Z """ + def __init__(self, coeff, base): + self.__init_handle_by_constructor__( + _make_ModularSet, coeff, base) + + +@register_node("arith.ConstIntBound") +class ConstIntBound(NodeBase): + """Represent constant integer bound + + Parameters + ---------- + min_value : int + The minimum value of the bound. + + max_value : int + The maximum value of the bound. + """ + POS_INF = (1 << 63) - 1 + NEG_INF = -POS_INF + + def __init__(self, min_value, max_value): + self.__init_handle_by_constructor__( + _make_ConstIntBound, min_value, max_value) + + +class ConstraintScope: + """Constraint scope. + + Parameters + ---------- + fenter : function + A function that will be called to create an enter context. 
+ + Note + ---- + Do not create object directly, use Analyzer.constraint_scope + """ + def __init__(self, fenter): + self._fenter = fenter + self._fexit = None + + def __enter__(self): + self._fexit = self._fenter() + + def __exit__(self, ptype, value, trace): + self._fexit() + + +class Analyzer: + """Integer arithmetic analyzer + + This is a stateful analyzer class that can + be used to perform various symbolic integer analysis. + """ + def __init__(self): + _mod = _CreateAnalyzer() + self._const_int_bound = _mod("const_int_bound") + self._const_int_bound_update = _mod("const_int_bound_update") + self._bind = _mod("bind") + self._modular_set = _mod("modular_set") + self._enter_constraint_context = _mod("enter_constraint_context") + + def const_int_bound(self, expr): + """Find constant integer bound for expr. + + Parameters + ---------- + expr : tvm.Expr + The expression. + + Returns + ------- + bound : ConstIntBound + The result bound + """ + return self._const_int_bound(expr) + + def modular_set(self, expr): + """Find a modular set that expr belongs to. + + Parameters + ---------- + expr : tvm.Expr + The expression. + + Returns + ------- + result : ModularSet + The result. + """ + return self._modular_set(expr) + + def bind(self, var, expr): + """Bind a variable to the expression. + + Parameters + ---------- + var : tvm.Var + The variable. + + expr : tvm.Expr + The expression. + """ + return self._bind(var, expr) + + def constraint_scope(self, constraint): + """Create a constraint scope. + + Parameters + ---------- + constraint : tvm.Expr + The constraint expression. + + returns + ------- + scope : ConstraintScope + The constraint scope + + Examples + -------- + .. 
code-block:: python + + x = tvm.var("x") + analyzer = tvm.arith.Analyzer() + with analzyer.constraint_scope(x % 3 == 0): + # constraint in effect + assert analyzer.modular_set(x).coeff == 3 + # constraint no longer in effect + assert analyzer.modular_set(x).coeff != 3 + """ + def _fenter(): + return self._enter_constraint_context(constraint) + return ConstraintScope(_fenter) + + def update(self, var, info, override=False): + """Update infomation about var + + Parameters + ---------- + var : tvm.Var + The variable. + + info : tvm.NodeBase + Related information. + + override : bool + Whether allow override. + """ + if isinstance(info, ConstIntBound): + self._const_int_bound_update(var, info, override) + else: + raise TypeError( + "Do not know how to handle type {}".format(type(info))) _init_api("tvm.arith") diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc index 31ff5ccb3a15..cba70370f5b6 100644 --- a/src/api/api_arith.cc +++ b/src/api/api_arith.cc @@ -26,11 +26,6 @@ TVM_REGISTER_API("arith.intset_interval") *ret = IntSet::interval(args[0], args[1]); }); -TVM_REGISTER_API("arith.EvalModular") -.set_body([](TVMArgs args, TVMRetValue *ret) { - *ret = EvalModular(args[0], Map()); - }); - TVM_REGISTER_API("arith.DetectLinearEquation") .set_body([](TVMArgs args, TVMRetValue *ret) { *ret = DetectLinearEquation(args[0], args[1]); @@ -75,5 +70,56 @@ TVM_REGISTER_API("_IntSetIsEverything") *ret = args[0].operator IntSet().is_everything(); }); +TVM_REGISTER_API("arith._make_ConstIntBound") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = ConstIntBoundNode::make(args[0], args[1]); + }); + +TVM_REGISTER_API("arith._make_ModularSet") +.set_body([](TVMArgs args, TVMRetValue* ret) { + *ret = ModularSetNode::make(args[0], args[1]); + }); + +TVM_REGISTER_API("arith._CreateAnalyzer") +.set_body([](TVMArgs args, TVMRetValue* ret) { + using runtime::PackedFunc; + using runtime::TypedPackedFunc; + auto self = std::make_shared(); + auto f = [self](std::string name) -> 
PackedFunc { + if (name == "const_int_bound") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->const_int_bound(args[0]); + }); + } else if (name == "modular_set") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->modular_set(args[0]); + }); + } else if (name == "const_int_bound_update") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + self->const_int_bound.Update(args[0], args[1], args[2]); + }); + } else if (name == "bind") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + auto& sptr = args[1].node_sptr(); + if (sptr->is_type()) { + self->Bind(args[0], args[1].operator Range()); + } else { + self->Bind(args[0], args[1].operator Expr()); + } + }); + } else if (name == "enter_constraint_context") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + auto ctx = std::make_shared(self.get(), args[0]); + auto fexit = [ctx](TVMArgs, TVMRetValue*) mutable { + ctx.reset(); + }; + *ret = PackedFunc(fexit); + }); + } + return PackedFunc(); + }; + *ret = TypedPackedFunc(f); +}); + } // namespace arith } // namespace tvm diff --git a/src/arithmetic/analyzer.cc b/src/arithmetic/analyzer.cc new file mode 100644 index 000000000000..236a21ba71f5 --- /dev/null +++ b/src/arithmetic/analyzer.cc @@ -0,0 +1,44 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/arithmetic/analyzer.cc + */ +#include + +namespace tvm { +namespace arith { + +Analyzer::Analyzer() + : const_int_bound(this), + modular_set(this) { +} + +void Analyzer::Bind(const VarExpr& v, const Expr& expr) { + Var var(v.node_); + this->const_int_bound.Update(var, this->const_int_bound(expr)); + this->modular_set.Update(var, this->modular_set(expr)); +} + +void Analyzer::Bind(const VarExpr& v, const Range& range) { + Var var(v.node_); + this->const_int_bound.Bind(var, range); + // skip modular_set +} + +ConstraintContext::ConstraintContext(Analyzer* analyzer, const Expr& constraint) { + // entering the scope. 
+ auto f0 = analyzer->const_int_bound.EnterConstraint(constraint); + auto f1 = analyzer->modular_set.EnterConstraint(constraint); + // recovery function. + exit_ = [f0, f1]() { + if (f1 != nullptr) f1(); + if (f0 != nullptr) f0(); + }; +} + +bool Analyzer::CanProveGreaterEqual(const Expr& expr, int64_t lower_bound) { + auto bd = this->const_int_bound(expr); + if (bd->min_value >= lower_bound) return true; + return false; +} +} // namespace arith +} // namespace tvm diff --git a/src/arithmetic/const_int_bound.cc b/src/arithmetic/const_int_bound.cc new file mode 100644 index 000000000000..c83be8933b55 --- /dev/null +++ b/src/arithmetic/const_int_bound.cc @@ -0,0 +1,393 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file tvm/arithmetic/const_int_bound.cc + */ +#include +#include +#include +#include "int_op_overflow.h" + +namespace tvm { +namespace arith { + +using namespace ir; + +TVM_REGISTER_NODE_TYPE(ConstIntBoundNode); + +ConstIntBound ConstIntBoundNode::make( + int64_t min_value, int64_t max_value) { + auto node = make_node(); + node->min_value = min_value; + node->max_value = max_value; + return ConstIntBound(node); +} + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const ConstIntBoundNode *op, IRPrinter *p) { + p->stream << "ConstIntBound" + << "[" << op->min_value << ", " + << op->max_value << ']'; + }); + +// internal entry for const int bound +struct ConstIntBoundAnalyzer::Entry { + int64_t min_value; + int64_t max_value; + + bool is_const(int64_t value) const { + return min_value == max_value && min_value == value; + } +}; + +class ConstIntBoundAnalyzer::Impl : + public ExprFunctor { + public: + void Bind(const Var& var, const Range& range) { + Entry a = VisitExpr(range->min); + Entry b = VisitExpr(range->extent); + Entry ret; + ret.min_value = a.min_value; + ret.max_value = InfAwareAdd(a.max_value, InfAwareAdd(b.max_value, -1)); + Update(var, ret, false); + } + + void Update(const Var& var, + const Entry& info, + bool override) { + if 
(!override) { + CHECK(!var_map_.count(var)); + } + var_map_[var] = info; + } + + void Update(const Var& var, + const ConstIntBound& info, + bool override) { + Update(var, MakeBound(info->min_value, info->max_value), override); + } + + // Override visitor behaviors + Entry VisitExprDefault_(const Node* op) final { + return Everything( + static_cast(op)->type); + } + + Entry VisitExpr_(const Cast* op) final { + Entry a = VisitExpr(op->value); + Entry b = Everything(op->type); + return Intersect(a, b); + } + + Entry VisitExpr_(const IntImm* op) final { + return MakeBound(op->value, op->value); + } + + Entry VisitExpr_(const UIntImm* op) final { + if (op->value <= static_cast(kPosInf)) { + return MakeBound(op->value, op->value); + } else { + return Everything(op->type); + } + } + + Entry VisitExpr_(const Add* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + Entry ret; + ret.min_value = InfAwareAdd(a.min_value, b.min_value); + ret.max_value = InfAwareAdd(a.max_value, b.max_value); + return ret; + } + + Entry VisitExpr_(const Sub* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + Entry ret; + ret.min_value = InfAwareAdd(a.min_value, -b.max_value); + ret.max_value = InfAwareAdd(a.max_value, -b.min_value); + return ret; + } + + Entry VisitExpr_(const Mul* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + return BinaryOpBoundry(a, b, InfAwareMul); + } + + Entry VisitExpr_(const Div* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + CHECK(!b.is_const(0)) << "divide by zero"; + // assume no division by 0 + if (b.min_value == 0) b.min_value = 1; + if (b.max_value == 0) b.max_value = -1; + return BinaryOpBoundry(a, b, InfAwareDiv); + } + + Entry VisitExpr_(const Mod* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + if (b.min_value > 0) { + int64_t b_max_cap = InfAwareAdd(b.max_value, -1); + if (a.min_value >= 0) { + // 0 <= [a_min, a_max] < b_min + if 
(a.max_value < b.min_value) return a; + // other case, we can get close to 0 + return MakeBound(0, + std::min(a.max_value, b_max_cap)); + } else { + return MakeBound(std::max(a.min_value, -b_max_cap), + std::min(a.max_value, b_max_cap)); + } + } else { + CHECK(!b.is_const(0)) << "mod by zero"; + // mod by negative value is rare, + // and we just use the simpliest rule. + return Everything(op->type); + } + } + + Entry VisitExpr_(const Min* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + Entry ret; + ret.min_value = std::min(a.min_value, b.min_value); + ret.max_value = std::min(a.max_value, b.max_value); + return ret; + } + + Entry VisitExpr_(const Max* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + Entry ret; + ret.min_value = std::max(a.min_value, b.min_value); + ret.max_value = std::max(a.max_value, b.max_value); + return ret; + } + + Entry VisitExpr_(const Select* op) final { + Entry a = VisitExpr(op->true_value); + Entry b = VisitExpr(op->false_value); + return Union(a, b); + } + + Entry VisitExpr_(const Call* op) final { + // only special handle >> and & which can be + // used for index calculation. + if (op->is_intrinsic(Call::shift_right)) { + return VisitRightShift(op); + } else if (op->is_intrinsic(Call::bitwise_and)) { + return VisitBitwiseAnd(op); + } else { + return Everything(op->type); + } + } + + Entry VisitExpr_(const Variable* op) final { + Var v = GetRef(op); + auto it = var_map_.find(v); + if (it != var_map_.end()) { + return it->second; + } else { + return Everything(op->type); + } + } + + Entry VisitRightShift(const Call* op) { + Entry a = VisitExpr(op->args[0]); + Entry b = VisitExpr(op->args[1]); + return BinaryOpBoundry(a, b, InfAwareRightShift); + } + + Entry VisitBitwiseAnd(const Call* op) { + Entry a = VisitExpr(op->args[0]); + Entry b = VisitExpr(op->args[1]); + // handle positive index case. 
 + if (a.min_value >= 0 && b.min_value >= 0) { + return MakeBound(0, std::min(a.max_value, b.max_value)); + } else { + if (b.min_value >= 0) { + return MakeBound(0, b.max_value); + } + if (a.min_value >= 0) { + return MakeBound(0, a.max_value); + } + return Everything(op->type); + } + } + + private: + // internal variable map + std::unordered_map var_map_; + // constants: the limit value means unlimited + // NOTE: kNegInf/kPosInf are used to represent infinity. + static const constexpr int64_t kNegInf = ConstIntBoundNode::kNegInf; + static const constexpr int64_t kPosInf = ConstIntBoundNode::kPosInf; + static_assert(-kNegInf == kPosInf, "invariant of inf"); + // internal helper functions + /*! + * \brief Get boundary of binary op who are monotonic wrt to one argument. + * \param a The entry of the left operand. + * \param b The entry of the right operand. + * \param op The operator. + * \tparam F the operator function type. + * \return The result. + */ + template + static Entry BinaryOpBoundry(Entry a, Entry b, const F& op) { + Entry ret; + // The boundary point must be a shift of the original boundary. + int64_t v1 = op(a.min_value, b.min_value); + int64_t v2 = op(a.max_value, b.max_value); + int64_t v3 = op(a.min_value, b.max_value); + int64_t v4 = op(a.max_value, b.min_value); + ret.min_value = std::min(std::min(std::min(v1, v2), v3), v4); + ret.max_value = std::max(std::max(std::max(v1, v2), v3), v4); + return ret; + } + /*! + * \brief Compute x + y, aware of inf. + * \param x The left operand. + * \param y The right operand. + * \return the result. + */ + static int64_t InfAwareAdd(int64_t x, int64_t y) { + if (x == kPosInf) { + CHECK(y != kNegInf); + return kPosInf; + } + if (x == kNegInf) { + CHECK(y != kPosInf); + return kNegInf; + } + if (y == kPosInf || y == kNegInf) return y; + if (WillOverflow(x, y, kNegInf, kPosInf)) { + if (x > 0) return kPosInf; + return kNegInf; + } + return x + y; + } + /*! + * \brief Compute x * y, aware of inf. 
+ * \param x The left operand. + * \param y The right operand. + * \return the result. + */ + static int64_t InfAwareMul(int64_t x, int64_t y) { + if (!WillOverflow(x, y, kNegInf, kPosInf)) return x * y; + if ((x > 0 && y > 0) || (x < 0 && y < 0)) return kPosInf; + return kNegInf; + } + /*! + * \brief Compute x / y, aware of inf. + * \param x The left operand. + * \param y The right operand. + * \return the result. + */ + static int64_t InfAwareDiv(int64_t x, int64_t y) { + CHECK_NE(y, 0); + if (x == kPosInf || x == kNegInf) { + if (y > 0) return x; + return -x; + } + return x / y; + } + /*! + * \brief Compute x / y, aware of inf. + * \param x The left operand. + * \param y The right operand. + * \return the result. + */ + static int64_t InfAwareRightShift(int64_t x, int64_t y) { + if (x == kPosInf || x == kNegInf) return x; + return x >> y; + } + /*! + * \brief Make a new bound entry. + */ + static Entry MakeBound(int64_t min_value, int64_t max_value) { + Entry e; + e.min_value = min_value; + e.max_value = max_value; + return e; + } + /*! + * \brief Create union of two sets. + * \param a The left operand. + * \param b the right operand. + */ + static Entry Union(Entry a, Entry b) { + Entry ret; + ret.min_value = std::min(a.min_value, b.min_value); + ret.max_value = std::max(a.max_value, b.max_value); + return ret; + } + /*! + * \brief Create intersect of two sets. + * \param a The left operand. + * \param b the right operand. + */ + static Entry Intersect(Entry a, Entry b) { + Entry ret; + ret.min_value = std::max(a.min_value, b.min_value); + ret.max_value = std::min(a.max_value, b.max_value); + return ret; + } + /*! + * \brief return everything dtype can represent. + * \param dtype The data type. + * \return Bound that represent everything dtype can represent. 
+ */ + static Entry Everything(Type dtype) { + if (!dtype.is_int() && !dtype.is_uint()) { + return MakeBound(kNegInf, kPosInf); + } + Entry ret; + int64_t vbits = dtype.bits() - static_cast(dtype.is_int()); + if (dtype.is_uint()) { + ret.min_value = 0; + } else { + if (vbits >= 63) { + ret.min_value = kNegInf; + } else { + ret.min_value = -(static_cast(1) << vbits); + } + } + if (vbits >= 63) { + ret.max_value = kPosInf; + } else { + ret.max_value = (static_cast(1) << vbits) - 1; + } + return ret; + } +}; + +ConstIntBound ConstIntBoundAnalyzer::operator()(const Expr& expr) { + Entry ret = impl_->VisitExpr(expr); + return ConstIntBoundNode::make(ret.min_value, ret.max_value); +} + +void ConstIntBoundAnalyzer::Update(const Var& var, + const ConstIntBound& info, + bool override) { + impl_->Update(var, info, override); +} + +void ConstIntBoundAnalyzer::Bind(const Var& var, const Range& range) { + impl_->Bind(var, range); +} + +std::function ConstIntBoundAnalyzer::EnterConstraint(const Expr& constraint) { + return nullptr; +} + +ConstIntBoundAnalyzer::ConstIntBoundAnalyzer(Analyzer* parent) + : impl_(new Impl()) { +} + +ConstIntBoundAnalyzer::~ConstIntBoundAnalyzer() { + delete impl_; +} + +} // namespace arith +} // namespace tvm diff --git a/src/arithmetic/int_op_overflow.h b/src/arithmetic/int_op_overflow.h new file mode 100644 index 000000000000..ef637b4b9521 --- /dev/null +++ b/src/arithmetic/int_op_overflow.h @@ -0,0 +1,78 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file int_op_overflow.h + * \brief Utility functions to detect if an integer op will overflow. + */ +#ifndef TVM_ARITHMETIC_INT_OP_OVERFLOW_H_ +#define TVM_ARITHMETIC_INT_OP_OVERFLOW_H_ + +#include + +namespace tvm { +namespace arith { + +/*! + * \brief Check if an integer op with operand x, y will overflow. + * \param x The left operand. + * \param y The left operand. + * \param min_value The minimum value of the domain. + * \param max_value The maximum value of the domain. 
+ * \return Whether overflow can happen. + * \tparam Op The integer operator. + */ +template +inline bool WillOverflow(int64_t x, + int64_t y, + int64_t min_value, + int64_t max_value) { + return false; +} + +template<> +bool WillOverflow(int64_t x, + int64_t y, + int64_t min_value, + int64_t max_value) { + if ((y > 0) && (x > max_value - y)) return true; + if ((y < 0) && (x < min_value - y)) return true; + return false; +} + +template<> +bool WillOverflow(int64_t x, + int64_t y, + int64_t min_value, + int64_t max_value) { + if ((y > 0) && (x < min_value + y)) return true; + if ((y < 0) && (x > max_value + y)) return true; + return false; +} + +template<> +bool WillOverflow(int64_t x, + int64_t y, + int64_t min_value, + int64_t max_value) { + if (y == 0) return false; + if (y > 0) { + if (x < min_value / y) return true; + if (x > max_value / y) return true; + } else { + if (y == -1 && x == std::numeric_limits::min()) return true; + if (x > min_value / y) return true; + if (x < max_value / y) return true; + } + return false; +} + +template<> +bool WillOverflow(int64_t x, + int64_t y, + int64_t min_value, + int64_t max_value) { + return y == 0; +} + +} // namespace arith +} // namespace tvm +#endif // TVM_ARITHMETIC_INT_OP_OVERFLOW_H_ diff --git a/src/arithmetic/int_set_internal.h b/src/arithmetic/int_set_internal.h index e28fe2a9d958..cc2a4c307997 100644 --- a/src/arithmetic/int_set_internal.h +++ b/src/arithmetic/int_set_internal.h @@ -54,23 +54,6 @@ struct StrideSet : public IntSetNode { TVM_DECLARE_NODE_TYPE_INFO(StrideSet, IntSetNode); }; -/*! - * \brief Set represented by range of ModularEntry. - * Used for front-end modular analysis. - */ -struct ModularSet : public IntSetNode { - /*! 
\brief Internal modular entry */ - ModularEntry e; - - void VisitAttrs(AttrVisitor* v) final { - v->Visit("base", &(e.base)); - v->Visit("coeff", &(e.coeff)); - } - static constexpr const char* _type_key = "ModularSet"; - TVM_DECLARE_NODE_TYPE_INFO(ModularSet, IntSetNode); -}; - - } // namespace arith } // namespace tvm diff --git a/src/arithmetic/modular.cc b/src/arithmetic/modular.cc deleted file mode 100644 index d79300eb7782..000000000000 --- a/src/arithmetic/modular.cc +++ /dev/null @@ -1,168 +0,0 @@ -/*! - * Copyright (c) 2017 by Contributors - * \file modular.cc - * \brief Modular analysis - */ -#include -#include -#include -#include -#include -#include "int_set_internal.h" - -namespace tvm { -namespace arith { - -using namespace ir; - -class ModularEvaluator - : public ExprFunctor { - public: - explicit ModularEvaluator( - const std::unordered_map< - const Variable*, ModularEntry>& mod_map) - : mod_map_(mod_map) { - } - ModularEntry Eval(const Expr& e) { - return VisitExpr(e); - } - // default - ModularEntry VisitExprDefault_(const Node*) final { - return ModularEntry::everything(); - } - // override combination rules. 
- ModularEntry VisitExpr_(const IntImm* op) final { - if (op->value < std::numeric_limits::max()) { - ModularEntry ret; - ret.base = static_cast(op->value); - ret.coeff = 0; - return ret; - } else { - return ModularEntry::everything(); - } - } - ModularEntry VisitExpr_(const UIntImm* op) final { - if (op->value < static_cast( - std::numeric_limits::max())) { - ModularEntry ret; - ret.base = static_cast(op->value); - ret.coeff = 0; - return ret; - } else { - return ModularEntry::everything(); - } - } - ModularEntry VisitExpr_(const Variable* op) final { - auto it = mod_map_.find(op); - if (it != mod_map_.end()) { - return it->second; - } else { - return ModularEntry::everything(); - } - } - ModularEntry VisitExpr_(const Add* op) final { - ModularEntry a = Eval(op->a); - ModularEntry b = Eval(op->b); - ModularEntry ret; - ret.coeff = ZeroAwareGCD(a.coeff, b.coeff); - ret.base = BaseSimplify(a.base + b.base, ret.coeff); - return ret; - } - ModularEntry VisitExpr_(const Sub* op) final { - ModularEntry a = Eval(op->a); - ModularEntry b = Eval(op->b); - ModularEntry ret; - ret.coeff = ZeroAwareGCD(a.coeff, b.coeff); - ret.base = BaseSimplify(a.base - b.base, ret.coeff); - return ret; - } - ModularEntry VisitExpr_(const Mul* op) final { - ModularEntry a = Eval(op->a); - ModularEntry b = Eval(op->b); - // Simplification rule, x, y, z are in Z - // (p x + n) (q y + m) - // -> pq xy + pm x + qn y + mn - // -> pq z + pm x + qn y + mn - int pq = a.coeff * b.coeff; - int pm = a.coeff * b.base; - int qn = a.base * b.coeff; - ModularEntry ret; - ret.coeff = ZeroAwareGCD(pq, ZeroAwareGCD(pm, qn)); - ret.base = BaseSimplify(a.base * b.base, ret.coeff); - return ret; - } - ModularEntry VisitExpr_(const Div* op) final { - // a c x / c -> a x - // We cannot do cases where offset is non-zero - // because of different integer rounding in pos/neg - ModularEntry a = Eval(op->a); - ModularEntry b = Eval(op->b); - if (b.coeff == 0 && - a.base == 0) { - CHECK_NE(b.base, 0); - if (a.coeff % 
b.base == 0) { - ModularEntry ret; - ret.coeff = a.coeff / b.base; - ret.base = 0; - return ret; - } - } - return ModularEntry::everything(); - } - - private: - const std::unordered_map< - const Variable*, ModularEntry>& mod_map_; - friend struct ModularEntry; - // simplify the base by putting it in range. - static int BaseSimplify(int base, int coeff) { - if (coeff == 0) return base; - base = base % coeff; - if (base < 0) base += coeff; - return base; - } - static int ZeroAwareGCD(int a, int b) { - CHECK_GE(a, 0); - CHECK_GE(b, 0); - if (a < b) std::swap(a, b); - if (b == 0) return a; - // perform GCD (greatest common divisor) - // ax + by = gcd(a, b) z if a != 0, b != 0 - while (a % b != 0) { - a = a % b; - std::swap(a, b); - } - return b; - } -}; - -ModularEntry ModularEntry::Add(const ModularEntry& a, - const ModularEntry& b) { - ModularEntry ret; - ret.coeff = ModularEvaluator::ZeroAwareGCD(a.coeff, b.coeff); - ret.base = ModularEvaluator::BaseSimplify(a.base + b.base, ret.coeff); - return ret; -} - - -ModularEntry EvalModular( - const Expr& e, - const std::unordered_map& mod_map) { - return ModularEvaluator(mod_map)(e); -} - -IntSet EvalModular(const Expr& e, - const Map& mod_map) { - std::unordered_map mmap; - for (auto& kv : mod_map) { - const ModularSet* m = kv.second.as(); - CHECK(m) << "Need to pass ModularSet for Modular Analysis"; - mmap[kv.first.get()] = m->e; - } - NodePtr n = make_node(); - n->e = ModularEvaluator(mmap)(e); - return IntSet(n); -} - -} // namespace arith -} // namespace tvm diff --git a/src/arithmetic/modular_set.cc b/src/arithmetic/modular_set.cc new file mode 100644 index 000000000000..8da6e91fc7fa --- /dev/null +++ b/src/arithmetic/modular_set.cc @@ -0,0 +1,344 @@ +/*! 
 + * Copyright (c) 2019 by Contributors + * \file modular_set.cc + * \brief Modular set analysis + */ +#include +#include +#include +#include +#include "pattern_match.h" + +namespace tvm { +namespace arith { + +using namespace ir; + +TVM_REGISTER_NODE_TYPE(ModularSetNode); + +ModularSet ModularSetNode::make(int64_t coeff, int64_t base) { + auto node = make_node(); + node->coeff = coeff; + node->base = base; + return ModularSet(node); +} + +TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) +.set_dispatch([](const ModularSetNode *op, IRPrinter *p) { + p->stream << "ModularSet(" + << "coeff=" << op->coeff << ", base=" + << op->base << ')'; + }); + + +// internal entry for modular set analysis +struct ModularSetAnalyzer::Entry { + int64_t coeff{1}; + int64_t base{0}; + + bool is_const() const { + return coeff == 0; + } +}; + +class ModularSetAnalyzer::Impl : + public ExprFunctor { + public: + explicit Impl(Analyzer* parent) + : parent_(parent) {} + + void Update(const Var& var, + const ModularSet& info, + bool override) { + if (!override) { + CHECK(!var_map_.count(var)); + } + Entry e; + e.coeff = info->coeff; + e.base = info->base; + var_map_[var] = e; + } + + // Detect useful constraints and use them in the analysis scope. 
+ std::function EnterConstraint(const Expr& constraint) { + PVar var; + PVar coeff, base; + // pattern match interesting constraints + if (((var % coeff) == base).Match(constraint)) { + Entry entry; + entry.coeff = coeff.Eval()->value; + entry.base = base.Eval()->value; + return UpdateByIntersect(var.Eval(), entry); + } + return nullptr; + } + + // Override visitor behaviors + Entry VisitExprDefault_(const Node* op) final { + return Everything(); + } + + Entry VisitExpr_(const Cast* op) final { + return VisitExpr(op->value); + } + + Entry VisitExpr_(const IntImm* op) final { + Entry ret; + ret.base = op->value; + ret.coeff = 0; + return ret; + } + + Entry VisitExpr_(const UIntImm* op) final { + if (op->value < std::numeric_limits::max()) { + Entry ret; + ret.base = static_cast(op->value); + ret.coeff = 0; + return ret; + } else { + return Everything(); + } + } + + Entry VisitExpr_(const Add* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + Entry ret; + ret.coeff = ZeroAwareGCD(a.coeff, b.coeff); + ret.base = BaseSimplify(a.base + b.base, ret.coeff); + return ret; + } + + Entry VisitExpr_(const Sub* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + Entry ret; + ret.coeff = ZeroAwareGCD(a.coeff, b.coeff); + ret.base = BaseSimplify(a.base - b.base, ret.coeff); + return ret; + } + + Entry VisitExpr_(const Mul* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + // Simplification rule, x, y, z are in Z + // (p x + n) (q y + m) + // -> pq xy + pm x + qn y + mn + // -> pq z + pm x + qn y + mn + int64_t pq = a.coeff * b.coeff; + int64_t pm = a.coeff * b.base; + int64_t qn = a.base * b.coeff; + Entry ret; + ret.coeff = ZeroAwareGCD(pq, ZeroAwareGCD(pm, qn)); + ret.base = BaseSimplify(a.base * b.base, ret.coeff); + return ret; + } + + Entry DivByConst(const Expr& lhs, + int64_t val, + bool round_down) { + Entry a = VisitExpr(lhs); + CHECK_NE(val, 0); + if (a.coeff % val == 0) { + Entry ret; + if (a.base 
== 0) { + // a c x / c -> a x + ret.coeff = std::abs(a.coeff / val); + ret.base = 0; + return ret; + } + // positive division have a clear rounding mode. + // Only handle case where we clearly know we need to round down. + if (a.base > 0 && val > 0 && + (round_down || parent_->CanProveGreaterEqual(lhs, 0))) { + ret.coeff = a.coeff / val; + ret.base = a.base / val; + return ret; + } + } + return Everything(); + } + + Entry VisitExpr_(const Div* op) final { + Entry b = VisitExpr(op->b); + if (b.is_const()) { + return DivByConst(op->a, b.base, false); + } + return Everything(); + } + + Entry VisitExpr_(const Min* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + return Union(a, b); + } + + Entry VisitExpr_(const Max* op) final { + Entry a = VisitExpr(op->a); + Entry b = VisitExpr(op->b); + return Union(a, b); + } + + Entry VisitExpr_(const Select* op) final { + Entry a = VisitExpr(op->true_value); + Entry b = VisitExpr(op->false_value); + return Union(a, b); + } + + Entry VisitExpr_(const Call* op) final { + // only special handle >> which can be + // used for index calculation. + if (op->is_intrinsic(Call::shift_right)) { + return VisitRightShift(op); + } else { + return Everything(); + } + } + + Entry VisitExpr_(const Variable* op) final { + Var v = GetRef(op); + auto it = var_map_.find(v); + if (it != var_map_.end()) { + return it->second; + } else { + return Everything(); + } + } + + Entry VisitRightShift(const Call* op) { + Entry b = VisitExpr(op->args[1]); + // a c x / c -> a x + if (b.is_const()) { + return DivByConst(op->args[0], 1 << b.base, true); + } + return Everything(); + } + + private: + /*! \brief pointer to parent. */ + Analyzer* parent_{nullptr}; + // internal variable map + std::unordered_map var_map_; + /*! + * \brief Update var by intersecting entry with var's current set. + * \param var The variable. + * \param entry The entry to be updated. + * \return The recovery function of the scope. 
 + */ + std::function UpdateByIntersect(const Var& var, Entry entry) { + Entry old = Everything(); + auto it = var_map_.find(var); + if (it != var_map_.end()) { + old = it->second; + } + var_map_[var] = Intersect(old, entry); + // recovery function. + return [this, old, var]() { + var_map_[var] = old; + }; + } + /*! + * \brief Create union of two sets. + * \param a The left operand. + * \param b the right operand. + */ + static Entry Union(Entry a, Entry b) { + // {ax + y} \cup {bz + h} => {gcd(a, b) x + {y or h}} + int64_t coeff = ZeroAwareGCD(a.coeff, b.coeff); + if (coeff == 0) { + if (a.base == b.base) return a; + return Everything(); + } + int64_t base0 = a.base % coeff; + int64_t base1 = b.base % coeff; + Entry ret; + if (base0 == base1) { + ret.coeff = coeff; + ret.base = base0; + return ret; + } else { + ret.coeff = ZeroAwareGCD(ZeroAwareGCD(base0, base1), coeff); + ret.base = 0; + return ret; + } + } + /*! + * \brief Create intersect of two sets. + * \param a The left operand. + * \param b the right operand. + */ + static Entry Intersect(Entry a, Entry b) { + // simple rule for now: pick higher constraints. + // TODO(team-team): Use extended euclidean algorithm. + if (a.coeff == 0) return a; + if (b.coeff == 0) return b; + if (a.coeff >= b.coeff) return a; + return b; + } + /*! + * \brief Simplify base so that it is in [0, coeff) when coeff != 0. + * \param base The base value. + * \param coeff The coeff value. + * \return The simplified base. + */ + static int64_t BaseSimplify(int64_t base, int64_t coeff) { + if (coeff == 0) return base; + base = base % coeff; + if (base < 0) base += coeff; + return base; + } + /*! + * \brief Take GCD of a and b. + * \param a The first operand. + * \param b The second operand. + * \return The result. 
+ */ + static int64_t ZeroAwareGCD(int64_t a, int64_t b) { + if (a < 0) a = -a; + if (b < 0) b = -b; + if (a < b) std::swap(a, b); + if (b == 0) return a; + // perform GCD (greatest common divisor) + // ax + by = gcd(a, b) z if a != 0, b != 0 + while (a % b != 0) { + a = a % b; + std::swap(a, b); + } + return b; + } + /*! + * \brief return everything dtype can represent. + * \return Bound that represent everything dtype can represent. + */ + static Entry Everything() { + Entry ret; + ret.coeff = 1; ret.base = 0; + return ret; + } +}; + +ModularSet ModularSetAnalyzer::operator()(const Expr& expr) { + Entry ret = impl_->VisitExpr(expr); + return ModularSetNode::make(ret.coeff, ret.base); +} + +void ModularSetAnalyzer::Update(const Var& var, + const ModularSet& info, + bool override) { + impl_->Update(var, info, override); +} + +std::function ModularSetAnalyzer::EnterConstraint(const Expr& constraint) { + return impl_->EnterConstraint(constraint); +} + +ModularSetAnalyzer::ModularSetAnalyzer(Analyzer* parent) + : impl_(new Impl(parent)) { +} + +ModularSetAnalyzer::~ModularSetAnalyzer() { + delete impl_; +} + +} // namespace arith +} // namespace tvm diff --git a/src/arithmetic/pattern_match.h b/src/arithmetic/pattern_match.h index b4140d959759..50f2300dd4b7 100644 --- a/src/arithmetic/pattern_match.h +++ b/src/arithmetic/pattern_match.h @@ -25,6 +25,17 @@ * // The filled value is valid until the next call to Match. 
* return (max(x, y) + z).Eval(); * } + * + * tvm::Var tx, ty; + * arith::PVar c; + * arith::PVar v; + * // We can match integer and Var, both of which are + * // special case container of Expr + * CHECK((v * c).Match(tx * 3)); + * CHECK_EQ(c.Eval()->value, 3); + * // cannot match c to ty + * CHECK(!(v * c).Match(tx * ty)); + * * \endcode * * \note The pattern matcher is not threadsafe, @@ -109,6 +120,22 @@ class PEqualChecker { } }; +template<> +class PEqualChecker { + public: + bool operator()(const Integer& lhs, const Integer& rhs) const { + return lhs->value == rhs->value; + } +}; + +template<> +class PEqualChecker { + public: + bool operator()(const Var& lhs, const Var& rhs) const { + return lhs.same_as(rhs); + } +}; + /*! * \brief Pattern variable container. * @@ -123,7 +150,7 @@ template class PVar : public Pattern > { public: // Store PVars by reference in the expression. - using Nested = const PVar&; + using Nested = const PVar&; void InitMatch_() const { filled_ = false; @@ -139,12 +166,23 @@ class PVar : public Pattern > { } } + template::value>::type> + bool Match_(const NodeRefType& value) const { + if (const auto* ptr = value.template as()) { + return Match_(GetRef(ptr)); + } else { + return false; + } + } + T Eval() const { CHECK(filled_); return value_; } - private: + protected: /*! \brief The matched value */ mutable T value_; /*! \brief whether the variable has been filled */ @@ -171,6 +209,7 @@ class PConst : public Pattern > { T Eval() const { return value_; } + private: const T value_; }; diff --git a/src/codegen/codegen_common.h b/src/codegen/codegen_common.h deleted file mode 100644 index 5e76af12e583..000000000000 --- a/src/codegen/codegen_common.h +++ /dev/null @@ -1,59 +0,0 @@ -/*! - * Copyright (c) 2018 by Contributors - * \file codegen_common.h - * \brief Common utility for codegen. 
- */ -#ifndef TVM_CODEGEN_CODEGEN_COMMON_H_ -#define TVM_CODEGEN_CODEGEN_COMMON_H_ - -#include -#include "../arithmetic/compute_expr.h" - -namespace tvm { -namespace codegen { - -/*! - * \brief Visit AssertStmt recursively, update align_map from condition. - * \param op The AssertStmt - * \param align_map The alignmap - * \param fvisit The recursive visitor - * \tparam FVisit the recursive visitor - */ -template -inline void VisitAssert( - const ir::AssertStmt* op, - std::unordered_map* align_map, - FVisit fvisit) { - using namespace ir; - auto& align_map_ = *align_map; - // Detect useful invariant pattern and use them to visit child. - // Pattern: Var % const == 0 - // TODO(tqchen) merge these pattern to a generic scope info visitor. - if (const EQ* eq = op->condition.as()) { - const Mod* mod = eq->a.as(); - int64_t factor = 0, offset = 0; - if (mod && arith::GetConst(eq->b, &offset)) { - const Variable *var = mod->a.as(); - if (var && arith::GetConst(mod->b, &factor)) { - arith::ModularEntry old = align_map_[var]; - if (factor > old.coeff) { - arith::ModularEntry e; - e.coeff = static_cast(factor); - e.base = static_cast(offset); - // new alignment info, - align_map_[var] = e; - fvisit(op->body); - // restore old info - align_map_[var] = old; - return; - } - } - } - } - fvisit(op->body); -} - -} // namespace codegen -} // namespace tvm - -#endif // TVM_CODEGEN_CODEGEN_COMMON_H_ diff --git a/src/codegen/llvm/codegen_llvm.cc b/src/codegen/llvm/codegen_llvm.cc index f80bd9e8d436..6b69f97a66fe 100644 --- a/src/codegen/llvm/codegen_llvm.cc +++ b/src/codegen/llvm/codegen_llvm.cc @@ -9,7 +9,6 @@ #include #include "codegen_llvm.h" #include "codegen_cpu.h" -#include "../codegen_common.h" #include "../../pass/ir_util.h" #include "../../arithmetic/compute_expr.h" @@ -84,9 +83,9 @@ void CodeGenLLVM::AddFunction(const LoweredFunc& f) { void CodeGenLLVM::InitFuncState() { var_map_.clear(); alias_var_set_.clear(); - align_map_.clear(); alloc_storage_info_.clear(); 
volatile_buf_.clear(); + analyzer_.reset(new arith::Analyzer()); } void CodeGenLLVM::AddFunctionInternal(const LoweredFunc& f, bool ret_void) { @@ -381,14 +380,16 @@ void CodeGenLLVM::GetAlignment(Type t, *p_native_bits = native_vector_bits_; } - arith::ModularEntry me = arith::EvalModular(index, align_map_); + arith::ModularSet me = analyzer_->modular_set(index); + int64_t base = me->base; + int64_t coeff = me->coeff; int align_bits = t.bits(); while (align_bits < max_align_bits && - me.base % 2 == 0 && - me.coeff % 2 == 0) { - me.base = me.base / 2; - me.coeff = me.coeff / 2; + base % 2 == 0 && + coeff % 2 == 0) { + base = base / 2; + coeff = coeff / 2; align_bits *= 2; } if (align_bits < 8) { @@ -874,7 +875,7 @@ llvm::Value* CodeGenLLVM::VisitExpr_(const Select* op) { llvm::Value* CodeGenLLVM::VisitExpr_(const Let* op) { CHECK(!var_map_.count(op->var.get())); var_map_[op->var.get()] = MakeValue(op->value); - align_map_[op->var.get()] = EvalModular(op->value, align_map_); + analyzer_->Bind(op->var, op->value); return MakeValue(op->body); } @@ -998,6 +999,7 @@ void CodeGenLLVM::VisitStmt_(const Store* op) { void CodeGenLLVM::VisitStmt_(const For* op) { CHECK(is_zero(op->min)); + analyzer_->Bind(op->loop_var, Range::make_by_min_extent(op->min, op->extent)); if (op->for_type == ForType::Unrolled) { LOG(WARNING) << "Unroll hint get ignore at CodeGenLLVM backend, " << " consider set unroll_explicit=True"; @@ -1078,6 +1080,7 @@ void CodeGenLLVM::VisitStmt_(const AttrStmt* op) { if (iv->thread_tag.length() != 0) { if (!var_map_.count(iv->var.get())) { var_map_[iv->var.get()] = GetThreadIndex(iv); + analyzer_->Bind(iv->var, Range::make_by_min_extent(0, op->value)); } } } else if (op->attr_key == ir::attr::storage_scope) { @@ -1099,21 +1102,19 @@ void CodeGenLLVM::VisitStmt_(const AttrStmt* op) { } void CodeGenLLVM::VisitStmt_(const AssertStmt* op) { - VisitAssert(op, &align_map_, [this](const Stmt& body) { - this->VisitStmt(body); - }); + arith::ConstraintContext 
cctx(analyzer_.get(), op->condition); + this->VisitStmt(op->body); } void CodeGenLLVM::VisitStmt_(const LetStmt* op) { CHECK(!var_map_.count(op->var.get())); - CHECK(!align_map_.count(op->var.get())); if (op->var.type().is_handle()) { if (!is_restricted_) { alias_var_set_.insert(op->var.get()); } } var_map_[op->var.get()] = MakeValue(op->value); - align_map_[op->var.get()] = EvalModular(op->value, align_map_); + analyzer_->Bind(op->var, op->value); this->VisitStmt(op->body); } diff --git a/src/codegen/llvm/codegen_llvm.h b/src/codegen/llvm/codegen_llvm.h index 080306310370..ead1af883166 100644 --- a/src/codegen/llvm/codegen_llvm.h +++ b/src/codegen/llvm/codegen_llvm.h @@ -23,7 +23,6 @@ namespace codegen { using namespace ir; - /*! * \brief A base class to generate a LLVM. */ @@ -267,8 +266,8 @@ class CodeGenLLVM : std::unordered_map str_map_; // Whether current function is restricted bool is_restricted_{true}; - // The alignment information - std::unordered_map align_map_; + // The analyzer information + std::unique_ptr analyzer_; // set of var that are not restricted(can alias) std::unordered_set alias_var_set_; // set of volatile buffer. 
diff --git a/src/codegen/spirv/codegen_spirv.cc b/src/codegen/spirv/codegen_spirv.cc index 812fee4a114e..8b1cabd9e386 100644 --- a/src/codegen/spirv/codegen_spirv.cc +++ b/src/codegen/spirv/codegen_spirv.cc @@ -6,7 +6,7 @@ #include #include #include -#include "../codegen_common.h" +#include "../../arithmetic/compute_expr.h" #include "codegen_spirv.h" namespace tvm { @@ -66,7 +66,7 @@ void CodeGenSPIRV::InitFuncState() { std::fill(workgroup_size_, workgroup_size_ + 3, 1); var_map_.clear(); storage_info_.clear(); - align_map_.clear(); + analyzer_.reset(new arith::Analyzer()); builder_.reset(new spirv::IRBuilder()); builder_->InitHeader(); } @@ -217,7 +217,7 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Select* op) { spirv::Value CodeGenSPIRV::VisitExpr_(const Let* op) { CHECK(!var_map_.count(op->var.get())); var_map_[op->var.get()] = MakeValue(op->value); - align_map_[op->var.get()] = EvalModular(op->value, align_map_); + analyzer_->Bind(op->var, op->value); return MakeValue(op->body); } @@ -378,9 +378,9 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const Load* op) { if (const Ramp* ramp = op->index.as()) { if (is_one(ramp->stride)) { CHECK_EQ(ramp->lanes, op->type.lanes()); - arith::ModularEntry me = arith::EvalModular(ramp->base, align_map_); - CHECK((me.coeff % ramp->lanes) == 0 && - (me.base % ramp->lanes) == 0) + arith::ModularSet me = analyzer_->modular_set(ramp->base); + CHECK((me->coeff % ramp->lanes) == 0 && + (me->base % ramp->lanes) == 0) << "Only aligned vector access is allowed in SPIRV"; Expr vec_index = ir::Simplify( ramp->base / make_const(ramp->base.type(), ramp->lanes)); @@ -458,9 +458,9 @@ void CodeGenSPIRV::VisitStmt_(const Store* op) { if (const Ramp* ramp = op->index.as()) { if (is_one(ramp->stride)) { CHECK_EQ(ramp->lanes, op->value.type().lanes()); - arith::ModularEntry me = arith::EvalModular(ramp->base, align_map_); - CHECK((me.coeff % ramp->lanes) == 0 && - (me.base % ramp->lanes) == 0) + arith::ModularSet me = 
analyzer_->modular_set(ramp->base); + CHECK((me->coeff % ramp->lanes) == 0 && + (me->base % ramp->lanes) == 0) << "Only aligned vector access is allowed in SPIRV"; Expr vec_index = ir::Simplify( ramp->base / make_const(ramp->base.type(), ramp->lanes)); @@ -477,6 +477,7 @@ void CodeGenSPIRV::VisitStmt_(const Store* op) { void CodeGenSPIRV::VisitStmt_(const For* op) { CHECK(is_zero(op->min)); + analyzer_->Bind(op->loop_var, Range::make_by_min_extent(op->min, op->extent)); spirv::Value init_value = MakeValue(op->min); spirv::Value extent_value = MakeValue(op->extent); // Must get init label after making value(to make sure they are correct) @@ -589,6 +590,7 @@ void CodeGenSPIRV::VisitStmt_(const AttrStmt* op) { if (iv->thread_tag.length() != 0) { if (!var_map_.count(iv->var.get())) { var_map_[iv->var.get()] = GetThreadIndex(iv, op->value); + analyzer_->Bind(iv->var, Range::make_by_min_extent(0, op->value)); } } } else if (op->attr_key == ir::attr::storage_scope) { @@ -605,17 +607,15 @@ void CodeGenSPIRV::VisitStmt_(const AttrStmt* op) { } void CodeGenSPIRV::VisitStmt_(const AssertStmt* op) { - VisitAssert(op, &align_map_, [this](const Stmt& body) { - this->VisitStmt(body); - }); + arith::ConstraintContext cctx(analyzer_.get(), op->condition); + this->VisitStmt(op->body); } void CodeGenSPIRV::VisitStmt_(const LetStmt* op) { CHECK(!var_map_.count(op->var.get())); - CHECK(!align_map_.count(op->var.get())); CHECK(!op->var.type().is_handle()); var_map_[op->var.get()] = MakeValue(op->value); - align_map_[op->var.get()] = EvalModular(op->value, align_map_); + analyzer_->Bind(op->var, op->value); this->VisitStmt(op->body); } diff --git a/src/codegen/spirv/codegen_spirv.h b/src/codegen/spirv/codegen_spirv.h index 6a43182f7f2e..94cf761b9f84 100644 --- a/src/codegen/spirv/codegen_spirv.h +++ b/src/codegen/spirv/codegen_spirv.h @@ -122,8 +122,8 @@ class CodeGenSPIRV: std::unordered_map storage_info_; // The definition of local variable. 
std::unordered_map var_map_; - // The alignment information - std::unordered_map align_map_; + // The analyzer. + std::unique_ptr analyzer_; }; } // namespace codegen diff --git a/src/pass/storage_rewrite.cc b/src/pass/storage_rewrite.cc index 9ba9dcde63c9..3f7fd9512eb2 100644 --- a/src/pass/storage_rewrite.cc +++ b/src/pass/storage_rewrite.cc @@ -936,10 +936,8 @@ class VectorAllocRewriter : public IRMutator { tvec[0].lanes() != op->type.lanes()) { int factor = tvec[0].lanes() / op->type.lanes(); Array extents = op->extents; - arith::ModularEntry me = EvalModular( - extents[extents.size() - 1], - std::unordered_map()); - if (me.base % factor == 0 && me.coeff % factor == 0) { + arith::ModularSet me = analyzer_.modular_set(extents[extents.size() - 1]); + if (me->base % factor == 0 && me->coeff % factor == 0) { extents.Set(extents.size() - 1, extents[extents.size() - 1] / make_const(extents[0].type(), factor)); return Allocate::make( @@ -959,6 +957,8 @@ class VectorAllocRewriter : public IRMutator { // Internal access map std::unordered_map > acc_map_; + // internal analyzer + arith::Analyzer analyzer_; }; diff --git a/tests/cpp/pattern_match_test.cc b/tests/cpp/pattern_match_test.cc index cb746e65660b..1945339a259c 100644 --- a/tests/cpp/pattern_match_test.cc +++ b/tests/cpp/pattern_match_test.cc @@ -107,6 +107,23 @@ TEST(Pattern, Basic) { } } +TEST(Pattern, Integer) { + using namespace tvm; + tvm::Var tx, ty; + arith::PVar c; + arith::PVar v; + { + // We can match integer and Var, both of which are + // special case container of Expr + CHECK((v * c).Match(tx * 3)); + CHECK_EQ(c.Eval()->value, 3); + } + // cannot match c to ty + CHECK(!(v * c).Match(tx * ty)); + // cannot match tx + 1 to v + CHECK(!(v * c).Match((tx + 1) * 3)); +} + int main(int argc, char ** argv) { testing::InitGoogleTest(&argc, argv); testing::FLAGS_gtest_death_test_style = "threadsafe"; diff --git a/tests/cpp/unittest.mk b/tests/cpp/unittest.mk deleted file mode 100644 index 
b810d63ee4b1..000000000000 --- a/tests/cpp/unittest.mk +++ /dev/null @@ -1,12 +0,0 @@ -GTEST_LIB=$(GTEST_PATH)/lib/ -GTEST_INC=$(GTEST_PATH)/include/ - -TEST_SRC = $(wildcard tests/cpp/*_test.cc) -TEST = $(patsubst tests/cpp/%_test.cc, tests/cpp/%_test, $(TEST_SRC)) - -tests/cpp/%_test: tests/cpp/%_test.cc lib/libtvm.so - $(CXX) -std=c++11 $(CFLAGS) -MM -MT tests/cpp/$* $< >tests/cpp/$*.d - $(CXX) -std=c++11 $(CFLAGS) -I$(GTEST_INC) -o $@ $(filter %.cc %.a, $^) \ - -L$(GTEST_LIB) $(LDFLAGS) -lgtest -Llib -ltvm - --include tests/cpp/*.d diff --git a/tests/python/unittest/test_arith_const_int_bound.py b/tests/python/unittest/test_arith_const_int_bound.py new file mode 100644 index 000000000000..968692208f5d --- /dev/null +++ b/tests/python/unittest/test_arith_const_int_bound.py @@ -0,0 +1,219 @@ +import tvm + +def test_dtype_bound(): + analyzer = tvm.arith.Analyzer() + + x = tvm.var("x", dtype="int64") + bd = analyzer.const_int_bound(x) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == bd.POS_INF + + x = tvm.var("x", dtype="int8") + bd = analyzer.const_int_bound(x) + assert bd.min_value == -128 + assert bd.max_value == 127 + + x = tvm.var("x", dtype="uint8") + bd = analyzer.const_int_bound(x) + assert bd.min_value == 0 + assert bd.max_value == 255 + + +def test_cast_bound(): + analyzer = tvm.arith.Analyzer() + x = tvm.var("x", dtype="int8") + bd = analyzer.const_int_bound((x % 3).astype("uint32")) + assert bd.min_value == 0 + assert bd.max_value == 2 + + bd = analyzer.const_int_bound( + (x % 3).astype("float32").astype("int32")) + assert bd.min_value == -2 + assert bd.max_value == 2 + + +def test_add_sub_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x", "int64"), tvm.var("y", "int64") + bd = analyzer.const_int_bound(x + y) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == bd.POS_INF + + analyzer.update(x, tvm.arith.ConstIntBound(0, 4)) + analyzer.update(y, tvm.arith.ConstIntBound(1, 10)) + bd = analyzer.const_int_bound(x + y) + 
assert bd.min_value == 1 + assert bd.max_value == 14 + + bd = analyzer.const_int_bound(x - y) + assert bd.min_value == -10 + assert bd.max_value == 3 + + analyzer.update(x, tvm.arith.ConstIntBound(0, bd.POS_INF), override=True) + bd = analyzer.const_int_bound(x - y) + assert bd.min_value == -10 + assert bd.max_value == bd.POS_INF + + bd = analyzer.const_int_bound(1 - x) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == 1 + + +def test_mul_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-2, 4)) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) + bd = analyzer.const_int_bound(x * y + 20) + assert bd.min_value == 0 + assert bd.max_value == 60 + + analyzer.update(x, tvm.arith.ConstIntBound(-3, 4), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(-8, 2), override=True) + bd = analyzer.const_int_bound(x * y) + assert bd.min_value == -32 + assert bd.max_value == 24 + + analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, 4), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(-8, 2), override=True) + bd = analyzer.const_int_bound(x * y) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == bd.POS_INF + + +def test_div_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 4)) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) + bd = analyzer.const_int_bound(x / y) + assert bd.min_value == -2 + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 4), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(-2, 0), override=True) + bd = analyzer.const_int_bound(x / y) + assert bd.min_value == -4 + assert bd.max_value == 9 + + analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, 4), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(-2, 1), override=True) + bd = analyzer.const_int_bound(x / y) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == bd.POS_INF + + 
+def test_mod_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 4)) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) + bd = analyzer.const_int_bound(x % y) + assert bd.min_value == -9 + assert bd.max_value == 4 + + analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, bd.POS_INF), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) + bd = analyzer.const_int_bound(x % y) + assert bd.min_value == -9 + assert bd.max_value == 9 + + analyzer.update(x, tvm.arith.ConstIntBound(1, bd.POS_INF), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) + bd = analyzer.const_int_bound(x % y) + assert bd.min_value == 0 + assert bd.max_value == 9 + + +def test_min_max_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 11)) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) + bd = analyzer.const_int_bound(tvm.min(x, y)) + assert bd.min_value == -9 + assert bd.max_value == 10 + + analyzer.update(x, tvm.arith.ConstIntBound(bd.NEG_INF, bd.POS_INF), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) + bd = analyzer.const_int_bound(tvm.min(x, y)) + assert bd.min_value == bd.NEG_INF + assert bd.max_value == 10 + + bd = analyzer.const_int_bound(tvm.max(x, y)) + assert bd.min_value == 4 + assert bd.max_value == bd.POS_INF + + analyzer.update(x, tvm.arith.ConstIntBound(1, bd.POS_INF), override=True) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10), override=True) + bd = analyzer.const_int_bound(tvm.max(x, y)) + assert bd.min_value == 4 + assert bd.max_value == bd.POS_INF + + +def test_select_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 11)) + analyzer.update(y, tvm.arith.ConstIntBound(4, 10)) + + bd = analyzer.const_int_bound( + tvm.expr.Select(x > 1, 
(y < 0).astype("int32"), y + 1)) + assert bd.min_value == 0 + assert bd.max_value == 11 + + +def test_shift_and_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + + analyzer.update(x, tvm.arith.ConstIntBound(-9, 11)) + analyzer.update(y, tvm.arith.ConstIntBound(2, 10)) + + bd = analyzer.const_int_bound(x >> y) + assert bd.min_value == -3 + assert bd.max_value == 2 + + bd = analyzer.const_int_bound(x & y) + assert bd.min_value == 0 + assert bd.max_value == 10 + + analyzer.update(x, tvm.arith.ConstIntBound(10, 11), override=True) + bd = analyzer.const_int_bound(x & y) + assert bd.min_value == 0 + assert bd.max_value == 10 + + +def test_mix_index_bound(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + analyzer.update(x, tvm.arith.ConstIntBound(0, 24 - 1)) + analyzer.update(y, tvm.arith.ConstIntBound(0, 3 - 1)) + bd = analyzer.const_int_bound((x % 8) + (x / 8) * 8) + assert bd.min_value == 0 + assert bd.max_value == 24 - 1 + + bd = analyzer.const_int_bound(y + x * 3) + assert bd.min_value == 0 + assert bd.max_value == 24 * 3 - 1 + + bd = analyzer.const_int_bound((x % 7) + (x / 7) * 7) + assert bd.min_value == 0 + assert bd.max_value == (23 // 7) * 7 + 6 + + +if __name__ == "__main__": + test_dtype_bound() + test_cast_bound() + test_add_sub_bound() + test_mul_bound() + test_div_bound() + test_mod_bound() + test_min_max_bound() + test_select_bound() + test_shift_and_bound() + test_mix_index_bound() diff --git a/tests/python/unittest/test_arith_modular.py b/tests/python/unittest/test_arith_modular.py deleted file mode 100644 index 58b5d3115d5e..000000000000 --- a/tests/python/unittest/test_arith_modular.py +++ /dev/null @@ -1,32 +0,0 @@ -import tvm - -def test_basic(): - a = tvm.var() - b = tvm.var() - m = tvm.arith.EvalModular(a * 4 + b * 6 + 7) - assert m.coeff == 2 - assert m.base == 1 - - m = tvm.arith.EvalModular((a * 4 + 1) * (b * 8 + 3)) - assert m.coeff == 4 - assert m.base == 3 - - m = 
tvm.arith.EvalModular((a * 4 + 1) / (b * 8 + 3)) - assert m.coeff == 1 - assert m.base == 0 - - m = tvm.arith.EvalModular((a * 4 + 1) * (b * 8 / 4)) - assert m.coeff == 2 - assert m.base == 0 - - m = tvm.arith.EvalModular((a * 12 + 1) - (b * 3 * 7 + 2)) - assert m.coeff == 3 - assert m.base == 2 - - - m = tvm.arith.EvalModular(a * 12 + tvm.min(b * 3 * 7, 2)) - assert m.coeff == 1 - assert m.base == 0 - -if __name__ == "__main__": - test_basic() diff --git a/tests/python/unittest/test_arith_modular_set.py b/tests/python/unittest/test_arith_modular_set.py new file mode 100644 index 000000000000..06ae5197b974 --- /dev/null +++ b/tests/python/unittest/test_arith_modular_set.py @@ -0,0 +1,128 @@ +import tvm + + +def test_cast(): + analyzer = tvm.arith.Analyzer() + x = tvm.var("x", dtype="int8") + m = analyzer.modular_set((x * 3).astype("uint32")) + assert m.coeff == 3 + assert m.base == 0 + m = analyzer.modular_set( + (x * 3 + 1).astype("float32").astype("int32")) + assert m.coeff == 3 + assert m.base == 1 + + +def test_add_sub(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x", "int64"), tvm.var("y", "int64") + m = analyzer.modular_set(x * 6 + y * 4) + assert m.coeff == 2 + assert m.base == 0 + + analyzer.bind(y, x * 4 + 1) + m = analyzer.modular_set(1 - y) + assert m.coeff == 4 + assert m.base == 0 + + +def test_mul(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + m = analyzer.modular_set((x * 4 + 2) * (y * 6 + 1)) + assert m.coeff == 4 + assert m.base == 2 + + +def test_div_shift(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + # not sure if x is non-negative + m = analyzer.modular_set((x * 4 + 2) / 2) + assert m.coeff == 1 + assert m.base == 0 + # right shift always round down so it is fine + m = analyzer.modular_set((x * 4 + 2) >> 1) + assert m.coeff == 2 + assert m.base == 1 + # x is non-negative + analyzer.update(x, tvm.arith.ConstIntBound(0, 100)) + m = analyzer.modular_set((x * 4 + 2) / 2) + assert 
m.coeff == 2 + assert m.base == 1 + + +def test_min_max_select(): + analyzer = tvm.arith.Analyzer() + x, y = tvm.var("x"), tvm.var("y") + m = analyzer.modular_set(tvm.min(x * 3, y * 9)) + assert m.coeff == 3 + assert m.base == 0 + + m = analyzer.modular_set(tvm.max(x * 3 + 1, y * 9 + 4)) + assert m.coeff == 3 + assert m.base == 1 + + m = analyzer.modular_set(tvm.expr.Select(x > 0, x * 3 + 1, y * 9 + 2)) + assert m.coeff == 1 + assert m.base == 0 + + +def test_mix_index(): + a = tvm.var("a") + b = tvm.var("b") + analyzer = tvm.arith.Analyzer() + m = analyzer.modular_set(a * 4 + b * 6 + 7) + assert m.coeff == 2 + assert m.base == 1 + + m = analyzer.modular_set((a * 4 + 1) * (b * 8 + 3)) + assert m.coeff == 4 + assert m.base == 3 + + m = analyzer.modular_set((a * 4 + 1) / (b * 8 + 3)) + assert m.coeff == 1 + assert m.base == 0 + + m = analyzer.modular_set((a * 4 + 1) * (b * 8 / 4)) + assert m.coeff == 2 + assert m.base == 0 + + m = analyzer.modular_set((a * 12 + 1) - (b * 3 * 7 + 2)) + assert m.coeff == 3 + assert m.base == 2 + + m = analyzer.modular_set(a * 12 + tvm.min(b * 3 * 7, 2)) + assert m.coeff == 1 + assert m.base == 0 + + +def test_constraint_scope(): + a = tvm.var("a") + b = tvm.var("b") + analyzer = tvm.arith.Analyzer() + with analyzer.constraint_scope(b % 4 == 2): + m = analyzer.modular_set(b + 1) + assert m.coeff == 4 + assert m.base == 3 + with analyzer.constraint_scope(a % 2 == 1): + m = analyzer.modular_set(b + a * 2) + assert m.coeff == 4 + assert m.base == 0 + m = analyzer.modular_set(b + a * 2) + assert m.coeff == 2 + assert m.base == 0 + + m = analyzer.modular_set(b + 1) + assert m.coeff == 1 + assert m.base == 0 + + +if __name__ == "__main__": + test_cast() + test_add_sub() + test_mul() + test_div_shift() + test_min_max_select() + test_mix_index() + test_constraint_scope() From f8b3ccf33d873d91e09fe7e166515f164139ea34 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 2 Mar 2019 19:17:01 -0800 Subject: [PATCH 57/93] [EXPR] 
ir_operator.h->expr_operator.h Centralize const folder logic (#2719) --- include/tvm/buffer.h | 2 +- include/tvm/data_layout.h | 2 +- .../tvm/{ir_operator.h => expr_operator.h} | 8 +- include/tvm/operation.h | 2 +- include/tvm/tensor.h | 2 +- include/tvm/tvm.h | 2 +- src/api/api_ir.cc | 3 +- src/arithmetic/const_fold.h | 289 ++++++++++++++++++ src/arithmetic/modular_set.cc | 2 +- src/lang/expr.cc | 2 +- src/lang/{ir_operator.cc => expr_operator.cc} | 260 ++++------------ src/op/hybrid_op.cc | 2 +- src/pass/ir_util.h | 2 +- src/pass/storage_flatten.cc | 2 +- src/relay/op/nn/pad.cc | 2 +- src/relay/op/tensor/transform.cc | 2 +- src/relay/pass/fuse_ops.cc | 2 +- tests/cpp/ir_mutator_test.cc | 2 +- 18 files changed, 375 insertions(+), 213 deletions(-) rename include/tvm/{ir_operator.h => expr_operator.h} (99%) create mode 100644 src/arithmetic/const_fold.h rename src/lang/{ir_operator.cc => expr_operator.cc} (58%) diff --git a/include/tvm/buffer.h b/include/tvm/buffer.h index 2c72db169a2d..d95332c245b7 100644 --- a/include/tvm/buffer.h +++ b/include/tvm/buffer.h @@ -10,7 +10,7 @@ #include "base.h" #include "expr.h" -#include "ir_operator.h" +#include "expr_operator.h" #include "tvm/node/container.h" namespace tvm { diff --git a/include/tvm/data_layout.h b/include/tvm/data_layout.h index 99aebc3a1c31..3f5cb9a29546 100644 --- a/include/tvm/data_layout.h +++ b/include/tvm/data_layout.h @@ -16,7 +16,7 @@ #include #include -#include "ir_operator.h" +#include "expr_operator.h" namespace tvm { diff --git a/include/tvm/ir_operator.h b/include/tvm/expr_operator.h similarity index 99% rename from include/tvm/ir_operator.h rename to include/tvm/expr_operator.h index c2cdc5e7a923..c4d2d555f3a3 100644 --- a/include/tvm/ir_operator.h +++ b/include/tvm/expr_operator.h @@ -1,13 +1,13 @@ /*! * Copyright (c) 2018 by Contributors - * \file tvm/ir_operator.h + * \file tvm/expr_operator.h * \brief Common operators defined for Expr. 
* * \note Most of the operator defined here perform simple constant folding * when the type is int32 or int64 for simplifying the index expressions. */ -#ifndef TVM_IR_OPERATOR_H_ -#define TVM_IR_OPERATOR_H_ +#ifndef TVM_EXPR_OPERATOR_H_ +#define TVM_EXPR_OPERATOR_H_ #include #include @@ -617,4 +617,4 @@ TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(operator&&); TVM_DEFINE_LOGICAL_OP_CONST_VAL_OVERLOAD(operator||); } // namespace tvm -#endif // TVM_IR_OPERATOR_H_ +#endif // TVM_EXPR_OPERATOR_H_ diff --git a/include/tvm/operation.h b/include/tvm/operation.h index 5e1f1fc73917..eafce72375cf 100644 --- a/include/tvm/operation.h +++ b/include/tvm/operation.h @@ -10,7 +10,7 @@ #include #include #include "expr.h" -#include "ir_operator.h" +#include "expr_operator.h" #include "tensor.h" #include "schedule.h" #include "arithmetic.h" diff --git a/include/tvm/tensor.h b/include/tvm/tensor.h index 16f7363a9e73..87ced8b3cb2a 100644 --- a/include/tvm/tensor.h +++ b/include/tvm/tensor.h @@ -14,7 +14,7 @@ #include "base.h" #include "expr.h" -#include "ir_operator.h" +#include "expr_operator.h" #include "arithmetic.h" namespace tvm { diff --git a/include/tvm/tvm.h b/include/tvm/tvm.h index 645c68357f13..5f81cb52fa31 100644 --- a/include/tvm/tvm.h +++ b/include/tvm/tvm.h @@ -8,7 +8,7 @@ #include "base.h" #include "expr.h" -#include "ir_operator.h" +#include "expr_operator.h" #include "tensor.h" #include "operation.h" #include "packed_func_ext.h" diff --git a/src/api/api_ir.cc b/src/api/api_ir.cc index fa2d52e9fe85..a4c7842ffe90 100644 --- a/src/api/api_ir.cc +++ b/src/api/api_ir.cc @@ -5,9 +5,8 @@ */ #include #include -#include #include -#include +#include namespace tvm { namespace ir { diff --git a/src/arithmetic/const_fold.h b/src/arithmetic/const_fold.h new file mode 100644 index 000000000000..91613867115b --- /dev/null +++ b/src/arithmetic/const_fold.h @@ -0,0 +1,289 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file const_fold.h + * \brief Centralized location for constant folding. + */ +#ifndef TVM_ARITHMETIC_CONST_FOLD_H_ +#define TVM_ARITHMETIC_CONST_FOLD_H_ + +#include +#include + +namespace tvm { +namespace arith { + +/*! + * \brief Try to run binary compute with constant folding. + * + * \param a The left operand. + * \param b The right operand. + * \tparam Op The operator type. + * + * \note a and b Must already matched data types with each other. + * \return nullptr if constant fold fails, otherwise return folded result. + */ +template +inline Expr TryConstFold(Expr a, Expr b); + +/*! + * \brief Try to run unary compute with constant folding. + * + * \param a The left operand. + * \tparam Op The operator type. + * + * \note a and b Must already matched data types with each other. + * \return nullptr if constant fold fails, otherwise return folded result. + */ +template +inline Expr TryConstFold(Expr a); + +/*! + * \brief Check whether type is used to represent index. + * + * Index types are frequently used in shape computation + * and need to be aggressively constant-folded. + * + * \param type The type to represent index. + * \return the checked result. + */ +inline bool IsIndexType(const Type& type) { + return type.is_int() && type.lanes() == 1 && + (type.bits() == 32 || type.bits() == 64); +} + + +#define TVM_ARITH_CONST_PROPAGATION(BODY) \ + using ir::IntImm; \ + using ir::UIntImm; \ + using ir::FloatImm; \ + const IntImm* pa = a.as(); \ + const IntImm* pb = b.as(); \ + const FloatImm* fa = a.as(); \ + const FloatImm* fb = b.as(); \ + BODY; + + +#define TVM_INDEX_CONST_PROPAGATION(BODY) \ + using ir::IntImm; \ + using ir::UIntImm; \ + const IntImm* pa = a.as(); \ + const IntImm* pb = b.as(); \ + const Type& ta = a.type(); \ + const Type& tb = b.type(); \ + if (arith::IsIndexType(ta) && arith::IsIndexType(tb)) { \ + BODY; \ + } \ + + +// specialization of constant folders. 
+template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + if (pa && pb) return IntImm::make(rtype, pa->value + pb->value); + if (pa && pa->value == 0) return b; + if (pb && pb->value == 0) return a; + if (fa && fb) return FloatImm::make(rtype, fa->value + fb->value); + if (fa && fa->value == 0) return b; + if (fb && fb->value == 0) return a; + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + if (pa && pb) return IntImm::make(rtype, pa->value - pb->value); + if (pb && pb->value == 0) return a; + if (fa && fb) return FloatImm::make(rtype, fa->value - fb->value); + if (fb && fb->value == 0) return a; + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + if (pa && pb) return IntImm::make(rtype, pa->value * pb->value); + if (pa) { + if (pa->value == 1) return b; + if (pa->value == 0) return a; + } + if (pb) { + if (pb->value == 1) return a; + if (pb->value == 0) return b; + } + if (fa && fb) return FloatImm::make(rtype, fa->value * fb->value); + if (fa) { + if (fa->value == 1) return b; + if (fa->value == 0) return a; + } + if (fb) { + if (fb->value == 1) return a; + if (fb->value == 0) return b; + } + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + // due to division and mod can have different modes + // only constant fold positive number where rule is fixed. 
+ if (pa && pb && pa->value >= 0 && pb->value > 0) { + return IntImm::make(rtype, pa->value / pb->value); + } + if (pa) { + if (pa->value == 0) return a; + } + if (pb) { + if (pb->value == 1) return a; + CHECK_NE(pb->value, 0) << "Divide by zero"; + } + if (fa && fb && fb->value != 0) { + return FloatImm::make(rtype, fa->value / fb->value); + } + if (fa && fa->value == 0) return a; + if (fb) { + if (fb->value == 1) return a; + CHECK_NE(fb->value, 0) << "Divide by zero"; + } + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_INDEX_CONST_PROPAGATION({ + const Type& rtype = a.type(); + // due to division and mod can have different modes + // only constant fold positive number where rule is fixed. + if (pa && pb && pa->value >= 0 && pb->value > 0) { + return IntImm::make(rtype, pa->value % pb->value); + } + if (pa) { + if (pa->value == 0) return a; + } + if (pb) { + if (pb->value == 1) return make_zero(rtype); + CHECK_NE(pb->value, 0) << "Divide by zero"; + } + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + if (pa && pb) return IntImm::make(rtype, std::min(pa->value, pb->value)); + if (fa && fb) return FloatImm::make(rtype, std::min(fa->value, fb->value)); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + const Type& rtype = a.type(); + if (pa && pb) return IntImm::make(rtype, std::max(pa->value, pb->value)); + if (fa && fb) return FloatImm::make(rtype, std::max(fa->value, fb->value)); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value > pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value > fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && 
pb) return UIntImm::make(UInt(1), pa->value >= pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value >= fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value < pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value < fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value <= pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value <= fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value == pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value == fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + TVM_ARITH_CONST_PROPAGATION({ + if (pa && pb) return UIntImm::make(UInt(1), pa->value != pb->value); + if (fa && fb) return UIntImm::make(UInt(1), fa->value != fb->value); + }); + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + using ir::UIntImm; + const UIntImm* pa = a.as(); + const UIntImm* pb = b.as(); + if (pa && pa->value) return b; + if (pa && !pa->value) return a; + if (pb && pb->value) return a; + if (pb && !pb->value) return b; + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a, Expr b) { + using ir::UIntImm; + const UIntImm* pa = a.as(); + const UIntImm* pb = b.as(); + if (pa && pa->value) return a; + if (pa && !pa->value) return b; + if (pb && pb->value) return b; + if (pb && !pb->value) return a; + return Expr(); +} + +template<> +inline Expr TryConstFold(Expr a) { + using ir::UIntImm; + const UIntImm* pa = a.as(); + if (pa) { + return UIntImm::make(UInt(1), !(pa->value)); + } + return Expr(); +} + +} // namespace arith +} // namespace tvm 
+#endif // TVM_ARITHMETIC_CONST_FOLD_H_ diff --git a/src/arithmetic/modular_set.cc b/src/arithmetic/modular_set.cc index 8da6e91fc7fa..8112beef7551 100644 --- a/src/arithmetic/modular_set.cc +++ b/src/arithmetic/modular_set.cc @@ -4,7 +4,7 @@ * \brief Modular set analysis */ #include -#include +#include #include #include #include "pattern_match.h" diff --git a/src/lang/expr.cc b/src/lang/expr.cc index 7ac0e372371c..3bf8fc9191fb 100644 --- a/src/lang/expr.cc +++ b/src/lang/expr.cc @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/lang/ir_operator.cc b/src/lang/expr_operator.cc similarity index 58% rename from src/lang/ir_operator.cc rename to src/lang/expr_operator.cc index beceb094c620..edbe0be3d5c5 100644 --- a/src/lang/ir_operator.cc +++ b/src/lang/expr_operator.cc @@ -1,28 +1,16 @@ /*! * Copyright (c) 2017 by Contributors - * \file ir_operator.cc + * \file expr_operator.cc */ #include #include -#include +#include #include +// Centralized header for constant folders. +#include "../arithmetic/const_fold.h" namespace tvm { -/*! - * \brief Check whether type is used to represent index. - * - * Index types are frequently used in shape computation - * and need to be aggressively constant-folded. - * - * \param type The type to represent index. - * \return the checked result. 
- */ -inline bool IsIndexType(const Type& type) { - return type.is_int() && type.lanes() == 1 && - (type.bits() == 32 || type.bits() == 64); -} - // simple cast that only checks if type matches and cast inline Expr SimpleCast(const Type& t, Expr value) { if (value.type() == t) return value; @@ -135,45 +123,14 @@ Expr reinterpret(const Type& t, Expr value) { return ir::Call::make(t, ir::Call::reinterpret, { value }, ir::Call::PureIntrinsic); } -#define TVM_INDEX_CONST_PROPAGATION(BODY) \ - using ir::IntImm; \ - using ir::UIntImm; \ - const IntImm* pa = a.as(); \ - const IntImm* pb = b.as(); \ - const Type& ta = a.type(); \ - const Type& tb = b.type(); \ - if (IsIndexType(ta) && IsIndexType(tb)) { \ - BODY; \ - } \ - BinaryOpMatchTypes(a, b); - -#define TVM_ARITH_CONST_PROPAGATION(BODY) \ - using ir::IntImm; \ - using ir::UIntImm; \ - using ir::FloatImm; \ - BinaryOpMatchTypes(a, b); \ - const IntImm* pa = a.as(); \ - const IntImm* pb = b.as(); \ - const FloatImm* fa = a.as(); \ - const FloatImm* fb = b.as(); \ - BODY; - - Expr operator+(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? ta : tb; - if (pa && pb) return IntImm::make(rtype, pa->value + pb->value); - if (pa && pa->value == 0) return SimpleCast(rtype, b); - if (pb && pb->value == 0) return SimpleCast(rtype, a); - if (fa && fb) return FloatImm::make(rtype, fa->value + fb->value); - if (fa && fa->value == 0) return SimpleCast(rtype, b); - if (fb && fb->value == 0) return SimpleCast(rtype, a); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Add::make(a, b); } +// negation Expr operator-(Expr a) { using ir::IntImm; using ir::FloatImm; @@ -185,114 +142,44 @@ Expr operator-(Expr a) { } Expr operator-(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? 
ta : tb; - if (pa && pb) return IntImm::make(rtype, pa->value - pb->value); - if (pb && pb->value == 0) return SimpleCast(rtype, a); - if (fa && fb) return FloatImm::make(rtype, fa->value - fb->value); - if (fb && fb->value == 0) return SimpleCast(rtype, a); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Sub::make(a, b); } Expr operator*(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? ta : tb; - if (pa && pb) return IntImm::make(rtype, pa->value * pb->value); - if (pa) { - if (pa->value == 1) return SimpleCast(rtype, b); - if (pa->value == 0) return SimpleCast(rtype, a); - } - if (pb) { - if (pb->value == 1) return SimpleCast(rtype, a); - if (pb->value == 0) return SimpleCast(rtype, b); - } - if (fa && fb) return FloatImm::make(rtype, fa->value * fb->value); - if (fa) { - if (fa->value == 1) return SimpleCast(rtype, b); - if (fa->value == 0) return SimpleCast(rtype, a); - } - if (fb) { - if (fb->value == 1) return SimpleCast(rtype, a); - if (fb->value == 0) return SimpleCast(rtype, b); - } - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Mul::make(a, b); } Expr operator/(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? ta : tb; - // due to division and mod can have different modes - // only constant fold positive number where rule is fixed. 
- if (pa && pb && pa->value >= 0 && pb->value > 0) { - return IntImm::make(rtype, pa->value / pb->value); - } - if (pa) { - if (pa->value == 0) return SimpleCast(rtype, a); - } - if (pb) { - if (pb->value == 1) return SimpleCast(rtype, a); - CHECK_NE(pb->value, 0) << "Divide by zero"; - } - if (fa && fb && fb->value != 0) { - return FloatImm::make(rtype, fa->value / fb->value); - } - if (fa && fa->value == 0) { - return SimpleCast(rtype, a); - } - if (fb) { - if (fb->value == 1) return SimpleCast(rtype, a); - CHECK_NE(fb->value, 0) << "Divide by zero"; - } - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Div::make(a, b); } Expr operator%(Expr a, Expr b) { - TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? ta : tb; - // due to division and mod can have different modes - // only constant fold positive number where rule is fixed. - if (pa && pb && pa->value >= 0 && pb->value > 0) { - return IntImm::make(rtype, pa->value % pb->value); - } - if (pa) { - if (pa->value == 0) return SimpleCast(rtype, a); - } - if (pb) { - if (pb->value == 1) return make_zero(rtype); - CHECK_NE(pb->value, 0) << "Divide by zero"; - } - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Mod::make(a, b); } Expr min(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? ta : tb; - if (pa && pb) return IntImm::make(rtype, std::min(pa->value, pb->value)); - if (fa && fb) return FloatImm::make(rtype, std::min(fa->value, fb->value)); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Min::make(a, b); } Expr max(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - const Type& ta = a.type(); - const Type& tb = b.type(); - Type rtype = ta.bits() >= tb.bits() ? 
ta : tb; - if (pa && pb) return IntImm::make(rtype, std::max(pa->value, pb->value)); - if (fa && fb) return FloatImm::make(rtype, std::max(fa->value, fb->value)); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Max::make(a, b); } @@ -328,129 +215,116 @@ Expr likely(Expr cond) { } Expr operator>(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value > pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value > fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::GT::make(a, b); } Expr operator>=(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value >= pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value >= fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::GE::make(a, b); } Expr operator<(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value < pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value < fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::LT::make(a, b); } Expr operator<=(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value <= pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value <= fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::LE::make(a, b); } Expr operator==(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value == pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value == fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, 
b); + if (ret.defined()) return ret; return ir::EQ::make(a, b); } Expr operator!=(Expr a, Expr b) { - TVM_ARITH_CONST_PROPAGATION({ - if (pa && pb) return UIntImm::make(UInt(1), pa->value != pb->value); - if (fa && fb) return UIntImm::make(UInt(1), fa->value != fb->value); - }); + BinaryOpMatchTypes(a, b); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::NE::make(a, b); } Expr operator&&(Expr a, Expr b) { - using ir::UIntImm; - if (a.type().is_bool() && b.type().is_bool()) { - const UIntImm* pa = a.as(); - const UIntImm* pb = b.as(); - if (pa && pa->value) return b; - if (pa && !pa->value) return a; - if (pb && pb->value) return a; - if (pb && !pb->value) return b; - } + CHECK(a.type().is_bool()); + CHECK(b.type().is_bool()); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::And::make(a, b); } Expr operator||(Expr a, Expr b) { - using ir::UIntImm; - if (a.type().is_bool() && b.type().is_bool()) { - const UIntImm* pa = a.as(); - const UIntImm* pb = b.as(); - if (pa && pa->value) return a; - if (pa && !pa->value) return b; - if (pb && pb->value) return b; - if (pb && !pb->value) return a; - } + CHECK(a.type().is_bool()); + CHECK(b.type().is_bool()); + Expr ret = arith::TryConstFold(a, b); + if (ret.defined()) return ret; return ir::Or::make(a, b); } Expr operator!(Expr a) { - using ir::UIntImm; - const UIntImm* pa = a.as(); - if (pa) { - return UIntImm::make(UInt(1), !(pa->value)); - } + CHECK(a.type().is_bool()); + Expr ret = arith::TryConstFold(a); + if (ret.defined()) return ret; return ir::Not::make(a); } Expr operator>>(Expr a, Expr b) { + BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? 
ta : tb; + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value >> pb->value)); if (pb) { - if (pb->value == 0) return SimpleCast(rtype, a); + if (pb->value == 0) return a; } }); return ir::Call::make(a.type(), ir::Call::shift_right, { a, b }, ir::Call::PureIntrinsic); } Expr operator<<(Expr a, Expr b) { + BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? ta : tb; + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value << pb->value)); if (pb) { - if (pb->value == 0) return SimpleCast(rtype, a); + if (pb->value == 0) return a; } }); return ir::Call::make(a.type(), ir::Call::shift_left, { a, b }, ir::Call::PureIntrinsic); } Expr operator&(Expr a, Expr b) { + BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? ta : tb; + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value & pb->value)); }); return ir::Call::make(a.type(), ir::Call::bitwise_and, { a, b }, ir::Call::PureIntrinsic); } Expr operator|(Expr a, Expr b) { + BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? ta : tb; + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value | pb->value)); }); return ir::Call::make(a.type(), ir::Call::bitwise_or, { a, b }, ir::Call::PureIntrinsic); } Expr operator^(Expr a, Expr b) { + BinaryOpMatchTypes(a, b); TVM_INDEX_CONST_PROPAGATION({ - Type rtype = ta.bits() >= tb.bits() ? 
ta : tb; + const Type& rtype = a.type(); if (pa && pb) return IntImm::make(rtype, (pa->value ^ pb->value)); }); return ir::Call::make(a.type(), ir::Call::bitwise_xor, { a, b }, ir::Call::PureIntrinsic); diff --git a/src/op/hybrid_op.cc b/src/op/hybrid_op.cc index 0268498c7db2..31c45258abc8 100644 --- a/src/op/hybrid_op.cc +++ b/src/op/hybrid_op.cc @@ -7,8 +7,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/src/pass/ir_util.h b/src/pass/ir_util.h index 3cef4486ee1b..6af8421398de 100644 --- a/src/pass/ir_util.h +++ b/src/pass/ir_util.h @@ -7,7 +7,7 @@ #define TVM_PASS_IR_UTIL_H_ #include -#include +#include #include #include diff --git a/src/pass/storage_flatten.cc b/src/pass/storage_flatten.cc index 488d44544c31..12913dde95af 100644 --- a/src/pass/storage_flatten.cc +++ b/src/pass/storage_flatten.cc @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc index c24203cebdb3..5bab6399151a 100644 --- a/src/relay/op/nn/pad.cc +++ b/src/relay/op/nn/pad.cc @@ -4,7 +4,7 @@ * \brief Implementation of operator pad */ #include -#include +#include #include #include #include diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index df23b22512e3..55892e5c73a1 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -5,7 +5,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 11a376b2b657..11f96c48a311 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -6,7 +6,7 @@ * \brief This is a backend-aware optimization pass. * Fuse necessary ops into a single one. 
*/ -#include +#include #include #include #include diff --git a/tests/cpp/ir_mutator_test.cc b/tests/cpp/ir_mutator_test.cc index 0802d405bbe4..eecced8d90ab 100644 --- a/tests/cpp/ir_mutator_test.cc +++ b/tests/cpp/ir_mutator_test.cc @@ -1,7 +1,7 @@ #include #include #include -#include +#include namespace { using namespace tvm::ir; From be841948aa5cc85c4e3df88433763bfc4cf8643b Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Mon, 4 Mar 2019 02:15:12 +0800 Subject: [PATCH 58/93] [RELAY][PASS] Common subexpression elimination (#2639) --- python/tvm/relay/ir_pass.py | 20 ++++++ src/relay/pass/eliminate_common_subexpr.cc | 72 +++++++++++++++++++ src/relay/pass/pattern_util.h | 15 ++++ .../test_pass_eliminate_common_subexpr.py | 63 ++++++++++++++++ 4 files changed, 170 insertions(+) create mode 100644 src/relay/pass/eliminate_common_subexpr.cc create mode 100644 tests/python/relay/test_pass_eliminate_common_subexpr.py diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py index 02a6e8b5906e..04b92ba68e3b 100644 --- a/python/tvm/relay/ir_pass.py +++ b/python/tvm/relay/ir_pass.py @@ -564,3 +564,23 @@ def get_total_mac_number(expr): The number of MACs (multiply-accumulate) of a model """ return _ir_pass.GetTotalMacNumber(expr) + + +def eliminate_common_subexpr(expr, fskip=None): + """ + Eliminate common subexpressions. + + Parameters + ---------- + expr : tvm.relay.Expr + The input expression. + + fskip: function + The callback function that decides whether an expression should be skipped. + + Returns + ------- + expr : tvm.relay.Expr + The output expression. + """ + return _ir_pass.eliminate_common_subexpr(expr, fskip) diff --git a/src/relay/pass/eliminate_common_subexpr.cc b/src/relay/pass/eliminate_common_subexpr.cc new file mode 100644 index 000000000000..10e6f920f245 --- /dev/null +++ b/src/relay/pass/eliminate_common_subexpr.cc @@ -0,0 +1,72 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * + * \file eliminate_common_subexpr.cc + * \brief Combine common subexpressions. + * + * This is an optimization pass that eliminates common subexpressions. During the pass, it tries + * to replace an expression with a previously appeared expression with the same input and + * attributes. The fskip callback argument allows us to skip specific expressions. + */ +#include +#include +#include +#include "./pattern_util.h" + +namespace tvm { +namespace relay { + +class CommonSubexprEliminator : public ExprMutator { + public: + explicit CommonSubexprEliminator(runtime::TypedPackedFunc fskip): fskip_(fskip) {} + + Expr VisitExpr_(const CallNode* call) final { + static auto op_stateful = Op::GetAttr("TOpIsStateful"); + Expr new_expr = ExprMutator::VisitExpr_(call); + const CallNode* new_call = new_expr.as(); + CHECK(new_call); + const OpNode* op = new_call->op.as(); + AttrsEqual attrs_equal; + + if (new_call->args.size() == 0 || op == nullptr || op_stateful.get(GetRef(op), false)) { + return new_expr; + } + if (fskip_ != nullptr && fskip_(new_expr)) { + return new_expr; + } + + auto it = expr_map_.find(new_call->op); + if (it != expr_map_.end()) { + for (const CallNode* candidate : it->second) { + bool is_equivalent = true; + if (!attrs_equal(new_call->attrs, candidate->attrs)) { + continue; + } + for (size_t i = 0; i < new_call->args.size(); i++) { + if (!new_call->args[i].same_as(candidate->args[i]) && + !IsEqualScalar(new_call->args[i], candidate->args[i])) { + is_equivalent = false; + break; + } + } + if (!is_equivalent) continue; + return GetRef(candidate); + } + } + expr_map_[new_call->op].push_back(new_call); + return new_expr; + } + + std::unordered_map, NodeHash, NodeEqual> expr_map_; + runtime::TypedPackedFunc fskip_; +}; + +Expr EliminateCommonSubexpr(const Expr& expr, PackedFunc callback) { + return CommonSubexprEliminator(callback)(expr); +} + +TVM_REGISTER_API("relay._ir_pass.eliminate_common_subexpr") 
+.set_body_typed(EliminateCommonSubexpr); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index 0644c26c6bcc..e59efa958310 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -191,6 +191,21 @@ inline Constant MakeConstantScalar(DataType dtype, T value) { return ConstantNode::make(arr); } +/*! + * \brief Check if two expressions are equal scalars. + * \param a The expression to be checked. + * \param b The expression to be checked + * \return Whether two expressions are equal scalars. + */ +inline bool IsEqualScalar(const Expr& a, const Expr& b) { + const auto* constant_a = a.as(); + const auto* constant_b = b.as(); + if (!constant_a || !constant_b || !constant_a->is_scalar() || !constant_b->is_scalar()) { + return false; + } + return AlphaEqual(a, b); +} + inline Expr GetField(Expr t, size_t i) { return TupleGetItemNode::make(t, i); } diff --git a/tests/python/relay/test_pass_eliminate_common_subexpr.py b/tests/python/relay/test_pass_eliminate_common_subexpr.py new file mode 100644 index 000000000000..381a54a3d324 --- /dev/null +++ b/tests/python/relay/test_pass_eliminate_common_subexpr.py @@ -0,0 +1,63 @@ +"""Test eliminate common subexpr pass""" +from tvm import relay +from tvm.relay.op import register_alter_op_layout +from tvm.relay import ir_pass + + +def test_simple(): + def before(): + x = relay.var("x", shape=(1, 16)) + y1 = relay.nn.relu(x) + y2 = relay.nn.relu(x) + y1 = relay.add(y1, relay.const(1.0, "float32")) + y2 = relay.add(y2, relay.const(1.0, "float32")) + y = relay.add(y1, y2) + f = relay.Function([x], y) + return f + + def expected(): + x = relay.var("x", shape=(1, 16)) + y = relay.nn.relu(x) + y = relay.add(y, relay.const(1.0, "float32")) + y = relay.add(y, y) + f = relay.Function([x], y) + return f + + z = before() + z = ir_pass.eliminate_common_subexpr(z) + assert ir_pass.alpha_equal(z, expected()) + + +def test_callback(): + def before(): + x = 
relay.var("x", shape=(1, 16)) + y1 = relay.nn.relu(x) + y2 = relay.nn.relu(x) + y1 = relay.add(y1, relay.const(1.0, "float32")) + y2 = relay.add(y2, relay.const(1.0, "float32")) + y = relay.add(y1, y2) + f = relay.Function([x], y) + return f + + def expected(): + x = relay.var("x", shape=(1, 16)) + y = relay.nn.relu(x) + y1 = relay.add(y, relay.const(1.0, "float32")) + y2 = relay.add(y, relay.const(1.0, "float32")) + y = relay.add(y1, y2) + f = relay.Function([x], y) + return f + + def fskip(expr): + if isinstance(expr, relay.expr.Call) and expr.op.name == 'add': + return True + return False + + z = before() + z = ir_pass.eliminate_common_subexpr(z, fskip) + assert ir_pass.alpha_equal(z, expected()) + + +if __name__ == "__main__": + test_simple() + test_callback() From 215aedbe0f0de805560d429d610a5c01b21fc578 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi Date: Sun, 3 Mar 2019 23:50:59 +0530 Subject: [PATCH 59/93] [Tensorflow, NNVM, TOPI] Support for logical operators (#2453) --- docs/api/python/topi.rst | 3 ++ docs/nnvm_top.rst | 6 +++ nnvm/python/nnvm/compiler/graph_attr.py | 2 + nnvm/python/nnvm/frontend/tensorflow.py | 8 ++++ nnvm/python/nnvm/top/tensor.py | 12 ++++++ nnvm/src/compiler/compile_engine.cc | 3 ++ nnvm/src/top/tensor/elemwise.cc | 38 ++++++++++++++++ .../frontend/tensorflow/test_forward.py | 43 +++++++++++++++++++ topi/include/topi/broadcast.h | 27 ++++++++++++ topi/include/topi/elemwise.h | 17 ++++++++ topi/src/topi.cc | 2 + 11 files changed, 161 insertions(+) diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index e8b63637ffb5..d2f9f01fcf52 100644 --- a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -68,6 +68,9 @@ List of operators topi.not_equal topi.greater_equal topi.less_equal + topi.logical_and + topi.logical_or + topi.logical_not topi.arange topi.layout_transform topi.image.resize diff --git a/docs/nnvm_top.rst b/docs/nnvm_top.rst index 717ce985e002..f05eed3308b3 100644 --- a/docs/nnvm_top.rst +++ 
b/docs/nnvm_top.rst @@ -35,6 +35,9 @@ This level enables fully connected multi-layer perceptron. nnvm.symbol.exp nnvm.symbol.log nnvm.symbol.sqrt + nnvm.symbol.logical_and + nnvm.symbol.logical_or + nnvm.symbol.logical_not nnvm.symbol.elemwise_add nnvm.symbol.elemwise_sub nnvm.symbol.elemwise_mul @@ -172,6 +175,9 @@ Detailed Definitions .. autofunction:: nnvm.symbol.exp .. autofunction:: nnvm.symbol.log .. autofunction:: nnvm.symbol.sqrt +.. autofunction:: nnvm.symbol.logical_and +.. autofunction:: nnvm.symbol.logical_or +.. autofunction:: nnvm.symbol.logical_not .. autofunction:: nnvm.symbol.elemwise_add .. autofunction:: nnvm.symbol.elemwise_sub .. autofunction:: nnvm.symbol.elemwise_mul diff --git a/nnvm/python/nnvm/compiler/graph_attr.py b/nnvm/python/nnvm/compiler/graph_attr.py index 3ce6c4b53239..2f1f0350d71b 100644 --- a/nnvm/python/nnvm/compiler/graph_attr.py +++ b/nnvm/python/nnvm/compiler/graph_attr.py @@ -39,6 +39,7 @@ def set_shape_inputs(g, shape): "uint16": 8, "uint32": 9, "uint64": 10, + "bool": 11, } TCODE_TO_DTYPE = { @@ -54,6 +55,7 @@ def set_shape_inputs(g, shape): 8: "uint16", 9: "uint32", 10: "uint64", + 11: "bool", } def set_dtype_inputs(g, dtype): diff --git a/nnvm/python/nnvm/frontend/tensorflow.py b/nnvm/python/nnvm/frontend/tensorflow.py index 980e60414595..777ab8a80adf 100644 --- a/nnvm/python/nnvm/frontend/tensorflow.py +++ b/nnvm/python/nnvm/frontend/tensorflow.py @@ -884,6 +884,11 @@ def _expand_dims_0d_aware(data, attr, axis, num_newaxis=1): return _sym.expand_dims(data, axis=axis, num_newaxis=num_newaxis) +def _logical(name): + def _impl(inputs, attr, params): + return AttrCvt(op_name=name)(inputs, attr) + return _impl + # compatible operators that do NOT require any conversion. 
_identity_list = [] @@ -946,6 +951,9 @@ def _expand_dims_0d_aware(data, attr, axis, num_newaxis=1): 'Transpose' : _transpose(), 'Tanh' : AttrCvt('tanh'), 'Mean' : _mean(), + 'LogicalAnd' : _logical('logical_and'), + 'LogicalOr' : _logical('logical_or'), + 'LogicalNot' : _logical('logical_not'), 'Less' : _broadcast('less'), 'Greater' : _broadcast('greater'), 'LessEqual' : _broadcast('less_equal'), diff --git a/nnvm/python/nnvm/top/tensor.py b/nnvm/python/nnvm/top/tensor.py index e0214d6ddf16..5dae01695e3a 100644 --- a/nnvm/python/nnvm/top/tensor.py +++ b/nnvm/python/nnvm/top/tensor.py @@ -140,6 +140,18 @@ def _compute(attrs, x, _): reg.register_pattern("__rshift_scalar__", OpPattern.ELEMWISE) reg.register_schedule("__rshift_scalar__", _fschedule_broadcast) +# logical_and +reg.register_pattern("logical_and", OpPattern.ELEMWISE) +reg.register_schedule("logical_and", _fschedule_broadcast) + +# logical_or +reg.register_pattern("logical_or", OpPattern.ELEMWISE) +reg.register_schedule("logical_or", _fschedule_broadcast) + +# logical_not +reg.register_pattern("logical_not", OpPattern.ELEMWISE) +reg.register_schedule("logical_not", _fschedule_broadcast) + # elemwise_add reg.register_pattern("elemwise_add", OpPattern.BROADCAST) reg.register_schedule("elemwise_add", _fschedule_broadcast) diff --git a/nnvm/src/compiler/compile_engine.cc b/nnvm/src/compiler/compile_engine.cc index fbeceb17668c..2fd9c44fda66 100644 --- a/nnvm/src/compiler/compile_engine.cc +++ b/nnvm/src/compiler/compile_engine.cc @@ -40,6 +40,7 @@ int GetTypeFlag(tvm::Type type) { if (type == tvm::UInt(16)) return 8; if (type == tvm::UInt(32)) return 9; if (type == tvm::UInt(64)) return 10; + if (type == tvm::UInt(1)) return 11; LOG(FATAL) << "cannot convert " << type; return 0; } @@ -68,6 +69,8 @@ Type GetTVMType(int type_flag) { return tvm::UInt(32); case 10: return tvm::UInt(64); + case 11: + return tvm::UInt(1); default: LOG(FATAL) << "unknown type_flag=" << type_flag; return Float(32); diff --git 
a/nnvm/src/top/tensor/elemwise.cc b/nnvm/src/top/tensor/elemwise.cc index 3ee52008eb1c..52d9aa4456ed 100644 --- a/nnvm/src/top/tensor/elemwise.cc +++ b/nnvm/src/top/tensor/elemwise.cc @@ -361,6 +361,31 @@ NNVM_REGISTER_ELEMWISE_BINARY_OP(elemwise_pow) return Array{ topi::power(inputs[0], inputs[1]) }; }); +// logical +NNVM_REGISTER_ELEMWISE_BINARY_OP(logical_and) +.describe(R"code(Elementwise compute the logical AND + +)code") +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::logical_and(inputs[0], inputs[1]) }; +}); + +NNVM_REGISTER_ELEMWISE_BINARY_OP(logical_or) +.describe(R"code(Elementwise compute the logical OR + +)code") +.set_support_level(1) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::logical_or(inputs[0], inputs[1]) }; +}); + // negative NNVM_REGISTER_ELEMWISE_UNARY_OP(negative) .describe(R"code(Elemenwise numeric negative @@ -383,6 +408,19 @@ NNVM_REGISTER_ELEMWISE_UNARY_OP(negative) }; }); +// logical NOT +NNVM_REGISTER_ELEMWISE_UNARY_OP(logical_not) +.describe(R"code(Elementwise compute the logical NOT + +)code" NNVM_ADD_FILELINE) +.set_support_level(3) +.set_attr( + "FTVMCompute", [](const NodeAttrs& attrs, + const Array& inputs, + const Array& out_info) { + return Array{ topi::logical_not(inputs[0]) }; +}); + // copy NNVM_REGISTER_ELEMWISE_UNARY_OP(copy) .describe(R"code(Copy tensor to another one. 
diff --git a/nnvm/tests/python/frontend/tensorflow/test_forward.py b/nnvm/tests/python/frontend/tensorflow/test_forward.py index f4ec61979527..b71442d2b9a4 100644 --- a/nnvm/tests/python/frontend/tensorflow/test_forward.py +++ b/nnvm/tests/python/frontend/tensorflow/test_forward.py @@ -777,6 +777,48 @@ def test_forward_pad(): _test_pad((2, 3), [[1,1], [2,2]], mode="CONSTANT") _test_pad((2, 3), [[1,1], [2,2]], mode="CONSTANT", constant_values=1.0) +####################################################################### +# Logical operators +# -------------------- +def test_logical_and(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_and(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_or(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_or(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_xor(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_xor(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_not(): + with 
tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + out = tf.logical_not(in1, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm(in_data1, 'in1:0', 'out:0') + +def test_forward_logical(): + test_logical_and() + test_logical_or() + test_logical_xor() + test_logical_not() ####################################################################### # Inception V3 @@ -1205,3 +1247,4 @@ def test_forward_rel_ops(): # Relational ops test_forward_rel_ops() + test_forward_logical() diff --git a/topi/include/topi/broadcast.h b/topi/include/topi/broadcast.h index ad1c04ae1327..88007ee94e85 100644 --- a/topi/include/topi/broadcast.h +++ b/topi/include/topi/broadcast.h @@ -93,6 +93,33 @@ inline tvm::Tensor broadcast_to(const tvm::Tensor& t, return topi::OpName(A, B); \ } +/*! + * \fn logical_and + * \brief Compute A && B with auto-broadcasting. + * + * \param A The first tensor, or Expr + * \param B The second tensor, or Expr + * \param name The name of the operation + * \param tag The tag to mark the operation + * + * \return The result. + */ +TOPI_DEFINE_BCAST_OP(logical_and, { return a && b; }); +TOPI_DEFINE_OP_OVERLOAD(operator&&, logical_and); + +/*! + * \fn logical_or + * \brief Compute A || B with auto-broadcasting. + * + * \param A The first tensor, or Expr + * \param B The second tensor, or Expr + * \param name The name of the operation + * \param tag The tag to mark the operation + * + * \return The result. + */ +TOPI_DEFINE_BCAST_OP(logical_or, { return a || b; }); +TOPI_DEFINE_OP_OVERLOAD(operator||, logical_or); /*! * \fn add diff --git a/topi/include/topi/elemwise.h b/topi/include/topi/elemwise.h index 02bc51515159..40dffa09a9bf 100644 --- a/topi/include/topi/elemwise.h +++ b/topi/include/topi/elemwise.h @@ -71,6 +71,23 @@ inline Tensor negative(const Tensor& x, }, name, tag); } +/*! 
+* \brief Creates an operation that returns the logical NOT of a given tensor +* +* \param x The input tensor +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the logical NOT operation +*/ +inline Tensor logical_not(const Tensor& x, + std::string name = "tensor", + std::string tag = kElementWise) { + return compute(x->shape, [&](const Array& i) { + return !x(i); + }, name, tag); +} + /*! * \brief Creates an operation that clips each element of a tensor to * the interval [a_min, a_max] diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 6fa748547cd9..e566a5d510ee 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -112,6 +112,8 @@ TOPI_REGISTER_BCAST_OP("topi.maximum", topi::maximum); TOPI_REGISTER_BCAST_OP("topi.minimum", topi::minimum); TOPI_REGISTER_BCAST_OP("topi.power", topi::power); TOPI_REGISTER_BCAST_OP("topi.left_shift", topi::left_shift); +TOPI_REGISTER_BCAST_OP("topi.logical_and", topi::logical_and); +TOPI_REGISTER_BCAST_OP("topi.logical_or", topi::logical_or); TOPI_REGISTER_BCAST_OP("topi.right_shift", topi::right_shift); TOPI_REGISTER_BCAST_OP("topi.greater", topi::greater); TOPI_REGISTER_BCAST_OP("topi.less", topi::less); From 76e83df8e7d5139681026f56c263f4ab0bbb7f20 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sun, 3 Mar 2019 10:24:20 -0800 Subject: [PATCH 60/93] [Relay][Frontend] Add a few mxnet ops in relay frontend (#2704) --- python/tvm/relay/frontend/mxnet.py | 79 +++++++++++++------- tests/python/frontend/mxnet/test_forward.py | 83 +++++++++++++++++++++ 2 files changed, 136 insertions(+), 26 deletions(-) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 3d3bb8e4fd84..1f1d18e240cd 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -64,6 +64,13 @@ def _stable_softrelu(x): raise RuntimeError("Do not support act_type: {}".format(act_type)) +def _mx_compare(new_op, wrapper): + def 
impl(inputs, attrs): + dtype = ir_pass.infer_type(inputs[0]).checked_type.dtype + return wrapper(new_op)(inputs, attrs).astype(dtype) + return impl + + def _mx_conv2d(inputs, attrs): kernel_size = attrs.get_int_tuple("kernel") if len(kernel_size) != 2: @@ -333,32 +340,52 @@ def _mx_roi_align(inputs, attrs): ] _convert_map = { - "_copy" : _rename(_op.copy), - "relu" : _rename(_op.nn.relu), - "broadcast_add" : _rename(_op.add), - "broadcast_sub" : _rename(_op.subtract), - "broadcast_mul" : _rename(_op.multiply), - "broadcast_div" : _rename(_op.divide), - "elemwise_add" : _rename(_op.add), - "elemwise_sub" : _rename(_op.subtract), - "elemwise_mul" : _rename(_op.multiply), - "elemwise_div" : _rename(_op.divide), - "flatten" : _rename(_op.nn.batch_flatten), - "Flatten" : _rename(_op.nn.batch_flatten), - "_plus_scalar" : _binop_scalar(_op.add), - "__add_scalar__": _binop_scalar(_op.add), - "__sub_scalar__": _binop_scalar(_op.subtract), - "_minus_scalar" : _binop_scalar(_op.subtract), - "__mul_scalar__": _binop_scalar(_op.multiply), - "_mul_scalar" : _binop_scalar(_op.multiply), - "__div_scalar__": _binop_scalar(_op.divide), - "_div_scalar" : _binop_scalar(_op.divide), - "__pow_scalar__": _binop_scalar(_op.power), - "_rminus_scalar": _rbinop_scalar(_op.subtract), - "__rsub_scalar__": _rbinop_scalar(_op.subtract), - "_rdiv_scalar" : _rbinop_scalar(_op.divide), - "__rdiv_scalar__" : _rbinop_scalar(_op.divide), - "__rpow_scalar__": _rbinop_scalar(_op.power), + "_copy" : _rename(_op.copy), + "relu" : _rename(_op.nn.relu), + "broadcast_add" : _rename(_op.add), + "broadcast_sub" : _rename(_op.subtract), + "broadcast_mul" : _rename(_op.multiply), + "broadcast_div" : _rename(_op.divide), + "broadcast_mod" : _rename(_op.mod), + "broadcast_maximum" : _rename(_op.maximum), + "broadcast_minimum" : _rename(_op.minimum), + "broadcast_equal" : _mx_compare(_op.equal, _rename), + "broadcast_not_equal" : _mx_compare(_op.not_equal, _rename), + "broadcast_greater" : _mx_compare(_op.greater, 
_rename), + "broadcast_greater_equal": _mx_compare(_op.greater_equal, _rename), + "broadcast_lesser" : _mx_compare(_op.less, _rename), + "broadcast_lesser_equal" : _mx_compare(_op.less_equal, _rename), + "elemwise_add" : _rename(_op.add), + "elemwise_sub" : _rename(_op.subtract), + "elemwise_mul" : _rename(_op.multiply), + "elemwise_div" : _rename(_op.divide), + "_maximum" : _rename(_op.maximum), + "_minimum" : _rename(_op.minimum), + "flatten" : _rename(_op.nn.batch_flatten), + "Flatten" : _rename(_op.nn.batch_flatten), + "__add_scalar__" : _binop_scalar(_op.add), + "_plus_scalar" : _binop_scalar(_op.add), + "__sub_scalar__" : _binop_scalar(_op.subtract), + "_minus_scalar" : _binop_scalar(_op.subtract), + "__mul_scalar__" : _binop_scalar(_op.multiply), + "_mul_scalar" : _binop_scalar(_op.multiply), + "__div_scalar__" : _binop_scalar(_op.divide), + "_div_scalar" : _binop_scalar(_op.divide), + "__pow_scalar__" : _binop_scalar(_op.power), + "_power_scalar" : _binop_scalar(_op.power), + "__rsub_scalar__" : _rbinop_scalar(_op.subtract), + "_rminus_scalar" : _rbinop_scalar(_op.subtract), + "__rdiv_scalar__" : _rbinop_scalar(_op.divide), + "_rdiv_scalar" : _rbinop_scalar(_op.divide), + "__rpow_scalar__" : _rbinop_scalar(_op.power), + "_equal_scalar" : _mx_compare(_op.equal, _binop_scalar), + "_not_equal_scalar" : _mx_compare(_op.not_equal, _binop_scalar), + "_greater_scalar" : _mx_compare(_op.greater, _binop_scalar), + "_greater_equal_scalar" : _mx_compare(_op.greater_equal, _binop_scalar), + "_lesser_scalar" : _mx_compare(_op.less, _binop_scalar), + "_lesser_equal_scalar" : _mx_compare(_op.less_equal, _binop_scalar), + "_maximum_scalar" : _binop_scalar(_op.maximum), + "_minimum_scalar" : _binop_scalar(_op.minimum), # reduction ops "max" : _reduce(_op.max), "min" : _reduce(_op.min), diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 671316079308..ee47d72046ed 100644 --- 
a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -1,4 +1,5 @@ import numpy as np +import operator import tvm from tvm.contrib import graph_runtime @@ -256,6 +257,85 @@ def verify(start, stop, step): verify(20, 1, -1) verify(20, 1, -1.5) +def _mx_symbol(F, op_name, inputs): + op = getattr(F, op_name) + return op(*inputs) + +def test_forward_broadcast_ops(): + for op in ["broadcast_add", "broadcast_sub", "broadcast_mul", + "broadcast_div", "broadcast_mod", "broadcast_maximum", + "broadcast_minimum", "broadcast_equal", "broadcast_not_equal", + "broadcast_greater", "broadcast_greater_equal", + "broadcast_lesser", "broadcast_lesser_equal"]: + a_shape = (3, 4, 5) + b_shape = (4, 5) + if op == "broadcast_mod": + dtype = 'int32' + a_np = np.random.randint(1, 100, size=a_shape).astype(dtype) + b_np = np.random.randint(1, 100, size=b_shape).astype(dtype) + else: + dtype = 'float32' + a_np = np.random.uniform(size=a_shape).astype(dtype) + b_np = np.random.uniform(size=b_shape).astype(dtype) + mx_sym = _mx_symbol(mx.sym, op, [mx.sym.var('a'), mx.sym.var('b')]) + ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), mx.nd.array(b_np)]) + shapes = {'a': a_shape, 'b': b_shape} + new_sym, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(a_np, b_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + +def test_forward_elemwise_ops(): + for op in ["elemwise_add", "elemwise_sub", "elemwise_mul", + "elemwise_div", "maximum", "minimum"]: + shape = (3, 4, 5) + dtype = 'float32' + a_np = np.random.uniform(size=shape).astype(dtype) + b_np = np.random.uniform(size=shape).astype(dtype) + mx_sym = _mx_symbol(mx.sym, op, [mx.sym.var('a'), mx.sym.var('b')]) + ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), mx.nd.array(b_np)]) + shapes = {'a': shape, 'b': shape} 
+ new_sym, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(a_np, b_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + +def test_forward_scalar_ops(): + for op in [operator.add, operator.sub, operator.mul, operator.truediv, + operator.pow, operator.lt, operator.le, operator.eq, + operator.ne, operator.gt, operator.ge]: + dtype='float32' + a_shape = (3, 4, 5) + a_np = np.random.uniform(size=a_shape).astype(dtype) + b_scalar = 2.3 + mx_sym = op(mx.sym.var('a'), b_scalar) + ref_res = op(mx.nd.array(a_np), b_scalar) + shapes = {'a': a_shape} + new_sym, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(a_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + for op in ["maximum", "minimum"]: + dtype='float32' + a_shape = (3, 4, 5) + a_np = np.random.uniform(size=a_shape).astype(dtype) + b_scalar = 2.3 + mx_sym = _mx_symbol(mx.sym, op, [mx.sym.var('a'), b_scalar]) + ref_res = _mx_symbol(mx.nd, op, [mx.nd.array(a_np), b_scalar]) + shapes = {'a': a_shape} + new_sym, _ = relay.frontend.from_mxnet(mx_sym, shapes, dtype) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(a_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + if __name__ == '__main__': test_forward_mlp() @@ -280,3 +360,6 @@ def verify(start, stop, step): test_forward_argmin() test_forward_where() test_forward_arange() + test_forward_broadcast_ops() + test_forward_elemwise_ops() + test_forward_scalar_ops() From 674d9aa9b89d460387d50569301963db93af40c2 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Sun, 3 Mar 
2019 21:14:14 -0800 Subject: [PATCH 61/93] [Relay][Frontend] Add slice axis op in mxnet converter (#2706) * Add slice axis op in mxnet converter * Fix lint --- python/tvm/relay/frontend/mxnet.py | 29 +++++++++++++++++++++ tests/python/frontend/mxnet/test_forward.py | 18 +++++++++++++ 2 files changed, 47 insertions(+) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 1f1d18e240cd..4d341c76043a 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -194,6 +194,34 @@ def _mx_slice(inputs, attrs): return _op.strided_slice(inputs[0], **new_attrs) +def _mx_slice_axis(inputs, attrs): + assert len(inputs) == 1 + shape = ir_pass.infer_type(inputs[0]).checked_type.shape + axis = attrs.get_int("axis") + ax_beg = attrs.get_int("begin") + ax_end = attrs.get_str("end") + if ax_end == "None": + ax_end = int(shape[axis]) + else: + ax_end = int(ax_end) + if ax_beg < 0: + ax_beg += int(shape[axis]) + if ax_end < 0: + ax_end += int(shape[axis]) + assert ax_beg >= 0 and ax_beg < int(shape[axis]) + assert ax_end > ax_beg and ax_end <= int(shape[axis]) + begin = [] + end = [] + for i, dim in enumerate(shape): + if i != axis: + begin.append(0) + end.append(dim) + else: + begin.append(ax_beg) + end.append(ax_end) + return _op.strided_slice(inputs[0], begin, end) + + def _mx_split(inputs, attrs): axis = attrs.get_int("axis", 1) new_attrs = {} @@ -423,6 +451,7 @@ def _mx_roi_align(inputs, attrs): "BatchNorm_v1" : _mx_batch_norm, "LRN" : _mx_lrn, "slice" : _mx_slice, + "slice_axis" : _mx_slice_axis, "SliceChannel" : _mx_split, "split" : _mx_split, "expand_dims" : _mx_expand_dims, diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index ee47d72046ed..7f53aa8a0155 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -337,6 +337,23 @@ def test_forward_scalar_ops(): tvm.testing.assert_allclose(op_res.asnumpy(), 
ref_res.asnumpy()) +def test_forward_slice_axis(): + def verify(shape, axis, begin, end): + data_np = np.random.uniform(size=shape).astype("float32") + ref_res = mx.nd.slice_axis(mx.nd.array(data_np), axis, begin, end) + mx_sym = mx.sym.slice_axis(mx.sym.var("data"), axis, begin, end) + new_sym, _ = relay.frontend.from_mxnet(mx_sym, {"data": shape}) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(data_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + verify((3, 4), 0, 1, 2) + verify((3, 4), 0, 1, None) + verify((3, 4), 1, 0, 2) + verify((3, 4), 1, -3, -1) + + if __name__ == '__main__': test_forward_mlp() test_forward_vgg() @@ -363,3 +380,4 @@ def test_forward_scalar_ops(): test_forward_broadcast_ops() test_forward_elemwise_ops() test_forward_scalar_ops() + test_forward_slice_axis() \ No newline at end of file From 8ce998ebd3220e2d3578d9dd47ca7bea0f68a63d Mon Sep 17 00:00:00 2001 From: MORINAGA <34588258+imorinaga@users.noreply.github.com> Date: Mon, 4 Mar 2019 22:21:49 +0900 Subject: [PATCH 62/93] [DOCS] Fix tutorial (#2724) * fix docments * delete e --- docs/install/from_source.rst | 2 +- tutorials/relay_quick_start.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 7c0f5432ec94..5c828957cc79 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -162,7 +162,7 @@ Python dependencies .. code:: bash - pip install --user numpy decorator + pip install --user numpy decorator attrs * If you want to use RPC Tracker diff --git a/tutorials/relay_quick_start.py b/tutorials/relay_quick_start.py index 0768458d2cd4..286114fe997f 100644 --- a/tutorials/relay_quick_start.py +++ b/tutorials/relay_quick_start.py @@ -1,7 +1,8 @@ """ .. 
_tutorial-relay-quick-start: + Quick Start Tutorial for Compiling Deep Learning Models -======================================================e +====================================================== **Author**: `Yao Wang `_, `Truman Tian `_ This example shows how to build a neural network with Relay python frontend and From 1f04aed7e83770313188f5dc7aea7155f68cda4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Mon, 4 Mar 2019 09:53:07 -0800 Subject: [PATCH 63/93] [Relay] Higher order reverse mode automatic differentiation that work with control flow (#2496) add test remove dead code stash do it add more test --- python/tvm/relay/ir_pass.py | 21 ++- src/relay/pass/fuse_ops.cc | 1 + src/relay/pass/gradient.cc | 201 +++++++++++++++++------ src/relay/pass/pattern_util.h | 4 +- src/relay/pass/type_infer.cc | 2 +- tests/python/relay/test_pass_gradient.py | 70 ++++++++ 6 files changed, 243 insertions(+), 56 deletions(-) diff --git a/python/tvm/relay/ir_pass.py b/python/tvm/relay/ir_pass.py index 04b92ba68e3b..2d8e99ae8b25 100644 --- a/python/tvm/relay/ir_pass.py +++ b/python/tvm/relay/ir_pass.py @@ -530,9 +530,11 @@ def to_graph_normal_form(expr): return _ir_pass.to_graph_normal_form(expr) -def gradient(expr, mod=None): +def gradient(expr, mod=None, mode='higher_order'): """ - Transform a function to return original result paired with gradient of input. + Transform the input function, + returning a function that calculate the original result, + paired with gradient of the input. Parameters ---------- @@ -541,12 +543,23 @@ def gradient(expr, mod=None): mod : Optional[tvm.relay.Module] + mode : Optional[String] + The mode of the automatic differentiation algorithm. + 'first_order' only work on first order code, but will not produce reference nor closure. + 'higher_order' work on all code using reference and closure. + Returns ------- expr : tvm.relay.Expr - The output expression. + The transformed expression. 
""" - return _ir_pass.first_order_gradient(expr, mod) + if mode == 'first_order': + return _ir_pass.first_order_gradient(expr, mod) + elif mode == 'higher_order': + return _ir_pass.gradient(expr, mod) + else: + raise Exception('unknown mode') + def get_total_mac_number(expr): diff --git a/src/relay/pass/fuse_ops.cc b/src/relay/pass/fuse_ops.cc index 11f96c48a311..66ff9caf4ae4 100644 --- a/src/relay/pass/fuse_ops.cc +++ b/src/relay/pass/fuse_ops.cc @@ -225,6 +225,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor { } node->pattern = op_pattern; + this->Update(call->op, nullptr, kOpaque); const auto* rtype = call->checked_type().as(); // pass the message back to all the children it references. for (size_t i = 0; i < call->args.size(); ++i) { diff --git a/src/relay/pass/gradient.cc b/src/relay/pass/gradient.cc index 780490a45b0a..d564e02b5596 100644 --- a/src/relay/pass/gradient.cc +++ b/src/relay/pass/gradient.cc @@ -85,10 +85,10 @@ using ADValue = std::shared_ptr; /*! \brief AD over a program which generates a tensor output. */ struct ADTensor : ADValueNode { - Expr foward; + Expr forward; mutable Expr reverse; // must be a variable to avoid duplication - ADTensor(LetList* ll, const Expr& foward) : - foward(ll->Push(foward)), reverse(ll->Push(ZeroLike(this->foward))) { } + ADTensor(LetList* ll, const Expr& forward) : + forward(ll->Push(forward)), reverse(ll->Push(ZerosLike(this->forward))) { } }; /*! 
\brief A staged representation of the program, we reflect @@ -105,14 +105,14 @@ struct ADFunction : ADValueNode { func(func) { } }; -struct ReverseAD : ExprFunctor { +struct FirstOrderReverseAD : ExprFunctor { const OpMap rev_map = Op::GetAttr("FPrimalGradient"); std::vector> backprop_actions; // we assume no closure so no need for lexical scoping std::unordered_map env; LetList* ll; - ReverseAD(LetList* ll) : ll(ll) { } + FirstOrderReverseAD(LetList* ll) : ll(ll) { } ADValue VisitExpr_(const OpNode* op) final { Op op_ref = GetRef(op); @@ -121,21 +121,22 @@ struct ReverseAD : ExprFunctor { return std::make_shared([this, op_ref](const std::vector& args, const Attrs& attrs, const tvm::Array& type_args) { - std::vector call_args; - for (const ADValue& adval : args) { - call_args.push_back(adval->get().foward); + std::vector call_args; + for (const ADValue& adval : args) { + call_args.push_back(adval->get().forward); + } + auto orig = CallNode::make(op_ref, call_args, attrs, type_args); + auto ret = std::make_shared(ll, orig); + backprop_actions.push_back([this, args, orig, ret, op_ref](LetList* ll) { + tvm::Array rev = rev_map[op_ref](orig, ret->reverse); + CHECK(args.size() == rev.size()); + for (size_t i = 0; i < args.size(); ++i) { + args[i]->get().reverse = + ll->Push(Add(args[i]->get().reverse, rev[i])); } - auto orig = CallNode::make(op_ref, call_args, attrs, type_args); - auto ret = std::make_shared(ll, orig); - backprop_actions.push_back([this, args, orig, ret, op_ref](LetList* ll) { - tvm::Array rev = rev_map[op_ref](orig, ret->reverse); - for (size_t i = 0; i < args.size(); ++i) { - args[i]->get().reverse = - ll->Push(Add(args[i]->get().reverse, rev[i])); - } - }); - return ret; }); + return ret; + }); } ADValue VisitExpr_(const ConstantNode* op) final { @@ -172,6 +173,23 @@ struct ReverseAD : ExprFunctor { } }; +Type GradRetType(const Function& f) { + // if type annotations are provided, we will construct a ret type; + // otherwise, leave it to be inferred 
+ if (!f->ret_type.defined()) { + return Type(); + } + std::vector vt; + for (const auto& p : f->params) { + if (!p->type_annotation.defined()) { + return Type(); + } + vt.push_back(p->type_annotation); + } + + return TupleTypeNode::make({f->ret_type, TupleTypeNode::make(vt)}); +} + Expr FirstOrderGradient(const Expr& re, const Module& mod) { // Currently we first remove any global functions for the first // order case. @@ -182,7 +200,7 @@ Expr FirstOrderGradient(const Expr& re, const Module& mod) { // We will then build a sequence of lets which implement reverse mode. Expr body = LetList::With([&](LetList* ll) { - ReverseAD reverse_ad(ll); + FirstOrderReverseAD reverse_ad(ll); ADValue rev = reverse_ad(e); std::vector args; for (const auto& p : f->params) { @@ -191,46 +209,131 @@ Expr FirstOrderGradient(const Expr& re, const Module& mod) { auto c = rev->get().func(args, Attrs(), {}); const auto& res = c->get(); Expr grad = LetList::With([&](LetList* ll) { - res.reverse = OneLike(res.foward); - for (auto it = reverse_ad.backprop_actions.rbegin(); - it != reverse_ad.backprop_actions.rend(); - ++it) { - (*it)(ll); + res.reverse = OnesLike(res.forward); + for (auto it = reverse_ad.backprop_actions.rbegin(); + it != reverse_ad.backprop_actions.rend(); + ++it) { + (*it)(ll); + } + std::vector grad_res; + for (const auto& a : args) { + grad_res.push_back(a->get().reverse); + } + return TupleNode::make(grad_res); + }); + return Pair(res.forward, grad); + }); + + return FunctionNode::make(f->params, body, GradRetType(GetRef(f)), {}); +} + +TVM_REGISTER_API("relay._ir_pass.first_order_gradient") +.set_body([](TVMArgs args, TVMRetValue* ret) { + CHECK_EQ(args.size(), 2); + *ret = FirstOrderGradient(args[0], args[1]); +}); + +struct ReverseADType : TypeMutator { + Type VisitType_(const TensorTypeNode* ttn) final { + Type t = GetRef(ttn); + return TupleTypeNode::make({t, RefTypeNode::make(t)}); + } +}; + +struct ReverseAD : ExprMutator { + Var bp; + const OpMap rev_map = 
Op::GetAttr("FPrimalGradient"); + + ReverseAD(const Var& bp) : bp(bp) { } + + Expr VisitExpr_(const OpNode* op) final { + LOG(FATAL) << "op should only be inside call"; + throw; + } + + Expr VisitExpr_(const CallNode* op) final { + if (const OpNode* op_node = op->op.as()) { + Op op_ref = GetRef(op_node); + CHECK(rev_map.count(op_ref)) + << op_node->name << " does not have reverse mode defined"; + return LetList::With([&](LetList* ll) { + std::vector args; + for (const auto& arg : op->args) { + args.push_back(ll->Push(VisitExpr(arg))); } - std::vector grad_res; - for (const auto& a : args) { - grad_res.push_back(a->get().reverse); + std::vector orig_args; + for (const auto& arg : args) { + orig_args.push_back(GetField(VisitExpr(arg), 0)); } - return TupleNode::make(grad_res); + Expr orig = CallNode::make(op->op, orig_args, op->attrs, op->type_args); + Var orig_var = ll->Push(orig); + auto ref = ll->Push(RefCreateNode::make(ZerosLike(orig_var))); + auto bpv = ll->Push(RefReadNode::make(bp)); + Expr nbp = FunctionNode::make( + {}, + LetList::With([&](LetList* ll) { + tvm::Array rev = rev_map[op_ref](orig, ll->Push(RefReadNode::make(ref))); + CHECK(args.size() == rev.size()); + for (size_t i = 0; i < args.size(); ++i) { + ll->Push(RefWriteNode::make(GetField(args[i], 1), + Add(ll->Push(RefReadNode::make(GetField(args[i], 1))), + rev[i]))); + } + return CallNode::make(bpv, {}); + }), + TupleTypeNode::make({}), + {}); + ll->Push(RefWriteNode::make(bp, nbp)); + return Pair(orig_var, ref); }); - return Pair(res.foward, grad); - }); - - // if type annotations are provided, we will construct a ret type; - // otherwise, leave it to be inferred - Type ret_type = Type(); - std::vector vt; - bool missing = !f->ret_type.defined(); - for (const auto& p : f->params) { - if (missing || !p->type_annotation.defined()) { - missing = true; - break; } - vt.push_back(p->type_annotation); + return ExprMutator::VisitExpr_(op); + } + + Expr VisitExpr_(const ConstantNode* op) final { + Expr e 
= GetRef(op); + return Pair(e, RefCreateNode::make(ZerosLike(e))); } - if (!missing) { - ret_type = TupleTypeNode::make({f->ret_type, TupleTypeNode::make(vt)}); + Type VisitType(const Type& t) final { + return t.defined() ? ReverseADType()(t) : t; } +}; - return FunctionNode::make(f->params, body, ret_type, {}); +Expr BPEmpty() { + Expr unitF = FunctionNode::make({}, TupleNode::make({}), TupleTypeNode::make({}), {}); + return RefCreateNode::make(unitF); } -TVM_REGISTER_API("relay._ir_pass.first_order_gradient") - .set_body([](TVMArgs args, TVMRetValue* ret) { - CHECK_EQ(args.size(), 2); - *ret = FirstOrderGradient(args[0], args[1]); - }); +Expr Gradient(const Expr& re, const Module& mod) { + auto e = DeGlobal(mod, re); + auto f = e.as(); + CHECK(f) << "input need to be a function"; + CHECK(f->type_params.size() == 0) << "no polymorphism supported for now"; + Expr body = LetList::With([&](LetList* ll) { + Var bp = ll->Push(BPEmpty()); + Expr rev = ReverseAD(bp)(e); + std::vector args; + for (const auto& p : f->params) { + args.push_back(ll->Push(Pair(p, RefCreateNode::make(ZerosLike(p))))); + } + auto c = ll->Push(CallNode::make(rev, args)); + ll->Push(RefWriteNode::make(GetField(c, 1), OnesLike(GetField(c, 0)))); + ll->Push(CallNode::make(RefReadNode::make(bp), {})); + std::vector ret; + for (const auto& a : args) { + ret.push_back(RefReadNode::make(GetField(a, 1))); + } + return Pair(GetField(c, 0), TupleNode::make(ret)); + }); + return FunctionNode::make(f->params, body, GradRetType(GetRef(f)), {}); +} + +TVM_REGISTER_API("relay._ir_pass.gradient") +.set_body([](TVMArgs args, TVMRetValue* ret) { + CHECK_EQ(args.size(), 2); + *ret = Gradient(args[0], args[1]); +}); } // namespace relay } // namespace tvm diff --git a/src/relay/pass/pattern_util.h b/src/relay/pass/pattern_util.h index e59efa958310..96038745474e 100644 --- a/src/relay/pass/pattern_util.h +++ b/src/relay/pass/pattern_util.h @@ -299,12 +299,12 @@ inline Expr Divide(Expr lhs, Expr rhs) { return 
CallNode::make(op, {lhs, rhs}, Attrs(), {}); } -inline Expr ZeroLike(Expr e) { +inline Expr ZerosLike(Expr e) { static const Op& op = Op::Get("zeros_like"); return CallNode::make(op, {e}); } -inline Expr OneLike(Expr e) { +inline Expr OnesLike(Expr e) { static const Op& op = Op::Get("ones_like"); return CallNode::make(op, {e}); } diff --git a/src/relay/pass/type_infer.cc b/src/relay/pass/type_infer.cc index 8dd02f39adce..ea6b9a95da50 100644 --- a/src/relay/pass/type_infer.cc +++ b/src/relay/pass/type_infer.cc @@ -53,7 +53,7 @@ bool TupleGetItemRel(const Array& types, const auto* param = attrs.as(); CHECK(param != nullptr); CHECK_GE(param->index, 0); - CHECK_LT(param->index, data->fields.size()); + CHECK_LT(param->index, data->fields.size()); reporter->Assign(types[1], data->fields[param->index]); return true; } diff --git a/tests/python/relay/test_pass_gradient.py b/tests/python/relay/test_pass_gradient.py index 6b5d0e776934..400941f12617 100644 --- a/tests/python/relay/test_pass_gradient.py +++ b/tests/python/relay/test_pass_gradient.py @@ -2,6 +2,7 @@ from tvm import relay from tvm.relay.ir_pass import free_vars, free_type_vars, gradient from tvm.relay import create_executor +from tvm.relay.prelude import Prelude import numpy as np @@ -123,6 +124,72 @@ def test_broadcast_subtract(): -np.ones_like(expected_forward).sum(axis=(0, 1), keepdims=True).squeeze(axis=0)) +def test_tuple(): + shape = (10, 10) + dtype = 'float32' + t = relay.TensorType(shape, dtype) + x = relay.var("x", t) + y = relay.var("y", t) + z = relay.var("z", t) + tup = relay.Var("tup") + func = relay.Function([x, y, z], relay.Let(tup, relay.Tuple([x, y, z]), + relay.TupleGetItem(tup, 0) + + relay.TupleGetItem(tup, 1) - + relay.TupleGetItem(tup, 2))) + back_func = relay.ir_pass.infer_type(gradient(func)) + assert back_func.checked_type == relay.FuncType([t, t, t], relay.TupleType([t, relay.TupleType([t, t, t])])) + x_nd = rand(dtype, *shape) + y_nd = rand(dtype, *shape) + z_nd = rand(dtype, *shape) 
+ x_np = x_nd.asnumpy() + y_np = y_nd.asnumpy() + z_np = z_nd.asnumpy() + expected_forward = x_np + y_np - z_np + ex = create_executor() + forward, (grad_x, grad_y, grad_z) = ex.evaluate(back_func)(x_nd, y_nd, z_nd) + np.testing.assert_allclose(forward.asnumpy(), expected_forward) + np.testing.assert_allclose(grad_x.asnumpy(), np.ones_like(grad_x.asnumpy())) + np.testing.assert_allclose(grad_y.asnumpy(), np.ones_like(grad_y.asnumpy())) + np.testing.assert_allclose(grad_z.asnumpy(), -1 * np.ones_like(grad_z.asnumpy())) + + +def test_pow(): + mod = relay.Module() + p = Prelude(mod) + shape = (10, 10) + dtype = 'float32' + t = relay.TensorType(shape, dtype) + x = relay.var("x", t) + double = relay.Function([x], x + x) + i = relay.var("i", t) + func = relay.Function([i], relay.Call(p.iterate(double, p.s(p.s(p.s(p.z())))), [i])) + back_func = relay.ir_pass.infer_type(gradient(func, mod=mod), mod=mod) + assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])])) + i_nd = rand(dtype, *shape) + ex = create_executor(mod=mod) + forward, (grad_i,) = ex.evaluate(back_func)(i_nd) + np.testing.assert_allclose(forward.asnumpy(), 8 * i_nd.asnumpy()) + np.testing.assert_allclose(grad_i.asnumpy(), 8 * np.ones_like(grad_i.asnumpy())) + +def test_ref(): + shape = (10, 10) + dtype = 'float32' + t = relay.TensorType(shape, dtype) + x = relay.var("x", t) + r = relay.Var("r") + u = relay.Var("u") + body = relay.RefRead(r) + body = relay.Let(u, relay.RefWrite(r, relay.RefRead(r) + relay.RefRead(r)), body) + body = relay.Let(r, relay.RefCreate(x), body) + func = relay.Function([x], body) + back_func = relay.ir_pass.infer_type(gradient(func)) + assert back_func.checked_type == relay.FuncType([t], relay.TupleType([t, relay.TupleType([t])])) + x_nd = rand(dtype, *shape) + ex = create_executor() + forward, (grad_x,) = ex.evaluate(back_func)(x_nd) + np.testing.assert_allclose(forward.asnumpy(), 2 * x_nd.asnumpy()) + 
np.testing.assert_allclose(grad_x.asnumpy(), 2 * np.ones_like(grad_x.asnumpy())) + if __name__ == "__main__": test_id() test_add() @@ -130,3 +197,6 @@ def test_broadcast_subtract(): test_sub() test_broadcast_add() test_broadcast_subtract() + test_tuple() + test_pow() + test_ref() From f63975f8451e273e23c1c03061019b228c2a37aa Mon Sep 17 00:00:00 2001 From: Andrew Tulloch Date: Mon, 4 Mar 2019 15:20:12 -0800 Subject: [PATCH 64/93] Fix compilation on XCode 10 (#2731) --- src/api/api_arith.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc index cba70370f5b6..a714fe37005b 100644 --- a/src/api/api_arith.cc +++ b/src/api/api_arith.cc @@ -109,7 +109,10 @@ TVM_REGISTER_API("arith._CreateAnalyzer") }); } else if (name == "enter_constraint_context") { return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { - auto ctx = std::make_shared(self.get(), args[0]); + // can't use make_shared due to noexcept(false) decl in destructor, + // see https://stackoverflow.com/a/43907314 + auto ctx = + std::shared_ptr(new ConstraintContext(self.get(), args[0])); auto fexit = [ctx](TVMArgs, TVMRetValue*) mutable { ctx.reset(); }; From 1e2dc64a44aacbefe5fcbb102ccf37ee5a4b6828 Mon Sep 17 00:00:00 2001 From: Marcus Shawcroft Date: Mon, 4 Mar 2019 23:21:13 +0000 Subject: [PATCH 65/93] [DOCKER] Pin pylint==1.9.4 (#2727) --- docker/Dockerfile.ci_lint | 2 +- docker/install/ubuntu_install_python_package.sh | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile.ci_lint b/docker/Dockerfile.ci_lint index 461a5f1f1135..0d7b4a410033 100644 --- a/docker/Dockerfile.ci_lint +++ b/docker/Dockerfile.ci_lint @@ -6,4 +6,4 @@ RUN apt-get update && apt-get install -y sudo wget COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh RUN bash /install/ubuntu_install_python.sh RUN apt-get install -y doxygen graphviz -RUN pip3 install cpplint pylint==2.2.2 mypy +RUN pip3 install cpplint pylint==1.9.4 
mypy diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index c15ff75f260e..200fe6e47781 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -5,5 +5,5 @@ set -u set -o pipefail # install libraries for python package on ubuntu -pip2 install nose pylint==2.2.2 six numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime attrs -pip3 install nose pylint==2.2.2 six numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime attrs +pip2 install nose pylint==1.9.4 six numpy nose-timer cython decorator scipy tornado typing antlr4-python2-runtime attrs +pip3 install nose pylint==1.9.4 six numpy nose-timer cython decorator scipy tornado typed_ast pytest mypy orderedset antlr4-python3-runtime attrs From ccfe87dae3916c1f2eb3928a36e1ca40e0a646de Mon Sep 17 00:00:00 2001 From: Ruslan Baratov Date: Tue, 5 Mar 2019 02:21:36 +0300 Subject: [PATCH 66/93] Docs: pip dependencies for testing (#2728) --- docs/contribute/pull_request.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst index 039ef65c7b13..ec693dc260e5 100644 --- a/docs/contribute/pull_request.rst +++ b/docs/contribute/pull_request.rst @@ -52,6 +52,12 @@ C++ Python ^^^^^^ +Necessary dependencies: + +.. code:: bash + + pip install --user nose Cython + If you want to run all tests: .. 
code:: bash @@ -72,4 +78,4 @@ If you want to run a single test: export PYTHONPATH=python:topi/python rm -rf python/tvm/*.pyc python/tvm/*/*.pyc python/tvm/*/*/*.pyc - TVM_FFI=ctypes python -m nose -v tests/python/unittest/test_pass_storage_rewrite.py \ No newline at end of file + TVM_FFI=ctypes python -m nose -v tests/python/unittest/test_pass_storage_rewrite.py From e4c2fc6a143a396d345946d0e923f86ee5cf5211 Mon Sep 17 00:00:00 2001 From: ziheng Date: Mon, 4 Mar 2019 21:51:09 -0800 Subject: [PATCH 67/93] [COMMUNITY] @sgrechanik-h -> Reviewer (#2732) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 772f4ab18646..a0ab2a0c91a3 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -33,6 +33,7 @@ We do encourage everyone to work anything they are interested in. - [Tianqi Chen](https://github.com/tqchen): @tqchen - [Liangfu Chen](https://github.com/liangfu): @liangfu - [Zhi Chen](https://github.com/zhiics): @zhiics +- [Sergei Grechanik](https://github.com/sgrechanik-h): @sgrechanik-h - [Nick Hynes](https://github.com/nhynes): @nhynes - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - [Yizhi Liu](https://github.com/yzhliu) : @yzhliu From bcce07d014fd2f8acef1657ae6b440f12b152adf Mon Sep 17 00:00:00 2001 From: Martin Boos Date: Tue, 5 Mar 2019 16:48:03 +0100 Subject: [PATCH 68/93] use LLVM linker (#2713) * use LLVM linker * error message improved in case of filenotfound * linting error fixed --- python/tvm/contrib/cc.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py index 0ffa6c420243..ee84da820902 100644 --- a/python/tvm/contrib/cc.py +++ b/python/tvm/contrib/cc.py @@ -85,13 +85,13 @@ def _windows_shared(output, objects, options): cl_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = proc.communicate() except FileNotFoundError: - raise RuntimeError("can not found cl.exe," + raise RuntimeError("Can not find cl.exe," 
"please run this in Vistual Studio Command Prompt.") if proc.returncode != 0: msg = "Compilation error:\n" msg += py_str(out) raise RuntimeError(msg) - link_cmd = ["link"] + link_cmd = ["lld-link"] link_cmd += ["-dll", "-FORCE:MULTIPLE"] for obj in objects: @@ -111,8 +111,11 @@ def _windows_shared(output, objects, options): link_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = proc.communicate() except FileNotFoundError: - raise RuntimeError("can not found link.exe," - "please run this in Vistual Studio Command Prompt.") + raise RuntimeError("Can not find the LLVM linker for Windows (lld-link.exe)." + "Make sure it's installed" + " and the installation directory is in the %PATH% environment " + "variable. Prebuilt binaries can be found at: https://llvm.org/" + "For building the linker on your own see: https://lld.llvm.org/#build") if proc.returncode != 0: msg = "Compilation error:\n" msg += py_str(out) From 6988c4d908bce9ae715f641f252bead1bb2a9f28 Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Wed, 6 Mar 2019 03:33:40 +0800 Subject: [PATCH 69/93] [RELAY][OP] Faster-RCNN Proposal OP (#2725) * [RELAY][OP] Proposal * Fix * Fix test --- include/tvm/relay/attrs/vision.h | 38 +++++++++++++++ python/tvm/relay/frontend/mxnet.py | 16 +++++++ python/tvm/relay/op/vision/_rcnn.py | 28 ++++++++++- python/tvm/relay/op/vision/rcnn.py | 60 ++++++++++++++++++++++++ src/relay/op/vision/rcnn_op.cc | 67 +++++++++++++++++++++++++++ tests/python/relay/test_op_level5.py | 67 +++++++++++++++++++++++++++ topi/tests/python/test_topi_vision.py | 4 +- 7 files changed, 277 insertions(+), 3 deletions(-) diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index 73b7339e2edb..df059a6238e1 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -109,6 +109,44 @@ struct YoloReorgAttrs : public tvm::AttrsNode { } }; +/*! 
\brief Attributes used in proposal operators */ +struct ProposalAttrs : public tvm::AttrsNode { + Array scales; + Array ratios; + int feature_stride; + double threshold; + int rpn_pre_nms_top_n; + int rpn_post_nms_top_n; + int rpn_min_size; + bool iou_loss; + + TVM_DECLARE_ATTRS(ProposalAttrs, "relay.attrs.ProposalAttrs") { + TVM_ATTR_FIELD(scales) + .set_default(Array({4.0f, 8.0f, 16.0f, 32.0f})) + .describe("Used to generate anchor windows by enumerating scales"); + TVM_ATTR_FIELD(ratios) + .set_default(Array({0.5f, 1.0f, 2.0f})) + .describe("Used to generate anchor windows by enumerating ratios"); + TVM_ATTR_FIELD(feature_stride) + .set_default(16) + .describe( + "The size of the receptive field each unit in the convolution layer of the rpn," + "for example the product of all stride's prior to this layer."); + TVM_ATTR_FIELD(threshold) + .set_default(0.7) + .describe( + "IoU threshold of non-maximum suppresion (suppress boxes with IoU >= this threshold)"); + TVM_ATTR_FIELD(rpn_pre_nms_top_n) + .set_default(6000) + .describe("Number of top scoring boxes to apply NMS. 
-1 to use all boxes"); + TVM_ATTR_FIELD(rpn_post_nms_top_n) + .set_default(300) + .describe("Number of top scoring boxes to keep after applying NMS to RPN proposals"); + TVM_ATTR_FIELD(rpn_min_size).set_default(16).describe("Minimum height or width in proposal"); + TVM_ATTR_FIELD(iou_loss).set_default(false).describe("Usage of IoU Loss"); + } +}; + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_VISION_H_ diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 4d341c76043a..69fa5e719f30 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -351,6 +351,20 @@ def _mx_roi_align(inputs, attrs): return _op.vision.roi_align(inputs[0], inputs[1], **new_attrs) +def _mx_proposal(inputs, attrs): + new_attrs = {} + new_attrs["scales"] = attrs.get_float_tuple("scales", (4.0, 8.0, 16.0, 32.0)) + new_attrs["ratios"] = attrs.get_float_tuple("ratios", (0.5, 1.0, 2.0)) + new_attrs["feature_stride"] = attrs.get_int("feature_stride", 16) + new_attrs["threshold"] = attrs.get_float("threshold", 0.7) + new_attrs["rpn_pre_nms_top_n"] = attrs.get_int("rpn_pre_nms_top_n", 6000) + new_attrs["rpn_post_nms_top_n"] = attrs.get_int("rpn_post_nms_top_n", 300) + new_attrs["rpn_min_size"] = attrs.get_int("rpn_min_size", 16) + new_attrs["iou_loss"] = attrs.get_bool("iou_loss", False) + assert not attrs.get_bool("output_score", False), "proposal doesn't support output score" + return _op.vision.proposal(inputs[0], inputs[1], inputs[2], **new_attrs) + + # Note: due to attribute conversion constraint # ops in the identity set must be attribute free _identity_list = [ @@ -466,6 +480,8 @@ def _mx_roi_align(inputs, attrs): "_contrib_MultiBoxPrior" : _mx_multibox_prior, "_contrib_MultiBoxDetection" : _mx_multibox_detection, "_contrib_ROIAlign" : _mx_roi_align, + "_contrib_Proposal" : _mx_proposal, + "_contrib_MultiProposal" : _mx_proposal, # List of missing operators that are present in NNVMv1 # TODO(tvm-tvm): 
support all operators. # diff --git a/python/tvm/relay/op/vision/_rcnn.py b/python/tvm/relay/op/vision/_rcnn.py index 2617bf8562b9..9606ee64c7be 100644 --- a/python/tvm/relay/op/vision/_rcnn.py +++ b/python/tvm/relay/op/vision/_rcnn.py @@ -1,7 +1,7 @@ # pylint: disable=invalid-name, unused-argument """Faster R-CNN and Mask R-CNN operations.""" import topi -from topi.util import get_const_tuple +from topi.util import get_const_tuple, get_float_tuple, get_const_int from .. import op as reg from ..op import OpPattern @@ -21,3 +21,29 @@ def schedule_roi_align(_, outs, target): return topi.generic.vision.schedule_roi_align(outs) reg.register_pattern("vision.roi_align", OpPattern.OUT_ELEMWISE_FUSABLE) + +@reg.register_compute("vision.proposal") +def compute_proposal(attrs, inputs, _, target): + """Compute definition of proposal""" + scales = get_float_tuple(attrs.scales) + ratios = get_float_tuple(attrs.ratios) + feature_stride = attrs.feature_stride + threshold = attrs.threshold + rpn_pre_nms_top_n = attrs.rpn_pre_nms_top_n + rpn_post_nms_top_n = attrs.rpn_post_nms_top_n + rpn_min_size = attrs.rpn_min_size + iou_loss = bool(get_const_int(attrs.iou_loss)) + with target: + return [ + topi.vision.rcnn.proposal(inputs[0], inputs[1], inputs[2], scales, ratios, + feature_stride, threshold, rpn_pre_nms_top_n, + rpn_post_nms_top_n, rpn_min_size, iou_loss) + ] + +@reg.register_schedule("vision.proposal") +def schedule_proposal(_, outs, target): + """Schedule definition of proposal""" + with target: + return topi.generic.schedule_proposal(outs) + +reg.register_pattern("vision.proposal", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/rcnn.py b/python/tvm/relay/op/vision/rcnn.py index 8bbafbe75c53..8e95435d0ecc 100644 --- a/python/tvm/relay/op/vision/rcnn.py +++ b/python/tvm/relay/op/vision/rcnn.py @@ -30,3 +30,63 @@ def roi_align(data, rois, pooled_size, spatial_scale, sample_ratio=-1, layout='N 4-D tensor with shape [num_roi, channel, pooled_size, pooled_size] """ 
return _make.roi_align(data, rois, pooled_size, spatial_scale, sample_ratio, layout) + + +def proposal(cls_prob, + bbox_pred, + im_info, + scales, + ratios, + feature_stride, + threshold, + rpn_pre_nms_top_n, + rpn_post_nms_top_n, + rpn_min_size, + iou_loss): + """Proposal operator. + + Parameters + ---------- + cls_prob : relay.Expr + 4-D tensor with shape [batch, 2 * num_anchors, height, width]. + + bbox_pred : relay.Expr + 4-D tensor with shape [batch, 4 * num_anchors, height, width]. + + im_info : relay.Expr + 2-D tensor with shape [batch, 3]. The last dimension should be in format of + [im_height, im_width, im_scale] + + scales : list/tuple of float + Scales of anchor windoes. + + ratios : list/tuple of float + Ratios of anchor windoes. + + feature_stride : int + The size of the receptive field each unit in the convolution layer of the rpn, for example + the product of all stride's prior to this layer. + + threshold : float + Non-maximum suppression threshold. + + rpn_pre_nms_top_n : int + Number of top scoring boxes to apply NMS. -1 to use all boxes. + + rpn_post_nms_top_n : int + Number of top scoring boxes to keep after applying NMS to RPN proposals. + + rpn_min_size : int + Minimum height or width in proposal. + + iou_loss : bool + Usage of IoU loss. + + Returns + ------- + output : relay.Expr + 2-D tensor with shape [batch * rpn_post_nms_top_n, 5]. The last dimension is in format of + [batch_index, w_start, h_start, w_end, h_end]. 
+ """ + return _make.proposal(cls_prob, bbox_pred, im_info, scales, ratios, feature_stride, threshold, + rpn_pre_nms_top_n, rpn_post_nms_top_n, rpn_min_size, iou_loss) diff --git a/src/relay/op/vision/rcnn_op.cc b/src/relay/op/vision/rcnn_op.cc index e46eaf2207fb..6dbc76599708 100644 --- a/src/relay/op/vision/rcnn_op.cc +++ b/src/relay/op/vision/rcnn_op.cc @@ -63,5 +63,72 @@ RELAY_REGISTER_OP("vision.roi_align") .set_support_level(5) .add_type_rel("ROIAlign", ROIAlignRel); +TVM_REGISTER_NODE_TYPE(ProposalAttrs); + +bool ProposalRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + auto proposal_attrs = attrs.as(); + CHECK_EQ(types.size(), 4); + const auto* cls_prob = types[0].as(); + const auto* bbox_pred = types[1].as(); + const auto* im_info = types[2].as(); + + if (!cls_prob || !bbox_pred || !im_info) { + return false; + } + + CHECK_EQ(cls_prob->shape.size(), 4U) + << "The dimension of class probability should be 4, but received " << cls_prob->shape.size(); + CHECK_EQ(bbox_pred->shape.size(), 4U) + << "The dimension of box prediction should be 4, but received " << bbox_pred->shape.size(); + CHECK_EQ(im_info->shape.size(), 2U) + << "The dimension of image info should be 2, but received " << im_info->shape.size(); + CHECK(reporter->AssertEQ(im_info->shape[1], 3)); + + auto batch = cls_prob->shape[0]; + + std::vector oshape( + {batch * proposal_attrs->rpn_post_nms_top_n, 5}); + reporter->Assign(types[3], TensorTypeNode::make(oshape, cls_prob->dtype)); + return true; +} + +Expr MakeProposal(Expr cls_prob, Expr bbox_pred, Expr im_info, Array scales, + Array ratios, int feature_stride, double threshold, + int rpn_pre_nms_top_n, int rpn_post_nms_top_n, int rpn_min_size, + bool iou_loss) { + auto attrs = make_node(); + attrs->scales = scales; + attrs->ratios = ratios; + attrs->feature_stride = feature_stride; + attrs->threshold = threshold; + attrs->rpn_pre_nms_top_n = rpn_pre_nms_top_n; + attrs->rpn_post_nms_top_n = 
rpn_post_nms_top_n; + attrs->rpn_min_size = rpn_min_size; + attrs->iou_loss = iou_loss; + static const Op& op = Op::Get("vision.proposal"); + return CallNode::make(op, {cls_prob, bbox_pred, im_info}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op.vision._make.proposal") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeProposal, args, rv); + }); + +RELAY_REGISTER_OP("vision.proposal") + .describe(R"code(Generate region proposals via RPN. + + - **cls_prob**: 4-D with shape [batch, 2 * num_anchors, height, width]. + - **bbox_pred**: 4-D with shape [batch, 4 * num_anchors, height, width]. + - **im_info**: 2-D with shape [batch, 3]. + - **out**: 2-D with shape [batch * rpn_post_nms_top_n, 5]. + )code" TVM_ADD_FILELINE) +.set_num_inputs(3) +.add_argument("cls_prob", "Tensor", "Score of how likely proposal is object") +.add_argument("bbox_pred", "Tensor", "BBox predicted deltas from anchors for proposals") +.add_argument("im_info", "Tensor", "Image size and scale") +.set_support_level(5) +.add_type_rel("Proposal", ProposalRel); + } // namespace relay } // namespace tvm diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 8db6d747ef5e..003318f01a2f 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -306,6 +306,72 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ verify_roi_align((4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2) +def test_proposal(): + def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): + cls_prob = relay.var("cls_prob", relay.ty.TensorType(np_cls_prob.shape, "float32")) + bbox_pred = relay.var("bbox_pred", relay.ty.TensorType(np_bbox_pred.shape, "float32")) + im_info = relay.var("im_info", relay.ty.TensorType(np_im_info.shape, "float32")) + z = relay.vision.proposal(cls_prob, bbox_pred, im_info, **attrs) + zz = relay.ir_pass.infer_type(z) + + 
assert zz.checked_type == relay.ty.TensorType(np_out.shape, "float32") + + func = relay.Function([cls_prob, bbox_pred, im_info], z) + func = relay.ir_pass.infer_type(func) + for target in ['cuda']: + if not tvm.module.enabled(target): + print("Skip test because %s is not enabled." % target) + continue + ctx = tvm.context(target, 0) + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(np_cls_prob, np_bbox_pred, np_im_info) + tvm.testing.assert_allclose(op_res1.asnumpy(), np_out, rtol=1e-4) + intrp2 = relay.create_executor("debug", ctx=ctx, target=target) + op_res2 = intrp2.evaluate(func)(np_cls_prob, np_bbox_pred, np_im_info) + tvm.testing.assert_allclose(op_res2.asnumpy(), np_out, rtol=1e-4) + + attrs = { + 'scales': (0.5,), + 'ratios': (0.5,), + 'feature_stride': 16, + 'iou_loss': False, + 'rpn_min_size': 16, + 'threshold': 0.7, + 'rpn_pre_nms_top_n': 200, + 'rpn_post_nms_top_n': 4, + } + + np_cls_prob = np.array([[ + [[0.3, 0.6, 0.2], [0.4, 0.7, 0.5], [0.1, 0.4, 0.3]], + [[0.7, 0.5, 0.3], [0.6, 0.4, 0.8], [0.9, 0.2, 0.5]] + ]], dtype='float32') + np_bbox_pred = np.array([[ + [[0.5, 1.0, 0.6], [0.8, 1.2, 2.0], [0.9, 1.0, 0.8]], + [[0.5, 1.0, 0.7], [0.8, 1.2, 1.6], [2.1, 1.5, 0.7]], + [[1.0, 0.5, 0.7], [1.5, 0.9, 1.6], [1.4, 1.5, 0.8]], + [[1.0, 0.5, 0.6], [1.5, 0.9, 2.0], [1.8, 1.0, 0.9]], + ]], dtype='float32') + np_im_info = np.array([[48., 48., 1.]], dtype='float32') + np_out = np.array([ + [0., 0., 2.8451548,28.38012, 18.154846], + [0., 0., 15.354933, 41.96971, 41.245064], + [0., 18.019852, 1.0538368, 51.98015, 25.946163], + [0., 27.320923, -1.266357, 55., 24.666357] + ], dtype='float32') + + + verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs) + + np_out = np.array([ + [ 0., -5.25, -2.5, 21.75, 19.], + [ 0., 11.25, -2., 37.25, 18.5], + [ 0., 26.849998, -2.3000002, 53.45, 18.6], + [ 0., -4.95, 13.799999, 22.25, 35.5] + ], dtype='float32') + attrs['iou_loss'] = True + verify_proposal(np_cls_prob, 
np_bbox_pred, np_im_info, np_out, attrs) + + def test_yolo_reorg_infer_shape(): def verify_yolo_reorg(shape, stride, out_shape): x = relay.var("x", relay.TensorType(shape, "float32")) @@ -347,5 +413,6 @@ def verify_yolo_reorg(shape, stride): test_multibox_transform_loc() test_nms() test_roi_align() + test_proposal() test_yolo_reorg_infer_shape() test_yolo_reorg() diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 135b3857df31..3c0c3aa854d7 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -210,7 +210,7 @@ def test_roi_align(): def verify_proposal(np_cls_prob, np_bbox_pred, np_im_info, np_out, attrs): cls_prob = tvm.placeholder(np_cls_prob.shape) bbox_pred = tvm.placeholder(np_bbox_pred.shape) - im_info = tvm.placeholder(np_im_info.shape, dtype='int32') + im_info = tvm.placeholder(np_im_info.shape) def check_device(device): ctx = tvm.context(device, 0) @@ -252,7 +252,7 @@ def test_proposal(): [[1.0, 0.5, 0.7], [1.5, 0.9, 1.6], [1.4, 1.5, 0.8]], [[1.0, 0.5, 0.6], [1.5, 0.9, 2.0], [1.8, 1.0, 0.9]], ]], dtype='float32') - np_im_info = np.array([[48, 48, 1]], dtype='int32') + np_im_info = np.array([[48., 48., 1.]], dtype='float32') np_out = np.array([ [0., 0., 2.8451548,28.38012, 18.154846], [0., 0., 15.354933, 41.96971, 41.245064], From 638e7e65f0e990ea4d1a351cf49cda7aec93c5f9 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Tue, 5 Mar 2019 15:07:26 -0800 Subject: [PATCH 70/93] [Relay][Frontend][Bugfix] Fix bug in converting slice_axis when axis is negative (#2739) * bug fix * trigger ci --- python/tvm/relay/frontend/mxnet.py | 3 +++ tests/python/frontend/mxnet/test_forward.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 69fa5e719f30..e7cf1289c8ed 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -200,6 +200,9 @@ def 
_mx_slice_axis(inputs, attrs): axis = attrs.get_int("axis") ax_beg = attrs.get_int("begin") ax_end = attrs.get_str("end") + if axis < 0: + axis += len(shape) + assert axis >= 0 and axis < len(shape) if ax_end == "None": ax_end = int(shape[axis]) else: diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 7f53aa8a0155..74a87e29a0c0 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -352,6 +352,7 @@ def verify(shape, axis, begin, end): verify((3, 4), 0, 1, None) verify((3, 4), 1, 0, 2) verify((3, 4), 1, -3, -1) + verify((3, 4), -1, -3, -1) if __name__ == '__main__': @@ -380,4 +381,4 @@ def verify(shape, axis, begin, end): test_forward_broadcast_ops() test_forward_elemwise_ops() test_forward_scalar_ops() - test_forward_slice_axis() \ No newline at end of file + test_forward_slice_axis() From 8b990568d4b73159d4efa89bfcb685985f8e7983 Mon Sep 17 00:00:00 2001 From: ziheng Date: Tue, 5 Mar 2019 18:22:16 -0800 Subject: [PATCH 71/93] [VERSION] Update to 0.6.dev (#2736) --- conda/nnvm/meta.yaml | 2 +- conda/topi/meta.yaml | 2 +- conda/tvm-libs/meta.yaml | 2 +- conda/tvm/meta.yaml | 2 +- include/tvm/runtime/c_runtime_api.h | 2 +- python/tvm/_ffi/libinfo.py | 2 +- version.py | 2 +- web/tvm_runtime.js | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml index 7162fdc8391f..7a8b4aab9f2b 100644 --- a/conda/nnvm/meta.yaml +++ b/conda/nnvm/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.5.dev" %} +{% set version = "0.6.dev" %} package: name: nnvm diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml index 22a1f3579868..a3133b6c39cc 100644 --- a/conda/topi/meta.yaml +++ b/conda/topi/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.5.dev" %} +{% set version = "0.6.dev" %} package: name: topi diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml index 6a2f0ff75f38..15c7de9563ad 100644 --- 
a/conda/tvm-libs/meta.yaml +++ b/conda/tvm-libs/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.5.dev" %} +{% set version = "0.6.dev" %} package: name: tvm-libs diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml index b4b93471821a..d60c743b87eb 100644 --- a/conda/tvm/meta.yaml +++ b/conda/tvm/meta.yaml @@ -1,4 +1,4 @@ -{% set version = "0.5.dev" %} +{% set version = "0.6.dev" %} package: name: tvm diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index b493cf6dc8da..1a1a8da67aed 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -43,7 +43,7 @@ #endif // TVM version -#define TVM_VERSION "0.5.dev" +#define TVM_VERSION "0.6.dev" // TVM Runtime is DLPack compatible. diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py index 6ad2e06939b1..9ef0f498a7a4 100644 --- a/python/tvm/_ffi/libinfo.py +++ b/python/tvm/_ffi/libinfo.py @@ -163,4 +163,4 @@ def find_include_path(name=None, search_path=None, optional=False): # We use the version of the incoming release for code # that is under development. # The following line is set by tvm/python/update_version.py -__version__ = "0.5.dev" +__version__ = "0.6.dev" diff --git a/version.py b/version.py index acdc3f435798..b0c0b2af109e 100644 --- a/version.py +++ b/version.py @@ -16,7 +16,7 @@ # current version # We use the version of the incoming release for code # that is under development -__version__ = "0.5" +__version__ = "0.6.dev" # Implementations def update(file_name, pattern, repl): diff --git a/web/tvm_runtime.js b/web/tvm_runtime.js index 2eab15093b72..fe303d57b0c6 100644 --- a/web/tvm_runtime.js +++ b/web/tvm_runtime.js @@ -2,7 +2,7 @@ * TVM Javascript web runtime library. 
* * @projectname tvm - * @version 0.5.dev + * @version 0.6.dev */ /* eslint no-unused-vars: "off" */ /* eslint no-unexpected-multiline: "off" */ From 29e0d2d0ee03de26b40d4f31b48d62ccc5ef1ba8 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Tue, 5 Mar 2019 19:42:32 -0800 Subject: [PATCH 72/93] [Relay][TOPI][OP] intel_graphics conv2d alterlayout support relay, added stack op (#2729) * add stack op frontend * concate moved * topi stack added * stack added * fix stack bugs and tested * conv2d alterlayout udpated for relay * fix pylint * fix cmake warnings * cmake warnings fixed --- docs/api/python/topi.rst | 2 + docs/langref/relay_op.rst | 2 + include/tvm/relay/attrs/transform.h | 9 ++ python/tvm/relay/frontend/mxnet.py | 6 ++ python/tvm/relay/op/_transform.py | 11 +-- python/tvm/relay/op/transform.py | 22 +++++ src/relay/op/tensor/transform.cc | 100 +++++++++++++++++++++- topi/include/topi/transform.h | 50 +++++++++++ topi/python/topi/intel_graphics/conv2d.py | 16 ++-- topi/python/topi/transform.py | 19 ++++ topi/src/topi.cc | 5 ++ topi/tests/python/test_topi_transform.py | 36 +++++++- 12 files changed, 259 insertions(+), 19 deletions(-) diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index d2f9f01fcf52..f0fc78909258 100644 --- a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -72,6 +72,7 @@ List of operators topi.logical_or topi.logical_not topi.arange + topi.stack topi.layout_transform topi.image.resize @@ -130,6 +131,7 @@ topi .. autofunction:: topi.greater .. autofunction:: topi.less .. autofunction:: topi.arange +.. autofunction:: topi.stack .. autofunction:: topi.layout_transform topi.nn diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index 7958d6cbe553..f706be08009d 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -96,6 +96,7 @@ This level enables additional math and transform operators. 
tvm.relay.cast tvm.relay.split tvm.relay.arange + tvm.relay.stack **Level 4: Broadcast and Reductions** @@ -220,6 +221,7 @@ Level 3 Definitions .. autofunction:: tvm.relay.cast .. autofunction:: tvm.relay.split .. autofunction:: tvm.relay.arange +.. autofunction:: tvm.relay.stack Level 4 Definitions diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index d76bfceb59e8..fea2c960d032 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -115,6 +115,15 @@ struct ArangeAttrs : public tvm::AttrsNode { } }; // struct ArangeAttrs +/*! \brief Attributes used in stack operators */ +struct StackAttrs : public tvm::AttrsNode { + Integer axis; + TVM_DECLARE_ATTRS(StackAttrs, "relay.attrs.StackAttrs") { + TVM_ATTR_FIELD(axis).set_default(0) + .describe("The axis in the result array along which the input arrays are stacked."); + } +}; // struct StackAttrs + /*! \brief Attributes used in squeeze operators */ struct SqueezeAttrs : public tvm::AttrsNode { // use axis to make the name numpy compatible. 
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index e7cf1289c8ed..45329e1b3fe5 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -253,6 +253,11 @@ def _mx_concat(inputs, attrs): return _op.concatenate(tuple(inputs), axis=axis) +def _mx_stack(inputs, attrs): + axis = attrs.get_int("axis", 0) + return _op.stack(tuple(inputs), axis=axis) + + def _mx_expand_dims(inputs, attrs): axis = attrs.get_int("axis") return _op.expand_dims(inputs[0], axis=axis) @@ -474,6 +479,7 @@ def _mx_proposal(inputs, attrs): "expand_dims" : _mx_expand_dims, "Concat" : _mx_concat, "concat" : _mx_concat, + "stack" : _mx_stack, "batch_dot" : _mx_batch_dot, "LeakyReLU" : _mx_leaky_relu, "_arange" : _mx_arange, diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index b8c00b90d40e..1389f96b8325 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -1,7 +1,6 @@ """Backend compiler related feature registration""" # pylint: disable=invalid-name,unused-argument from __future__ import absolute_import -import topi from . 
import op as _reg from ._reduce import _schedule_reduce from .op import schedule_injective, OpPattern @@ -27,16 +26,10 @@ _reg.register_schedule("take", schedule_injective) _reg.register_schedule("transpose", schedule_injective) _reg.register_schedule("where", schedule_broadcast) +_reg.register_schedule("stack", schedule_injective) +_reg.register_schedule("concatenate", schedule_injective) _reg.register_schedule("_contrib_reverse_reshape", schedule_injective) # layout_transform _reg.register_schedule("layout_transform", schedule_injective) _reg.register_pattern("layout_transform", OpPattern.INJECTIVE) - -# concatenate -@_reg.register_compute("concatenate") -def concatenate_compute(attrs, inputs, output_type, target): - return [topi.concatenate(inputs, axis=attrs.axis)] - -_reg.register_schedule("concatenate", schedule_injective) -_reg.register_pattern("concatenate", OpPattern.INJECTIVE) diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index cf1ae0573716..845ee02b0582 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -294,6 +294,28 @@ def arange(start, stop=None, step=1, dtype="float32"): return _make.arange(start, stop, step, dtype) +def stack(data, axis): + """Join a sequence of arrays along a new axis. + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + axis : int + The axis in the result array along which the input arrays are stacked. + + .. note:: + Each array in the input array sequence must have the same shape. + + Returns + ------- + ret : relay.Expr + The computed result. + """ + return _make.stack(data, axis) + + def where(condition, x, y): """Selecting elements from either x or y depending on the value of the condition. 
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 55892e5c73a1..de3ac03977f4 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -206,6 +206,15 @@ bool ConcatenateRel(const Array& types, return true; } +Array ConcatenateCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const ConcatenateAttrs *param = attrs.as(); + CHECK(param != nullptr); + return { topi::concatenate(inputs, param->axis) }; +} + Array> ConcatenateLayout( const Attrs& attrs, const Array& new_in_layouts, @@ -268,7 +277,96 @@ RELAY_REGISTER_OP("concatenate") .add_argument("data", "Tensor", "The input list of tensors.") .set_support_level(1) .add_type_rel("Concatenate", ConcatenateRel) -.set_attr("FInferCorrectLayout", ConcatenateLayout); +.set_attr("FInferCorrectLayout", ConcatenateLayout) +.set_attr("FTVMCompute", ConcatenateCompute) +.set_attr("TOpPattern", kInjective); + +TVM_REGISTER_NODE_TYPE(StackAttrs); + +bool StackRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + // types: [data, result] + CHECK_EQ(types.size(), 2); + const auto* tensor_tuple = types[0].as(); + if (tensor_tuple == nullptr) { + CHECK(types[0].as()) + << "cast: expect input type to be TupleType but get " + << types[0]; + return false; + } + const auto* param = attrs.as(); + const auto& first = Downcast(tensor_tuple->fields[0]); + // Sanity check: ndim and dtype. 
+ const int ndim = static_cast(first->shape.size()); + const DataType dtype = first->dtype; + for (const Type& ele : tensor_tuple->fields) { + const auto& e = Downcast(ele); + int e_ndim = static_cast(e->shape.size()); + const DataType& e_dtype = e->dtype; + CHECK_EQ(e_ndim, ndim) << "relay.stack requires all tensors have the same ndim"; + CHECK_EQ(e_dtype, dtype) << "relay.stack requires all tensors have the same dtype"; + } + // Sanity check: axis + int axis = param->axis; + CHECK(-ndim <= axis && axis < ndim) + << "stack only accepts `axis` in [-ndim, ndim)" + << ", but got axis = " << axis + << ", and ndim = " << ndim; + axis = axis < 0 ? ndim + axis + 1 : axis; + // Calculate shape + std::vector oshape; + oshape.reserve(ndim + 1); + const int stack_dim = static_cast(tensor_tuple->fields.size()); + for (int i = 0; i < axis; ++i) { + oshape.emplace_back(first->shape[i]); + } + oshape.emplace_back(stack_dim); + for (int i = axis; i < ndim; ++i) { + oshape.emplace_back(first->shape[i]); + } + reporter->Assign(types[1], TensorTypeNode::make(oshape, dtype)); + return true; +} + +Array StackCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const StackAttrs *param = attrs.as(); + CHECK(param != nullptr); + return { topi::stack(inputs, param->axis) }; +} + +Expr MakeStack(Expr data, + int axis) { + auto attrs = make_node(); + attrs->axis = axis; + static const Op& op = Op::Get("stack"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op._make.stack") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeStack, args, rv); +}); + +RELAY_REGISTER_OP("stack") +.describe(R"code(Stack the input tensors along the given axis. + +- **data** : A list of tensors. + +- **axis** : The axis along which the tensors are stacked. 
+ +)code" TVM_ADD_FILELINE) +.set_attrs_type_key("relay.attrs.StackAttrs") +.set_num_inputs(1) +.add_argument("data", "Tensor", "The input list of tensors.") +.set_support_level(1) +.add_type_rel("Stack", StackRel) +.set_attr("FTVMCompute", StackCompute) +.set_attr("TOpPattern", kInjective); /* relay.transpose */ TVM_REGISTER_NODE_TYPE(TransposeAttrs); diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 24ebe5de4a20..fc686f88dba6 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -323,6 +323,56 @@ inline Tensor concatenate(const Array& inputs, }, name, tag); } +/*! +* \brief Join a sequence of tensors along a new axis. +* +* \param inputs The input tensors +* \param axis The axis along which the tensors will be stacked +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the stack operation +*/ +inline Tensor stack(const Array& inputs, + int axis = 0, + std::string name = "tensor", + std::string tag = kInjective) { + int ndim = static_cast(inputs[0]->shape.size()); + CHECK(-ndim - 1 <= axis && axis <= ndim) + << "stack only accepts `axis` in [-ndim, ndim)" + << ", but got axis = " << axis + << ", and ndim = " << ndim; + if (axis < 0) { + axis += ndim + 1; + } + CHECK_LT(axis, inputs[0]->shape.size() + 1) << + "axis out of bounds"; + + const int stack_size = static_cast(inputs.size()); + Array out_shape; + for (size_t i = 0; i < static_cast(axis); ++i) + out_shape.push_back(inputs[0]->shape[i]); + out_shape.push_back(stack_size); + for (size_t i = static_cast(axis); i < static_cast(ndim); ++i) + out_shape.push_back(inputs[0]->shape[i]); + + return compute( + out_shape, [&](const Array& indices) { + Array idx; + for (size_t i = 0; i < indices.size(); ++i) + if (i != static_cast(axis)) + idx.push_back(indices[i]); + auto ind = indices[axis]; + auto ret = inputs[0](idx); + for (int i = 0; i < static_cast(inputs.size() - 1); ++i) { + ret 
= tvm::if_then_else(ind == i + 1, + inputs[i + 1](idx), + ret); + } + return ret; + }, name, tag); +} + /*! * \brief Split a tensor into multiple sub-tensors * diff --git a/topi/python/topi/intel_graphics/conv2d.py b/topi/python/topi/intel_graphics/conv2d.py index e5a4983b455e..554deaad35a3 100644 --- a/topi/python/topi/intel_graphics/conv2d.py +++ b/topi/python/topi/intel_graphics/conv2d.py @@ -3,7 +3,6 @@ from __future__ import absolute_import as _abs -import warnings import tvm from .. import generic @@ -40,10 +39,6 @@ def tile_and_bind3d(s, tensor, z, y, x, z_factor=2, y_factor=None, x_factor=None @conv2d_alter_layout.register(["intel_graphics"]) def _alter_conv2d_layout(attrs, inputs, tinfos, F): import nnvm.symbol as sym - if F != sym: - warnings.warn("Only support alter layout for intel graphics in NNVM now. " - "This pass is ignored in relay.") - return None copy_inputs = [s for s in inputs] @@ -51,8 +46,8 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): kernel = tinfos[1] import ast - padding = ast.literal_eval(attrs['padding']) - stride = ast.literal_eval(attrs['strides']) + padding = ast.literal_eval(str(attrs['padding'])) + stride = ast.literal_eval(str(attrs['strides'])) wkl = _get_workload(data, kernel, stride, padding, data.dtype) oc_bn = 1 @@ -69,7 +64,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F): if "target" in new_attrs: del new_attrs["target"] - return sym.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) + if F == sym: + out = F.contrib.conv2d_NCHWc(*copy_inputs, **new_attrs) + else: + out = F.nn.contrib_conv2d_nchwc(*copy_inputs, **new_attrs) + + return out @conv2d_NCHWc.register(["intel_graphics"]) def _decl_conv2d(data, kernel, stride, padding, dilation, layout, out_layout, out_dtype='float32'): diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py index e3ab0b364c65..2ddfee2806a5 100644 --- a/topi/python/topi/transform.py +++ b/topi/python/topi/transform.py @@ -191,6 +191,25 @@ def concatenate(a_tuple, 
axis=0): return cpp.concatenate(a_tuple, axis) +def stack(a, axis): + """Repeats the whole array multiple times. + + Parameters + ---------- + a : tvm.Tensor + The tensor to be stacked. + + axis : int, optional + The axis in the result array along which the input arrays are stacked. + + + Returns + ------- + ret : tvm.Tensor + """ + return cpp.stack(a, axis) + + def split(ary, indices_or_sections, axis=0): """Split an array into multiple sub-arrays. diff --git a/topi/src/topi.cc b/topi/src/topi.cc index e566a5d510ee..3630c4cf3b85 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -266,6 +266,11 @@ TVM_REGISTER_GLOBAL("topi.concatenate") *rv = concatenate(args[0], args[1]); }); +TVM_REGISTER_GLOBAL("topi.stack") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = stack(args[0], args[1]); +}); + TVM_REGISTER_GLOBAL("topi.split") .set_body([](TVMArgs args, TVMRetValue *rv) { if (args[1].type_code() == kDLInt || args[1].type_code() == kDLUInt) { diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index 31e37d4d26f2..66c75854193f 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -124,6 +124,31 @@ def check_device(device): for device in get_all_backend(): check_device(device) +def verify_stack(shapes, axis): + tensor_l = [] + for i, shape in enumerate(shapes): + tensor_l.append(tvm.placeholder(shape, name="A" + str(i))) + out_tensor = topi.stack(tensor_l, axis) + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_broadcast(out_tensor) + + foo = tvm.build(s, tensor_l + [out_tensor], device, name="stack") + data_npys = [np.random.normal(size=shape).astype(tensor_l[0].dtype) for shape in shapes] + out_npy = np.stack(data_npys, axis=axis) + data_nds = [tvm.nd.array(data_npy, ctx) for 
data_npy in data_npys] + out_nd = tvm.nd.empty(out_npy.shape, ctx=ctx, dtype=out_tensor.dtype) + foo(*(data_nds + [out_nd])) + tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in get_all_backend(): + check_device(device) + def verify_split(src_shape, indices_or_sections, axis): A = tvm.placeholder(shape=src_shape, name="A") @@ -383,7 +408,7 @@ def test_squeeze(): def test_concatenate(): - verify_concatenate([(2,), (2,), (2,)], 0) + verify_concatenate([(2,), (2,), (2,)], -1) verify_concatenate([(2, 3, 4), (2, 2, 4), (2, 5, 4)], 1) verify_concatenate([(1, 2, 4), (1, 2, 3), (1, 2, 7), (1, 2, 8), (1, 2, 1)], -1) verify_concatenate([(5, 6, 7, 3), @@ -393,6 +418,14 @@ def test_concatenate(): (2, 6, 7, 3)], 0) +def test_stack(): + verify_stack([(2,), (2,), (2,)], -1) + verify_stack([(2,), (2,), (2,)], 1) + verify_stack([(2,), (2,), (2,)], 0) + verify_stack([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1) + verify_stack([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1) + + def test_split(): verify_split((2, 12, 3), 3, 1) verify_split((2, 12, 3), [2, 4], 1) @@ -480,6 +513,7 @@ def check_device(device): if __name__ == "__main__": test_strided_slice() test_concatenate() + test_stack() test_tranpose() test_expand_dims() test_reshape() From 8d1032fa415611d6747f3887f07612768ac92179 Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Thu, 7 Mar 2019 13:31:50 +0900 Subject: [PATCH 73/93] [RUNTIME][OPENCL] clFinish before releasing memory (#2737) --- src/runtime/opencl/opencl_device_api.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 6bb0948bca91..38edcd1967cc 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -110,6 +110,10 @@ void* OpenCLWorkspace::AllocDataSpace( } void OpenCLWorkspace::FreeDataSpace(TVMContext ctx, void* ptr) { + // We have to make sure that the memory object is not in the command queue + // 
for some OpenCL platforms. + OPENCL_CALL(clFinish(this->GetQueue(ctx))); + cl_mem mptr = static_cast(ptr); OPENCL_CALL(clReleaseMemObject(mptr)); } From 17100df4de5e03634b33353bc947be06f42eaf81 Mon Sep 17 00:00:00 2001 From: Haichen Shen Date: Thu, 7 Mar 2019 14:45:30 -0800 Subject: [PATCH 74/93] [Bugfix][Relay][Frontend] Fix bug in mxnet converter for slick_like (#2744) * Fix bug in mxnet converter for slick_like * More tolerance for topi_conv2d_NCHWc --- python/tvm/relay/frontend/mxnet.py | 9 +++++++- tests/python/frontend/mxnet/test_forward.py | 23 ++++++++++++++++++++- topi/tests/python/test_topi_conv2d_NCHWc.py | 4 ++-- 3 files changed, 32 insertions(+), 4 deletions(-) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 45329e1b3fe5..2e0ccd07fdc1 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -194,6 +194,13 @@ def _mx_slice(inputs, attrs): return _op.strided_slice(inputs[0], **new_attrs) +def _mx_slice_like(inputs, attrs): + assert len(inputs) == 2 + new_attrs = {} + new_attrs["axes"] = attrs.get_int_tuple("axes", None) + return _op.slice_like(*inputs, **new_attrs) + + def _mx_slice_axis(inputs, attrs): assert len(inputs) == 1 shape = ir_pass.infer_type(inputs[0]).checked_type.shape @@ -383,7 +390,6 @@ def _mx_proposal(inputs, attrs): "exp", "negative", "reshape_like", - "slice_like", "zeros_like", "ones_like", "where", @@ -473,6 +479,7 @@ def _mx_proposal(inputs, attrs): "BatchNorm_v1" : _mx_batch_norm, "LRN" : _mx_lrn, "slice" : _mx_slice, + "slice_like" : _mx_slice_like, "slice_axis" : _mx_slice_axis, "SliceChannel" : _mx_split, "split" : _mx_split, diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 74a87e29a0c0..2dfe20c503e6 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -336,7 +336,6 @@ def test_forward_scalar_ops(): op_res = 
intrp.evaluate(new_sym)(a_np) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) - def test_forward_slice_axis(): def verify(shape, axis, begin, end): data_np = np.random.uniform(size=shape).astype("float32") @@ -354,6 +353,27 @@ def verify(shape, axis, begin, end): verify((3, 4), 1, -3, -1) verify((3, 4), -1, -3, -1) +def test_forward_slice_like(): + def verify(x_shape, y_shape, axes): + x_np = np.random.uniform(size=x_shape).astype("float32") + y_np = np.random.uniform(size=y_shape).astype("float32") + if axes is None: + ref_res = mx.nd.slice_like(mx.nd.array(x_np), mx.nd.array(y_np)) + mx_sym = mx.sym.slice_like(mx.sym.var("x"), mx.sym.var("y")) + else: + ref_res = mx.nd.slice_like(mx.nd.array(x_np), mx.nd.array(y_np), axes=axes) + mx_sym = mx.sym.slice_like(mx.sym.var("x"), mx.sym.var("y"), axes=axes) + new_sym, _ = relay.frontend.from_mxnet(mx_sym, {"x": x_shape, "y": y_shape}) + for target, ctx in ctx_list(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(new_sym)(x_np, y_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy()) + verify((3, 4), (2, 3), None) + verify((3, 4), (2, 3), (0, 1)) + verify((3, 4), (2, 3), (0)) + verify((3, 4), (2, 3), (-1)) + if __name__ == '__main__': test_forward_mlp() @@ -382,3 +402,4 @@ def verify(shape, axis, begin, end): test_forward_elemwise_ops() test_forward_scalar_ops() test_forward_slice_axis() + test_forward_slice_like() diff --git a/topi/tests/python/test_topi_conv2d_NCHWc.py b/topi/tests/python/test_topi_conv2d_NCHWc.py index a3af43c8d810..73c1fdae2d66 100644 --- a/topi/tests/python/test_topi_conv2d_NCHWc.py +++ b/topi/tests/python/test_topi_conv2d_NCHWc.py @@ -105,7 +105,7 @@ def check_device(device): name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding, dilation)) func(a, w, c) - tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + 
tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3) # test llvm only for now since conv2d_NCHWc implement is missing in other backend. for device in ["llvm"]: @@ -202,4 +202,4 @@ def test_conv2d_NCHWc(): verify_conv2d_NCHWc(1, 256, 3, 126, 3, 1, 1) if __name__ == "__main__": - test_conv2d_NCHWc() \ No newline at end of file + test_conv2d_NCHWc() From 9547cbb57760513bf084409e10c977bd8fd265df Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Fri, 8 Mar 2019 14:35:46 +0900 Subject: [PATCH 75/93] Improve NNVM to Relay conversion (#2734) * Improve NNVM to Relay conversion * fix pylint * support __lshift_scalar__, abs, ceil, floor, and trunc to pass CI --- nnvm/python/nnvm/testing/check_computation.py | 19 ++ nnvm/python/nnvm/to_relay.py | 258 ++++++++---------- python/tvm/relay/frontend/nnvm_common.py | 25 +- src/relay/pass/type_solver.cc | 4 +- 4 files changed, 145 insertions(+), 161 deletions(-) diff --git a/nnvm/python/nnvm/testing/check_computation.py b/nnvm/python/nnvm/testing/check_computation.py index 7ab4dc0d4c6c..68419b73523b 100644 --- a/nnvm/python/nnvm/testing/check_computation.py +++ b/nnvm/python/nnvm/testing/check_computation.py @@ -8,10 +8,12 @@ import tvm from tvm.contrib import graph_runtime from tvm.testing import check_numerical_grads +from tvm import relay import nnvm from nnvm.compiler import graph_util from nnvm.compiler.graph_attr import TCODE_TO_DTYPE, DTYPE_TO_TCODE +from nnvm.to_relay import to_relay from .config import ctx_list def infer_shapes_dtypes(graph, shape=None, dtype=None, fallback_dtype=None): @@ -441,6 +443,23 @@ def check_function(symbol, forward=None, backward=None, grad_input_vars=None, debug_stage = "running" nnvm_res = main_function(**np_inputs) + try: + logging.debug("checking to_relay conversion") + inputs = np_inputs_without_head_grads.copy() + func, inputs = to_relay(main_graph, shape, dtype, params=inputs) + with relay.build_config(opt_level=3): + graph, lib, params = relay.build(func, target=target) + m = 
graph_runtime.create(graph, lib, ctx) + m.set_input(**inputs) + m.set_input(**params) + m.run() + for i in range(out_len): + relay_out = m.get_output(i).asnumpy() + tvm.testing.assert_allclose(nnvm_res[i], relay_out, atol=atol, rtol=rtol) + except NotImplementedError as err: + # the NNVM operator is not supported yet + logging.warning(err) + if backward_graph is not None: grad_var_names = [x.attr('name') for x in grad_input_vars] nnvm_grads = {x: v for x, v in zip(grad_var_names, nnvm_res[out_len:])} diff --git a/nnvm/python/nnvm/to_relay.py b/nnvm/python/nnvm/to_relay.py index 264a18d90c77..7d792116b104 100644 --- a/nnvm/python/nnvm/to_relay.py +++ b/nnvm/python/nnvm/to_relay.py @@ -6,7 +6,8 @@ from tvm import relay, nd from tvm.relay import op, expr, var from tvm.relay.frontend.common import StrAttrsDict -from tvm.relay.frontend.nnvm_common import _rename +from tvm.relay.frontend.nnvm_common import _rename, _binop_scalar, _rbinop_scalar, \ + _elemwise_sum, _softmax_op, _compare, _reduce from .symbol import Symbol from .compiler import graph_attr from .graph import create as graph_create @@ -25,11 +26,6 @@ def _dense(children, attrs, odtype='float32'): else: return dense -def _nn_softmax(children, attrs, odtype='float32'): - assert len(children) == 1 - axis = attrs.get_int('axis', 1) - return op.nn.softmax(children[0], axis) - def _conv2d(children, attrs, odtype='float32'): use_bias = attrs.get_bool('use_bias', True) @@ -150,84 +146,6 @@ def _transpose(children, attrs, odtype='float32'): return op.transpose(children[0], axes=axes) -def _add(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype=odtype) - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.add(left, right) - - -def _subtract(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = 
relay.const(scalar, dtype=odtype) - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.subtract(left, right) - - -def _rsubtract(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype=odtype) - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.subtract(right, left) - - -def _multiply(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype=odtype) - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.multiply(left, right) - - -def _divide(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype=odtype) - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.divide(left, right) - - -def _rshift(children, attrs, odtype='float32'): - if len(children) == 1: - left = children[0] - scalar = attrs.get_float('scalar') - right = relay.const(scalar, dtype='int32') - else: - assert len(children) == 2 - left = children[0] - right = children[1] - - return op.right_shift(left, right) - - def _clip(children, attrs, odtype='float32'): a_min = attrs.get_float('a_min') a_max = attrs.get_float('a_max') @@ -255,9 +173,6 @@ def broadcast_to(children, attrs, odtype='float32'): rconst = relay.Constant(nd.array(array)) return op.broadcast_to_like(data, rconst) -def _copy(children, attrs, odtype='float32'): - return op.copy(children[0]) - def _global_avg_pool2d(children, attrs, odtype='float32'): data = children[0] @@ -309,42 +224,10 @@ def _full_like(children, attrs, odtype='float32'): return op.full_like(children[0], fill_value) -def _greater(children, attrs, odtype='float32'): - out_type = attrs.get_str('out_type') - if out_type: - return 
op.greater(children[0], children[1]).astype(out_type) - else: - return op.greater(children[0], children[1]) - - -def _greater_equal(children, attrs, odtype='float32'): - out_type = attrs.get_str('out_type', None) - if out_type: - return op.greater_equal(children[0], children[1]).astype(out_type) - else: - return op.greater_equal(children[0], children[1]) - - -def _less(children, attrs, odtype='float32'): - out_type = attrs.get_str('out_type', None) - if out_type: - return op.less(children[0], children[1]).astype(out_type) - else: - return op.less(children[0], children[1]) - - -def _less_equal(children, attrs, odtype='float32'): - out_type = attrs.get_str('out_type', None) - if out_type: - return op.less_equal(children[0], children[1]).astype(out_type) - else: - return op.less_equal(children[0], children[1]) - - def _strided_slice(children, attrs, odtype='float32'): begin = attrs.get_int_list('begin') end = attrs.get_int_list('end') - strides = attrs.get_int_list('strides', None) + strides = attrs.get_int_list('stride', None) return op.strided_slice(children[0], begin, end, strides=strides) @@ -358,14 +241,11 @@ def _split(children, attrs, odtype='float32'): axis = attrs.get_int('axis', 0) - return op.split(children[0], indices_or_sections, axis) + return op.split(children[0], indices_or_sections, axis).astuple() def _squeeze(children, attrs, odtype='float32'): - axis = None - try: - axis = [attrs.get_int('axis', None)] - except ValueError: - axis = axis or attrs.get_int_tuple('axis', None) + axis = attrs.get_int_tuple('axis', None) + axis = [axis] if isinstance(axis, int) else axis return op.squeeze(children[0], axis) @@ -378,20 +258,60 @@ def _dropout(children, attrs, odtype='float32'): return op.nn.dropout(children[0], rate) def _mean(children, attrs, odtype='float32'): - axis = None - try: - axis = [attrs.get_int('axis', None)] - except ValueError: - axis = axis or attrs.get_int_tuple('axis', None) + axis = attrs.get_int_tuple('axis', None) keepdims = 
attrs.get_bool('keepdims') return op.mean(children[0], axis, keepdims) +def _prelu(children, attrs, odtype='float32'): + axis = attrs.get_int('axis', 1) + return op.nn.prelu(children[0], children[1], axis) + + +def _lrn(children, attrs, odtype='float32'): + size = attrs.get_int("size", 5) + axis = attrs.get_int("axis", 1) + bias = attrs.get_float("bias", 2) + alpha = attrs.get_float("alpha", 1e-05) + beta = attrs.get_float("beta", 0.75) + return op.nn.lrn(children[0], size, axis, bias, alpha, beta) + + +def _l2_nomalize(children, attrs, odtype='float32'): + eps = attrs.get_float('eps') + axis = attrs.get_int_tuple('axis', None) + return op.nn.l2_normalize(children[0], eps, axis) + + +def _take(children, attrs, odtype='float32'): + axis = attrs.get_int('axis', None) + return op.take(children[0], children[1], axis) + + +def _matmul(children, attrs, odtype='float32'): + input_1_t = op.transpose(children[1], axes=(1, 0)) + return op.nn.dense(children[0], input_1_t) + + +def _collapse_sum(children, attrs, odtype='float32'): + for key in ["axis", "keepdims", "exclude"]: + if key in attrs.attrs: + raise NotImplementedError("Parameter '" + key + "' is not supported.") + return op.collapse_sum_like(children[0], children[1]) + + +def _not_implemented(new_op): + def _impl(children, attrs, odtype='float32'): + raise NotImplementedError(str(new_op) + " is not implemented.") + return _impl + + NNVM_OP_2_RELAY_OP = { 'flatten': _nn_batch_flatten, 'dense': _dense, - 'softmax': _nn_softmax, + 'softmax': _softmax_op(op.nn.softmax), + 'log_softmax': _softmax_op(op.nn.log_softmax), 'conv2d': _conv2d, 'batch_norm': _batch_norm, 'max_pool2d': _max_pool2d, @@ -400,30 +320,47 @@ def _mean(children, attrs, odtype='float32'): 'dropout': _dropout, 'mean': _mean, # Addition - '__add_scalar__': _add, - 'broadcast_add': _add, - 'elemwise_add': _add, + '__add_scalar__': _binop_scalar(op.add), + 'broadcast_add' : _rename(op.add), + 'elemwise_add' : _rename(op.add), # Subtraction - 
'__sub_scalar__': _subtract, - '__rsub_scalar__': _rsubtract, - 'broadcast_sub': _subtract, - 'elemwise_sub': _subtract, + '__sub_scalar__' : _binop_scalar(op.subtract), + '__rsub_scalar__': _rbinop_scalar(op.subtract), + 'broadcast_sub' : _rename(op.subtract), + 'elemwise_sub' : _rename(op.subtract), # Multiply - '__mul_scalar__': _multiply, - 'broadcast_mul': _multiply, - 'elemwise_mul': _multiply, + '__mul_scalar__': _binop_scalar(op.multiply), + 'broadcast_mul' : _rename(op.multiply), + 'elemwise_mul' : _rename(op.multiply), # Division - '__div_scalar__': _divide, - 'broadcast_div': _divide, - 'elemwise_div': _divide, + '__div_scalar__': _binop_scalar(op.divide), + 'broadcast_div' : _rename(op.divide), + 'elemwise_div' : _rename(op.divide), + 'broadcast_mod' : _rename(op.mod), # Negative 'negative': _rename("negative"), + # Power + '__pow_scalar__': _binop_scalar(op.power), + '__rpow_scalar__': _rbinop_scalar(op.power), + 'broadcast_pow': _rename(op.power), + # Sum + 'sum': _reduce(op.sum), + 'elemwise_sum': _elemwise_sum, + 'collapse_sum': _collapse_sum, + 'broadcast_max': _rename(op.maximum), + 'broadcast_min': _rename(op.minimum), # Comparsion - 'greater': _greater, - 'greater_equal': _greater_equal, - 'less': _less, - 'less_equal': _less_equal, + 'greater': _compare(op.greater), + 'broadcast_greater': _compare(op.greater), + 'greater_equal': _compare(op.greater_equal), + 'broadcast_greater_equal': _compare(op.greater_equal), + 'less': _compare(op.less), + 'broadcast_less': _compare(op.less), + 'less_equal': _compare(op.less_equal), + 'broadcast_less_equal': _compare(op.less_equal), + 'broadcast_equal': _compare(op.equal), + 'broadcast_not_equal': _compare(op.not_equal), # Activations 'sigmoid': _rename('sigmoid'), @@ -432,13 +369,17 @@ def _mean(children, attrs, odtype='float32'): 'log': _rename('log'), 'tanh': _rename('tanh'), 'leaky_relu': _leaky_relu, + 'prelu': _prelu, 'clip': _clip, 'round': _rename('round'), 'cast': _cast, 'expand_dims': _expand_dims, 
'broadcast_to': broadcast_to, - '__rshift_scalar__': _rshift, - 'copy': _copy, + '__lshift_scalar__': _binop_scalar(op.left_shift), + '__rshift_scalar__': _binop_scalar(op.right_shift), + 'broadcast_left_shift': _rename(op.left_shift), + 'broadcast_right_shift': _rename(op.right_shift), + 'copy': _rename(op.copy), 'global_avg_pool2d': _global_avg_pool2d, 'avg_pool2d': _avg_pool2d, 'conv2d_transpose': _conv2d_transpose, @@ -449,6 +390,21 @@ def _mean(children, attrs, odtype='float32'): 'split': _split, 'squeeze': _squeeze, 'concatenate': _concatenate, + 'abs': _rename(op.abs), + 'ceil': _rename(op.ceil), + 'floor': _rename(op.floor), + 'trunc': _rename(op.trunc), + 'take': _take, + 'lrn': _lrn, + 'l2_normalize': _l2_nomalize, + 'matmul': _matmul, + 'zeros_like': _rename(op.zeros_like), + 'reshape_like': _rename(op.reshape_like), + 'ones_like': _rename(op.ones_like), + + 'expand_like': _not_implemented("expand_like"), + 'gather_nd': _not_implemented("gather_nd"), + 'block_grad': _not_implemented("block_grad"), } diff --git a/python/tvm/relay/frontend/nnvm_common.py b/python/tvm/relay/frontend/nnvm_common.py index 3838c3d4aa3b..7fd6f409cfd3 100644 --- a/python/tvm/relay/frontend/nnvm_common.py +++ b/python/tvm/relay/frontend/nnvm_common.py @@ -41,7 +41,7 @@ def _impl(inputs, attrs): def _softmax_op(new_op): """softmax/log_softmax""" - def _impl(inputs, attrs): + def _impl(inputs, attrs, _dtype='float32'): assert len(inputs) == 1 axis = attrs.get_int("axis", -1) return new_op(inputs[0], axis=axis) @@ -50,13 +50,14 @@ def _impl(inputs, attrs): def _reduce(new_op): """Reduction ops like sum/min/max""" - def _impl(inputs, attrs): + def _impl(inputs, attrs, _dtype='float32'): assert len(inputs) == 1 axis = attrs.get_int_tuple("axis", []) keepdims = attrs.get_bool("keepdims", False) + exclude = attrs.get_bool("exclude", False) # use None for reduce over all axis. 
axis = None if len(axis) == 0 else axis - return new_op(inputs[0], axis=axis, keepdims=keepdims) + return new_op(inputs[0], axis=axis, keepdims=keepdims, exclude=exclude) return _impl @@ -97,7 +98,7 @@ def _upsampling(inputs, attrs): return _op.nn.upsampling(inputs[0], scale=scale) -def _elemwise_sum(inputs, _): +def _elemwise_sum(inputs, _, _dtype='float32'): assert len(inputs) > 0 res = inputs[0] for x in inputs[1:]: @@ -106,20 +107,28 @@ def _elemwise_sum(inputs, _): def _binop_scalar(new_op): - def _impl(inputs, attrs): + def _impl(inputs, attrs, odtype='float32'): assert len(inputs) == 1 scalar = attrs.get_float("scalar") # Note: binary scalar only works for float op for now - scalar = _expr.const(scalar, dtype="float32") + scalar = _expr.const(scalar, dtype=odtype) return new_op(inputs[0], scalar) return _impl def _rbinop_scalar(new_op): - def _impl(inputs, attrs): + def _impl(inputs, attrs, odtype='float32'): assert len(inputs) == 1 scalar = attrs.get_float("scalar") # Note: binary scalar only works for float op for now - scalar = _expr.const(scalar, dtype="float32") + scalar = _expr.const(scalar, dtype=odtype) return new_op(scalar, inputs[0]) return _impl + + +def _compare(new_op): + """Compare ops like greater/less""" + def _impl(inputs, _, odtype='float32'): + assert len(inputs) == 2 + return new_op(inputs[0], inputs[1]).astype(odtype) + return _impl diff --git a/src/relay/pass/type_solver.cc b/src/relay/pass/type_solver.cc index 179f90a2fe15..abbd82977499 100644 --- a/src/relay/pass/type_solver.cc +++ b/src/relay/pass/type_solver.cc @@ -476,8 +476,8 @@ bool TypeSolver::Solve() { rnode->resolved = false; this->ReportError( RELAY_ERROR( - "an internal invariant was violdated while" \ - "typechecking your program" << + "an internal invariant was violdated while " \ + "typechecking your program " << err.what()), rnode->location); } From 6dbd2d7a5a1a82cf21829b0952fa93b348a9d58b Mon Sep 17 00:00:00 2001 From: abergeron Date: Fri, 8 Mar 2019 20:46:01 -0500 
Subject: [PATCH 76/93] [Relay] Add logical operators (#2743) --- nnvm/src/top/tensor/elemwise.cc | 6 +-- python/tvm/relay/frontend/tensorflow.py | 8 +++ python/tvm/relay/op/_tensor.py | 3 ++ python/tvm/relay/op/tensor.py | 52 +++++++++++++++++++ src/relay/op/tensor/binary.cc | 12 +++++ src/relay/op/tensor/unary.cc | 11 ++++ .../frontend/tensorflow/test_forward.py | 46 +++++++++++++++- 7 files changed, 133 insertions(+), 5 deletions(-) diff --git a/nnvm/src/top/tensor/elemwise.cc b/nnvm/src/top/tensor/elemwise.cc index 52d9aa4456ed..2d9813e22131 100644 --- a/nnvm/src/top/tensor/elemwise.cc +++ b/nnvm/src/top/tensor/elemwise.cc @@ -366,7 +366,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_OP(logical_and) .describe(R"code(Elementwise compute the logical AND )code") -.set_support_level(1) +.set_support_level(4) .set_attr( "FTVMCompute", [](const NodeAttrs& attrs, const Array& inputs, @@ -378,7 +378,7 @@ NNVM_REGISTER_ELEMWISE_BINARY_OP(logical_or) .describe(R"code(Elementwise compute the logical OR )code") -.set_support_level(1) +.set_support_level(4) .set_attr( "FTVMCompute", [](const NodeAttrs& attrs, const Array& inputs, @@ -413,7 +413,7 @@ NNVM_REGISTER_ELEMWISE_UNARY_OP(logical_not) .describe(R"code(Elementwise compute the logical NOT )code" NNVM_ADD_FILELINE) -.set_support_level(3) +.set_support_level(4) .set_attr( "FTVMCompute", [](const NodeAttrs& attrs, const Array& inputs, diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index d583053dc5a6..1f2ba4eb435f 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -849,6 +849,11 @@ def _impl(inputs, attr, params): transforms={'axis': ('axis', 1)})([inputs[0]], attr) return _impl +def _logical(name): + def _impl(inputs, attr, params): + return AttrCvt(op_name=name)(inputs, attr) + return _impl + # compatible operators that do NOT require any conversion. 
_identity_list = [] @@ -909,6 +914,9 @@ def _impl(inputs, attr, params): 'Transpose' : _transpose(), 'Tanh' : AttrCvt('tanh'), 'Mean' : _mean(), + 'LogicalAnd' : _logical('logical_and'), + 'LogicalOr' : _logical('logical_or'), + 'LogicalNot' : _logical('logical_not'), 'Less' : _broadcast('less'), 'Greater' : _broadcast('greater'), 'LessEqual' : _broadcast('less_equal'), diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 39e1f7afbfa2..7f8da03008d2 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -18,6 +18,7 @@ register_schedule("round", schedule_broadcast) register_schedule("abs", schedule_broadcast) register_schedule("tanh", schedule_broadcast) +register_schedule("logical_not", schedule_broadcast) register_schedule("negative", schedule_broadcast) register_schedule("copy", schedule_broadcast) @@ -27,6 +28,8 @@ register_schedule("divide", schedule_broadcast) register_schedule("power", schedule_injective) register_schedule("mod", schedule_broadcast) +register_schedule("logical_and", schedule_broadcast) +register_schedule("logical_or", schedule_broadcast) register_schedule("equal", schedule_broadcast) register_schedule("not_equal", schedule_broadcast) register_schedule("less", schedule_broadcast) diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index a6247dd971a8..e315f27dc593 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -191,6 +191,22 @@ def negative(data): return _make.negative(data) +def logical_not(data): + """Compute element-wise logical not of data. + + Parameters + ---------- + data : relay.Expr + The input data + + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.logical_not(data) + + def add(lhs, rhs): """Addition with numpy-style broadcasting. @@ -307,6 +323,42 @@ def mod(lhs, rhs): return _make.mod(lhs, rhs) +def logical_and(lhs, rhs): + """logical AND with numpy-style broadcasting. 
+ + Parameters + ---------- + lhs : relay.Expr + The left hand side input data + rhs : relay.Expr + The right hand side input data + + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.logical_and(lhs, rhs) + + +def logical_or(lhs, rhs): + """logical OR with numpy-style broadcasting. + + Parameters + ---------- + lhs : relay.Expr + The left hand side input data + rhs : relay.Expr + The right hand side input data + + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.logical_or(lhs, rhs) + + def equal(lhs, rhs): """Broadcasted elementwise test for (lhs == rhs). diff --git a/src/relay/op/tensor/binary.cc b/src/relay/op/tensor/binary.cc index 97adbc7f4ca8..b8305c44c037 100644 --- a/src/relay/op/tensor/binary.cc +++ b/src/relay/op/tensor/binary.cc @@ -82,6 +82,18 @@ RELAY_REGISTER_BINARY_OP("mod") .set_attr("FTVMCompute", RELAY_BINARY_COMPUTE(topi::mod)); +RELAY_REGISTER_BINARY_OP("logical_and") +.describe("Elementwise logical AND with broadcasting") +.set_support_level(4) +.set_attr("FTVMCompute", RELAY_BINARY_COMPUTE(topi::logical_and)); + + +RELAY_REGISTER_BINARY_OP("logical_or") +.describe("Elementwise logical OR with broadcasting") +.set_support_level(4) +.set_attr("FTVMCompute", RELAY_BINARY_COMPUTE(topi::logical_or)); + + RELAY_REGISTER_CMP_OP("equal") .describe("Elementwise equal compare with broadcasting") .set_support_level(4) diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc index 06720d67713c..cfcc130564c0 100644 --- a/src/relay/op/tensor/unary.cc +++ b/src/relay/op/tensor/unary.cc @@ -178,5 +178,16 @@ RELAY_REGISTER_UNARY_OP("negative") .set_support_level(3) .set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::negative)); + +RELAY_REGISTER_UNARY_OP("logical_not") +.describe(R"code(Returns the logical inverse of input array, computed element-wise. + +.. 
math:: + ~(x) + +)code" TVM_ADD_FILELINE) +.set_support_level(4) +.set_attr("FTVMCompute", RELAY_UNARY_COMPUTE(topi::logical_not)); + } // namespace relay } // namespace tvm diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 0db6952d837d..84c431aaf342 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -682,6 +682,49 @@ def test_forward_pad(): _test_pad((2, 3), [[1,1], [2,2]], mode="CONSTANT") _test_pad((2, 3), [[1,1], [2,2]], mode="CONSTANT", constant_values=1.0) +####################################################################### +# Logical operators +# -------------------- +def test_logical_and(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_and(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_or(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_or(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_xor(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + in2 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in2') + out = tf.logical_xor(in1, in2, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + in_data2 = 
np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm([in_data1, in_data2], ['in1:0', 'in2:0'], 'out:0') + +def test_logical_not(): + with tf.Graph().as_default(): + in1 = tf.placeholder(tf.bool, shape=[1, 4, 4, 3], name='in1') + out = tf.logical_not(in1, name='out') + in_data1 = np.random.choice(a=[False, True],size=(1, 4, 4, 3)).astype('bool') + compare_tf_with_tvm(in_data1, 'in1:0', 'out:0') + +def test_forward_logical(): + test_logical_and() + test_logical_or() + test_logical_xor() + test_logical_not() + ####################################################################### # Inception V3 @@ -1109,5 +1152,4 @@ def test_forward_rel_ops(): # Relational ops test_forward_rel_ops() - - + test_forward_logical() From be89cc17c31b6a1bd2a8893c60eb65d14653bc1f Mon Sep 17 00:00:00 2001 From: Andrew Tulloch Date: Fri, 8 Mar 2019 19:44:07 -0800 Subject: [PATCH 77/93] Fix vmlal.s16 code generation for int8 x int8 -> int32 (#2748) --- src/pass/lower_intrin.cc | 18 +++++++++- tests/python/unittest/test_codegen_arm.py | 44 +++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/src/pass/lower_intrin.cc b/src/pass/lower_intrin.cc index 1a9caf4b591e..82eabf09b9e3 100644 --- a/src/pass/lower_intrin.cc +++ b/src/pass/lower_intrin.cc @@ -50,7 +50,23 @@ class IntrinInjecter : public IRMutator { // on ARM. if (const Broadcast* bcast = e.as()) { if (const Cast* cast = bcast->value.as()) { - if (cast->type.bits() == cast->value.type().bits() * 2) { + auto should_swap = [&]() { + // Maintain behaviour (int8 -> int16, fp16 -> fp32). + if (cast->type.bits() == cast->value.type().bits() * 2) { + return true; + } + // Check both operands are integer-like. + if (!cast->type.is_uint() && !cast->type.is_int()) { + return false; + } + if (!cast->value.type().is_uint() && !cast->value.type().is_int()) { + return false; + } + // If both are integer-like, swap if we have a widening cast. 
+ return cast->type.bits() > cast->value.type().bits(); + }; + + if (should_swap()) { Expr new_bcast = Broadcast::make(cast->value, bcast->lanes); return Cast::make(bcast->type, new_bcast); } diff --git a/tests/python/unittest/test_codegen_arm.py b/tests/python/unittest/test_codegen_arm.py index 24240db72b26..049696f95135 100644 --- a/tests/python/unittest/test_codegen_arm.py +++ b/tests/python/unittest/test_codegen_arm.py @@ -26,5 +26,49 @@ def check_correct_assembly(type, elements, counts): check_correct_assembly('uint32', 2, 2) check_correct_assembly('uint64', 2, 3) +def test_vmlal_s16(): + target = 'llvm -target=armv7l-none-linux-gnueabihf -mcpu=cortex-a53 -mattr=+neon' + + def check_correct_assembly(N): + K = tvm.var("K") + A = tvm.placeholder((K, N), dtype="int8", name='A') + B = tvm.placeholder((K, N), dtype="int8", name='A') + k = tvm.reduce_axis((0, K)) + C = tvm.compute((N, ), lambda n: tvm.sum( + A[k, n].astype("int32") * B[k, n].astype("int32"), axis=[k]), name='C') + s = tvm.create_schedule(C.op) + s[C].vectorize(s[C].op.axis[0]) + f = tvm.build(s, [A, B, C], target) + + # Verify we see the correct number of vmlal.s16 instructions + assembly = f.get_source('asm') + matches = re.findall("vmlal.s16", assembly) + assert (len(matches) == N // 4) + check_correct_assembly(4) + check_correct_assembly(8) + check_correct_assembly(16) + + def check_broadcast_correct_assembly(N): + K = tvm.var("K") + A = tvm.placeholder((K, N), dtype="int8", name='A') + B = tvm.placeholder((K,), dtype="int8", name='A') + k = tvm.reduce_axis((0, K)) + C = tvm.compute((N, ), lambda n: tvm.sum( + A[k, n].astype("int32") * B[k].astype("int32"), + axis=[k]), name='C') + s = tvm.create_schedule(C.op) + s[C].vectorize(s[C].op.axis[0]) + f = tvm.build(s, [A, B, C], target) + + # Verify we see the correct number of vmlal.s16 instructions + assembly = f.get_source('asm') + matches = re.findall("vmlal.s16", assembly) + assert len(matches) == N // 4 + check_broadcast_correct_assembly(8) + 
check_broadcast_correct_assembly(16) + check_broadcast_correct_assembly(32) + check_broadcast_correct_assembly(64) + if __name__ == "__main__": test_popcount() + test_vmlal_s16() From 90197ba30f77fb8210d53c56a43a463d95b52dbc Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Sat, 9 Mar 2019 02:03:40 -0800 Subject: [PATCH 78/93] revert PR#2420 nms changes (#2747) --- topi/python/topi/cuda/nms.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index 3cdc02e58aec..e0d71559f1a0 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -35,7 +35,7 @@ def sort_ir(data, index, output): p_index = ib.buffer_ptr(index) p_out = ib.buffer_ptr(output) nthread_tx = max_threads - nthread_bx = (num_anchors + 1) // 2 // max_threads + 1 + nthread_bx = num_anchors // max_threads + 1 tx = tvm.thread_axis("threadIdx.x") bx = tvm.thread_axis("vthread") ib.scope_attr(tx, "thread_extent", nthread_tx) @@ -46,10 +46,8 @@ def sort_ir(data, index, output): with ib.for_range(0, batch, for_type="unroll") as b: start = b * num_anchors - for i in range(2): - bbox_id = tid * 2 + i - with ib.if_scope(bbox_id < num_anchors): - p_out[start + bbox_id] = bbox_id + with ib.if_scope(tid < num_anchors): + p_out[start + tid] = tid # OddEvenTransposeSort with ib.for_range(0, p_index[b]) as k: with ib.if_scope(tid < (p_index[b] + 1) // 2): From 534818c3a34eb0211671dc34c63cee1fff083b1c Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Sat, 9 Mar 2019 23:57:49 +0800 Subject: [PATCH 79/93] [Relay][Quantization] Speed-aware quantization scheme improvement (#2723) * [Relay][Quantization] Speed-aware quantization scheme improvement * Add comment * Add use_stop_fusion to qconfig * Update comment --- python/tvm/relay/build_module.py | 17 ++++++++++++++--- python/tvm/relay/quantize/_annotate.py | 3 +++ python/tvm/relay/quantize/quantize.py | 5 +++++ src/relay/pass/quantize.cc | 26 +++++++++++++++++++++----- 
src/relay/pass/quantize.h | 2 ++ 5 files changed, 45 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 7d63513d7dc0..e0784d53ee47 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -9,7 +9,7 @@ from .. import nd as _nd, target as _target, autotvm from ..contrib import graph_runtime as _graph_rt from . import ir_pass -from . import expr +from . import expr as _expr from .backend import interpreter as _interpreter from .backend import graph_runtime_codegen as _graph_gen @@ -22,6 +22,7 @@ "FoldScaleAxis": 3, "AlterOpLayout": 3, "CanonicalizeOps": 3, + "EliminateCommonSubexpr": 3, } @@ -126,8 +127,8 @@ def _bind_params_by_name(func, params): arg = name_dict[k] if arg is None: raise ValueError("Multiple args in the function have name %s" % k) - bind_dict[arg] = expr.const(v) - return expr.bind(func, bind_dict) + bind_dict[arg] = _expr.const(v) + return _expr.bind(func, bind_dict) def optimize(func, target=None, params=None): @@ -162,6 +163,16 @@ def optimize(func, target=None, params=None): func = ir_pass.infer_type(func) func = ir_pass.simplify_inference(func) + if cfg.pass_enabled("EliminateCommonSubexpr"): + def fskip(expr): + if isinstance(expr, _expr.Call) and expr.op.name == 'cast' and \ + expr.attrs.dtype == 'int32': + return True + return False + + func = ir_pass.infer_type(func) + func = ir_pass.eliminate_common_subexpr(func, fskip) + if cfg.pass_enabled("CombineParallelConv2D"): func = ir_pass.infer_type(func) func = ir_pass.combine_parallel_conv2d(func) diff --git a/python/tvm/relay/quantize/_annotate.py b/python/tvm/relay/quantize/_annotate.py index d56f21b2e2bb..5daf10284a9d 100644 --- a/python/tvm/relay/quantize/_annotate.py +++ b/python/tvm/relay/quantize/_annotate.py @@ -192,6 +192,9 @@ def add_rewrite(ref_call, new_args, ctx): else: # quantize rhs to INPUT field if it is not Constant rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) 
+ if lhs_kind == QAnnotateKind.ACTIVATION and rhs_kind == QAnnotateKind.ACTIVATION: + # quantize rhs to INPUT field if both lhs and rhs are ACTIVATION + rhs_expr = attach_simulated_quantize(rhs_expr, QAnnotateKind.INPUT) expr = _forward_op(ref_call, [lhs_expr, rhs_expr]) return QAnnotateExpr(expr, QAnnotateKind.ACTIVATION) diff --git a/python/tvm/relay/quantize/quantize.py b/python/tvm/relay/quantize/quantize.py index 6756090f14a7..56e0f586fc1f 100644 --- a/python/tvm/relay/quantize/quantize.py +++ b/python/tvm/relay/quantize/quantize.py @@ -58,6 +58,7 @@ class QConfig(NodeBase): "round_for_shift": True, "store_lowbit_output": True, "debug_enabled_ops": None, + "use_stop_fusion": True } # pylint: disable=no-member @@ -129,6 +130,10 @@ def qconfig(**kwargs): Whether to store low-bit integer back as output before dequantizing. Some accelerators need this, e.g. VTA. + use_stop_fusion: boolean + Whether add stop_fusion when casting to dtype_activation. stop_fusion forces lowbit + results to be stored in memory. 
+ Returns ------- config: QConfig diff --git a/src/relay/pass/quantize.cc b/src/relay/pass/quantize.cc index ff6c8ea5c187..a1b93546b84f 100644 --- a/src/relay/pass/quantize.cc +++ b/src/relay/pass/quantize.cc @@ -124,7 +124,7 @@ TVM_REGISTER_API("relay._quantize.annotate") } return e; }; - return ForwardRewrite(expr, "FQAnnotateRewrite", nullptr, nullptr); + return ForwardRewrite(expr, "FQAnnotateRewrite", nullptr, fmulti_ref); }); @@ -329,9 +329,11 @@ float ChooseDomScale(const std::vector& nptrs) { /* \brief Unify the dom scale of arguments */ -Array UnifyDTypeScale(const Array& args, +Array UnifyDTypeScale(const Array& ref_args, + const Array& args, DataType* dtype_ptr, Expr* scale_ptr) { + static const Op& simulated_quantize = Op::Get("relay.op.annotation.simulated_quantize"); const QConfig& cfg = QConfig::Current(); std::vector nptrs; @@ -344,10 +346,19 @@ Array UnifyDTypeScale(const Array& args, } // unify the data type + CHECK_EQ(ref_args.size(), args.size()); DataType dtype = cfg->dtype_activation; for (size_t i = 0; i < ret.size(); ++i) { + auto ref_arg = ref_args[i].as(); if (nptrs[i]->dtype != dtype) { ret.Set(i, Cast(ret[i], dtype)); + } else if (ref_arg && ref_arg->op.same_as(simulated_quantize) && + ref_arg->attrs.as()->kind == kQInput) { + auto new_arg = Cast(ret[i], cfg->dtype_input); + if (cfg->use_stop_fusion) { + new_arg = StopFusion(new_arg); + } + ret.Set(i, Cast(new_arg, dtype)); } } @@ -371,7 +382,7 @@ Expr AddRealize(const Call& ref_call, if (new_args[0].as() && new_args[1].as()) { DataType dtype; Expr dom_scale; - Array ret_args = UnifyDTypeScale(new_args, &dtype, &dom_scale); + Array ret_args = UnifyDTypeScale(ref_call->args, new_args, &dtype, &dom_scale); Expr ret = ForwardOp(ref_call, ret_args); return QRealizeIntExprNode::make(ret, dom_scale, dtype); } @@ -387,15 +398,19 @@ Expr ConcatenateRealize(const Call& ref_call, const Array& new_args, const NodeRef& ctx) { CHECK_EQ(new_args.size(), 1); + CHECK_EQ(ref_call->args.size(), 1); const 
auto* tuple = new_args[0].as(); + const auto* ref_tuple = ref_call->args[0].as(); CHECK(tuple); + CHECK(ref_tuple); const Array& arr = tuple->fields; + const Array& ref_arr = ref_tuple->fields; if (arr[0].as()) { DataType dtype; Expr dom_scale; - Array ret_args = UnifyDTypeScale(arr, &dtype, &dom_scale); + Array ret_args = UnifyDTypeScale(ref_arr, arr, &dtype, &dom_scale); Expr ret = ForwardOp(ref_call, {TupleNode::make(ret_args)}); return QRealizeIntExprNode::make(ret, dom_scale, dtype); } else { @@ -530,7 +545,8 @@ TVM_STATIC_IR_FUNCTOR(IRPrinter, vtable) p->stream << "skip_k_conv==" << op->skip_k_conv << ", "; p->stream << "round_for_shift==" << op->round_for_shift << ", "; p->stream << "store_lowbit_output==" << op->store_lowbit_output << ", "; - p->stream << "debug_enabled_ops==" << op->debug_enabled_ops; + p->stream << "debug_enabled_ops==" << op->debug_enabled_ops << ", "; + p->stream << "use_stop_fusion==" << op->use_stop_fusion; p->stream << ")"; }); diff --git a/src/relay/pass/quantize.h b/src/relay/pass/quantize.h index b1a15308d914..ed0a8b10a574 100644 --- a/src/relay/pass/quantize.h +++ b/src/relay/pass/quantize.h @@ -110,6 +110,7 @@ class QConfigNode : public Node { bool round_for_shift = true; bool store_lowbit_output = true; Array debug_enabled_ops = Array(NodePtr(nullptr)); + bool use_stop_fusion = true; void VisitAttrs(AttrVisitor* v) final { v->Visit("nbit_input", &nbit_input); @@ -123,6 +124,7 @@ class QConfigNode : public Node { v->Visit("round_for_shift", &round_for_shift); v->Visit("store_lowbit_output", &store_lowbit_output); v->Visit("debug_enabled_ops", &debug_enabled_ops); + v->Visit("use_stop_fusion", &use_stop_fusion); } static constexpr const char* _type_key = "relay.quantize.QConfig"; From 829c17903ba020ac9f6f5b41bcd731247fd7e866 Mon Sep 17 00:00:00 2001 From: MORITA Kazutaka Date: Sun, 10 Mar 2019 01:58:44 +0900 Subject: [PATCH 80/93] [RUNTIME][OPENCL] set type_key even when platform is not available (#2741) --- 
src/runtime/opencl/opencl_device_api.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index 38edcd1967cc..a5ad66b2def4 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -237,6 +237,7 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic std::lock_guard lock(this->mu); if (initialized_) return; if (context != nullptr) return; + this->type_key = type_key; // matched platforms std::vector platform_ids = cl::GetPlatformIDs(); if (platform_ids.size() == 0) { @@ -254,7 +255,6 @@ void OpenCLWorkspace::Init(const std::string& type_key, const std::string& devic devices_matched = cl::GetDeviceIDs(platform_id, "cpu"); } if (devices_matched.size() > 0) { - this->type_key = type_key; this->platform_id = platform_id; this->platform_name = cl::GetPlatformInfo(platform_id, CL_PLATFORM_NAME); this->device_type = device_type; From 274c4014a561ef2e6707a012c4b386919b877a16 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 9 Mar 2019 15:58:14 -0500 Subject: [PATCH 81/93] [DLPACK] fix flaky ctypes support (#2759) --- python/tvm/_ffi/_ctypes/ndarray.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/tvm/_ffi/_ctypes/ndarray.py b/python/tvm/_ffi/_ctypes/ndarray.py index 37a18cbe4051..da24b9cd41eb 100644 --- a/python/tvm/_ffi/_ctypes/ndarray.py +++ b/python/tvm/_ffi/_ctypes/ndarray.py @@ -24,6 +24,8 @@ def _from_dlpack(dltensor): dltensor = ctypes.py_object(dltensor) if ctypes.pythonapi.PyCapsule_IsValid(dltensor, _c_str_dltensor): ptr = ctypes.pythonapi.PyCapsule_GetPointer(dltensor, _c_str_dltensor) + # enforce type to make sure it works for all ctypes + ptr = ctypes.cast(ptr, ctypes.c_void_p) handle = TVMArrayHandle() check_call(_LIB.TVMArrayFromDLPack(ptr, ctypes.byref(handle))) ctypes.pythonapi.PyCapsule_SetName(dltensor, _c_str_used_dltensor) @@ -36,6 +38,8 @@ def 
_dlpack_deleter(pycapsule): pycapsule = ctypes.cast(pycapsule, ctypes.py_object) if ctypes.pythonapi.PyCapsule_IsValid(pycapsule, _c_str_dltensor): ptr = ctypes.pythonapi.PyCapsule_GetPointer(pycapsule, _c_str_dltensor) + # enforce type to make sure it works for all ctypes + ptr = ctypes.cast(ctypes.c_void_p, ptr) _LIB.TVMDLManagedTensorCallDeleter(ptr) ctypes.pythonapi.PyCapsule_SetDestructor(dltensor, TVMPyCapsuleDestructor(0)) From daf9e80acdf222d8d55cef88289fe582c8c43a69 Mon Sep 17 00:00:00 2001 From: abergeron Date: Sat, 9 Mar 2019 16:38:40 -0500 Subject: [PATCH 82/93] Improvements to the conda build (#2742) --- conda/cross-linux.cmake | 20 ++++++++++++++++++++ conda/nnvm/meta.yaml | 2 +- conda/topi/meta.yaml | 2 +- conda/tvm-libs/build.sh | 26 +++++++++++++++++++++++--- conda/tvm-libs/meta.yaml | 14 +++++--------- conda/tvm/meta.yaml | 2 +- 6 files changed, 51 insertions(+), 15 deletions(-) create mode 100644 conda/cross-linux.cmake diff --git a/conda/cross-linux.cmake b/conda/cross-linux.cmake new file mode 100644 index 000000000000..bb837eea5ba7 --- /dev/null +++ b/conda/cross-linux.cmake @@ -0,0 +1,20 @@ +# this one is important +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_PLATFORM Linux) +#this one not so much +set(CMAKE_SYSTEM_VERSION 1) + +# specify the cross compiler +set(CMAKE_C_COMPILER $ENV{CC}) + +# where is the target environment +set(CMAKE_FIND_ROOT_PATH $ENV{PREFIX} $ENV{BUILD_PREFIX}/$ENV{HOST}/sysroot) + +# search for programs in the build host directories +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +# for libraries and headers in the target directories +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# god-awful hack because it seems to not run correct tests to determine this: +set(__CHAR_UNSIGNED___EXITCODE 1) diff --git a/conda/nnvm/meta.yaml b/conda/nnvm/meta.yaml index 7a8b4aab9f2b..bae06740fd0b 100644 --- a/conda/nnvm/meta.yaml +++ b/conda/nnvm/meta.yaml @@ -8,7 +8,7 @@ source: path: ../.. 
build: - number: 1 + number: 0 skip: True # [win] requirements: diff --git a/conda/topi/meta.yaml b/conda/topi/meta.yaml index a3133b6c39cc..f13c95ac4032 100644 --- a/conda/topi/meta.yaml +++ b/conda/topi/meta.yaml @@ -8,7 +8,7 @@ source: path: ../.. build: - number: 1 + number: 0 requirements: host: diff --git a/conda/tvm-libs/build.sh b/conda/tvm-libs/build.sh index 1ea99fb3dbc6..772838e63ac6 100644 --- a/conda/tvm-libs/build.sh +++ b/conda/tvm-libs/build.sh @@ -1,5 +1,9 @@ #!/bin/bash +# Fix for OSX build to hide the clang LLVM +rm -f ${BUILD_PREFIX}/bin/llvm-config +rm -rf ${BUILD_PREFIX}/lib/cmake + set -e if [ -z "$PREFIX" ]; then @@ -9,13 +13,29 @@ fi if [ -z "$cuda" ] || [ "$cuda" == "False" ]; then CUDA_OPT="" else - CUDA_OPT="-DUSE_CUDA=ON" + CUDA_OPT="-DUSE_CUDA=ON -DUSE_CUBLAS=ON" +fi + +if [ "$target_platform" == "osx-64" ]; then + # macOS 64 bits + METAL_OPT="" # Conda can only target 10.9 for now + TOOLCHAIN_OPT="" +else + METAL_OPT="" + if [ "$target_platform" == "linux-64" ]; then + # Linux 64 bits + TOOLCHAIN_OPT="-DCMAKE_TOOLCHAIN_FILE=${RECIPE_DIR}/../cross-linux.cmake" + else + # Windows (or 32 bits, which we don't support) + METAL_OPT="" + TOOLCHAIN_OPT="" + fi fi rm -rf build || true mkdir -p build cd build -cmake $CUDA_OPT -DUSE_LLVM=ON -DINSTALL_DEV=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" .. -make -j4 VERBOSE=1 +cmake $METAL_OPT $CUDA_OPT -DUSE_LLVM=ON -DINSTALL_DEV=ON -DCMAKE_INSTALL_PREFIX="$PREFIX" $TOOLCHAIN_OPT .. +make -j${CPU_COUNT} VERBOSE=1 make install cd .. diff --git a/conda/tvm-libs/meta.yaml b/conda/tvm-libs/meta.yaml index 15c7de9563ad..fcb8f22cad25 100644 --- a/conda/tvm-libs/meta.yaml +++ b/conda/tvm-libs/meta.yaml @@ -8,21 +8,17 @@ source: path: ../.. 
build: - number: 1 + number: 0 string: cuda{{ cuda_version }}_{{ PKG_BUILDNUM }} # [cuda] requirements: build: - - {{ compiler('cxx') }} # [linux] - - llvmdev ==6.0.0 # [osx] - host: # The OS X build will require some manual setup or it will break - # See https://conda.io/docs/user-guide/tasks/build-packages/compiler-tools.html#macos-sdk - # It is also ass-backward because of llvm brokeness when mixed with the - # conda OS X compiler - - {{ compiler('cxx') }} # [osx] + # See https://docs.conda.io/projects/conda-build/en/latest/source/resources/compiler-tools.html#macos-sdk + - {{ compiler('cxx') }} + host: - cmake - - llvmdev ==6.0.0 # [linux] + - llvmdev ==6.0.0 - zlib # [linux] run: - {{ pin_compatible('cudatoolkit', lower_bound=cuda_version, max_pin='x.x') }} # [cuda] diff --git a/conda/tvm/meta.yaml b/conda/tvm/meta.yaml index d60c743b87eb..37adf5b4fe2e 100644 --- a/conda/tvm/meta.yaml +++ b/conda/tvm/meta.yaml @@ -8,7 +8,7 @@ source: path: ../.. build: - number: 1 + number: 0 requirements: build: From 6f94a1a8c9a7e90776f7067871aa151ea7ae1494 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sat, 9 Mar 2019 19:51:05 -0500 Subject: [PATCH 83/93] [COMMUNITY] @kevinthesun -> committer (#2760) --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index a0ab2a0c91a3..95e897820fec 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -24,6 +24,7 @@ We do encourage everyone to work anything they are interested in. 
- [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web - [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi +- [Yao Wang](https://github.com/kevinthesun): @kevinthesun: - topi, vision - [Eddie Yan](https://github.com/eqy): @eqy - runtime, autotvm, rpc, topi - [Lianmin Zheng](https://github.com/merrymercy) (PMC): @merrymercy - autotvm, topi, relay From f197307e6f51c4bb13ce2b74a64a39c8bd9a6e18 Mon Sep 17 00:00:00 2001 From: lee Date: Sun, 10 Mar 2019 09:02:27 +0800 Subject: [PATCH 84/93] [WIN] Fix a bug in find_llvm when specify llvm-config (#2758) --- cmake/util/FindLLVM.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/util/FindLLVM.cmake b/cmake/util/FindLLVM.cmake index 8497761a7116..f2ee945207b6 100644 --- a/cmake/util/FindLLVM.cmake +++ b/cmake/util/FindLLVM.cmake @@ -37,8 +37,9 @@ macro(find_llvm use_llvm) execute_process(COMMAND ${LLVM_CONFIG} --cxxflags OUTPUT_VARIABLE __llvm_cxxflags) execute_process(COMMAND ${LLVM_CONFIG} --version - COMMAND cut -b 1,3 - OUTPUT_VARIABLE TVM_LLVM_VERSION) + OUTPUT_VARIABLE __llvm_version) + # llvm version + string(REGEX REPLACE "^([^.]+)\.([^.])+\.[^.]+.*$" "\\1\\2" TVM_LLVM_VERSION ${__llvm_version}) # definitions string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_]*" LLVM_DEFINITIONS ${__llvm_cxxflags}) # include dir From 0c343c2499c655c9ef4db4e0e931a9cf0c53e353 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E9=9B=A8=E9=AD=94=E7=90=86=E6=B2=99?= Date: Sun, 10 Mar 2019 09:29:08 -0700 Subject: [PATCH 85/93] fix typo in backend interpreter (#2752) --- python/tvm/relay/backend/interpreter.py | 2 +- tests/python/relay/test_backend_interpreter.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/python/tvm/relay/backend/interpreter.py b/python/tvm/relay/backend/interpreter.py index 7a70a6e45e17..e927df22b201 100644 --- a/python/tvm/relay/backend/interpreter.py +++ 
b/python/tvm/relay/backend/interpreter.py @@ -95,7 +95,7 @@ def __init__(self, value): def _arg_to_ast(arg): if isinstance(arg, TensorValue): - return Constant(arg.data.copyto(_nd.cpu(0))) + return Constant(arg.data.copyto(nd.cpu(0))) elif isinstance(arg, np.ndarray): return Constant(nd.array(arg)) elif isinstance(arg, Constant): diff --git a/tests/python/relay/test_backend_interpreter.py b/tests/python/relay/test_backend_interpreter.py index 801b3068eff0..773af1f9fe0e 100644 --- a/tests/python/relay/test_backend_interpreter.py +++ b/tests/python/relay/test_backend_interpreter.py @@ -2,7 +2,7 @@ import tvm import tvm.testing from tvm import relay -from tvm.relay.backend.interpreter import Value, TupleValue +from tvm.relay.backend.interpreter import Value, TupleValue, TensorValue from tvm.relay.scope_builder import ScopeBuilder from tvm.relay import testing, create_executor @@ -135,6 +135,11 @@ def test_binds(): tvm.testing.assert_allclose(xx + xx, res) +def test_tensor_value(): + x = relay.var("x", shape=(1, 10)) + xx = np.ones((1, 10)).astype("float32") + check_eval(relay.Function([x], x), [TensorValue(xx)], xx) + def test_kwargs_params(): x = relay.var("x", shape=(1, 10)) y = relay.var("y", shape=(1, 10)) @@ -159,3 +164,4 @@ def test_kwargs_params(): test_binds() test_kwargs_params() test_ref() + test_tensor_value() From c8eb7d9af815dd2175f9ccfaef6c85454d927fca Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Sun, 10 Mar 2019 12:31:34 -0400 Subject: [PATCH 86/93] [ARITH] Analyzer RewriteSimplifier: add/sub/mul/div/mod (#2722) --- include/tvm/arithmetic.h | 35 + python/tvm/arith.py | 16 + src/api/api_arith.cc | 4 + src/arithmetic/analyzer.cc | 11 +- src/arithmetic/const_fold.h | 4 +- src/arithmetic/pattern_match.h | 51 +- src/arithmetic/rewrite_simplify.cc | 650 ++++++++++++++++++ tests/cpp/pattern_match_test.cc | 1 + .../unittest/test_arith_rewrite_simplify.py | 252 +++++++ 9 files changed, 1016 insertions(+), 8 deletions(-) create mode 100644 
src/arithmetic/rewrite_simplify.cc create mode 100644 tests/python/unittest/test_arith_rewrite_simplify.py diff --git a/include/tvm/arithmetic.h b/include/tvm/arithmetic.h index 44b00b5d89fa..d023f8f1cf7e 100644 --- a/include/tvm/arithmetic.h +++ b/include/tvm/arithmetic.h @@ -192,6 +192,39 @@ class ModularSetAnalyzer { Impl* impl_; }; +/*! + * \brief Rewrite-rule based simplifier. + */ +class RewriteSimplifier { + public: + /*! + * \brief analyze the expr + * \param expr The expression of interest. + * \return the result of the analysis. + */ + Expr operator()(const Expr& expr); + + /*! + * \brief Update binding of var to a new expression. + * + * \param var The variable of interest. + * \param new_expr + * \param override Whether do we allow override of existing information. + */ + void Update(const Var& var, + const Expr& new_expr, + bool override = false); + + private: + friend class Analyzer; + friend class ConstraintContext; + explicit RewriteSimplifier(Analyzer* parent); + ~RewriteSimplifier(); + class Impl; + /*! \brief Internal impl */ + Impl* impl_; +}; + /*! * \brief A RAII constraint context. * @@ -242,6 +275,8 @@ class Analyzer { ConstIntBoundAnalyzer const_int_bound; /*! \brief sub-analyzer: modular set */ ModularSetAnalyzer modular_set; + /*! \brief sub-analyzer rewrite simplfy */ + RewriteSimplifier rewrite_simplify; /*! \brief constructor */ Analyzer(); /*! 
diff --git a/python/tvm/arith.py b/python/tvm/arith.py index 92aaa36aa10f..3981a4815aeb 100644 --- a/python/tvm/arith.py +++ b/python/tvm/arith.py @@ -96,6 +96,7 @@ def __init__(self): self._const_int_bound_update = _mod("const_int_bound_update") self._bind = _mod("bind") self._modular_set = _mod("modular_set") + self._rewrite_simplify = _mod("rewrite_simplify") self._enter_constraint_context = _mod("enter_constraint_context") def const_int_bound(self, expr): @@ -128,6 +129,21 @@ def modular_set(self, expr): """ return self._modular_set(expr) + def rewrite_simplify(self, expr): + """Simplify expression via rewriting rules. + + Parameters + ---------- + expr : tvm.Expr + The expression. + + Returns + ------- + result : Expr + The result. + """ + return self._rewrite_simplify(expr) + def bind(self, var, expr): """Bind a variable to the expression. diff --git a/src/api/api_arith.cc b/src/api/api_arith.cc index a714fe37005b..cc7d814617a9 100644 --- a/src/api/api_arith.cc +++ b/src/api/api_arith.cc @@ -98,6 +98,10 @@ TVM_REGISTER_API("arith._CreateAnalyzer") return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { self->const_int_bound.Update(args[0], args[1], args[2]); }); + } else if (name == "rewrite_simplify") { + return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { + *ret = self->rewrite_simplify(args[0]); + }); } else if (name == "bind") { return PackedFunc([self](TVMArgs args, TVMRetValue *ret) { auto& sptr = args[1].node_sptr(); diff --git a/src/arithmetic/analyzer.cc b/src/arithmetic/analyzer.cc index 236a21ba71f5..81195eba2747 100644 --- a/src/arithmetic/analyzer.cc +++ b/src/arithmetic/analyzer.cc @@ -2,6 +2,7 @@ * Copyright (c) 2019 by Contributors * \file tvm/arithmetic/analyzer.cc */ +#include #include namespace tvm { @@ -9,19 +10,22 @@ namespace arith { Analyzer::Analyzer() : const_int_bound(this), - modular_set(this) { + modular_set(this), + rewrite_simplify(this) { } void Analyzer::Bind(const VarExpr& v, const Expr& expr) { Var var(v.node_); 
this->const_int_bound.Update(var, this->const_int_bound(expr)); this->modular_set.Update(var, this->modular_set(expr)); + this->rewrite_simplify.Update(var, this->rewrite_simplify(expr)); } void Analyzer::Bind(const VarExpr& v, const Range& range) { Var var(v.node_); this->const_int_bound.Bind(var, range); // skip modular_set + // skip rewrite simplify } ConstraintContext::ConstraintContext(Analyzer* analyzer, const Expr& constraint) { @@ -36,7 +40,10 @@ ConstraintContext::ConstraintContext(Analyzer* analyzer, const Expr& constraint) } bool Analyzer::CanProveGreaterEqual(const Expr& expr, int64_t lower_bound) { - auto bd = this->const_int_bound(expr); + if (const auto* ptr = expr.as()) { + return ptr->value > lower_bound; + } + auto bd = this->const_int_bound(this->rewrite_simplify(expr)); if (bd->min_value >= lower_bound) return true; return false; } diff --git a/src/arithmetic/const_fold.h b/src/arithmetic/const_fold.h index 91613867115b..4c247c8a7b59 100644 --- a/src/arithmetic/const_fold.h +++ b/src/arithmetic/const_fold.h @@ -23,7 +23,9 @@ namespace arith { * \return nullptr if constant fold fails, otherwise return folded result. */ template -inline Expr TryConstFold(Expr a, Expr b); +inline Expr TryConstFold(Expr a, Expr b) { + return Expr(); +} /*! * \brief Try to run unary compute with constant folding. 
diff --git a/src/arithmetic/pattern_match.h b/src/arithmetic/pattern_match.h index 50f2300dd4b7..20c24b330cbd 100644 --- a/src/arithmetic/pattern_match.h +++ b/src/arithmetic/pattern_match.h @@ -49,6 +49,7 @@ #include #include +#include "const_fold.h" namespace tvm { namespace arith { @@ -242,7 +243,11 @@ class PBinaryExpr : } Expr Eval() const { - return NodeType::make(a_.Eval(), b_.Eval()); + Expr lhs = a_.Eval(); + Expr rhs = b_.Eval(); + Expr ret = TryConstFold(lhs, rhs); + if (ret.defined()) return ret; + return NodeType::make(lhs, rhs); } private: @@ -250,12 +255,48 @@ class PBinaryExpr : typename TB::Nested b_; }; +template +class PConstWithTypeLike : + public Pattern > { + public: + PConstWithTypeLike(const TA& ref, int64_t value) + : ref_(ref), value_(value) {} + + void InitMatch_() const {} + + bool Match_(const NodeRef& node) const { + if (const ir::IntImm* ptr = node.as()) { + return ptr->value == value_; + } else { + return false; + } + } + + Expr Eval() const { + return make_const(ref_.Eval().type(), value_); + } + + private: + typename TA::Nested ref_; + int64_t value_; +}; + -#define TVM_PATTERN_BINARY_OP(FuncName, NodeName) \ - template \ - inline PBinaryExpr \ - FuncName(const Pattern& a, const Pattern& b) { \ +#define TVM_PATTERN_BINARY_OP(FuncName, NodeName) \ + template \ + inline PBinaryExpr \ + FuncName(const Pattern& a, const Pattern& b) { \ return PBinaryExpr(a.derived(), b.derived()); \ + } \ + template \ + inline PBinaryExpr > \ + FuncName(const Pattern& a, int64_t b) { \ + return FuncName(a, PConstWithTypeLike(a.derived(), b)); \ + } \ + template \ + inline PBinaryExpr, TA> \ + FuncName(int64_t b, const Pattern& a) { \ + return FuncName(PConstWithTypeLike(a.derived(), b), a); \ } // arithmetic expressions diff --git a/src/arithmetic/rewrite_simplify.cc b/src/arithmetic/rewrite_simplify.cc new file mode 100644 index 000000000000..b304a8dc4bf2 --- /dev/null +++ b/src/arithmetic/rewrite_simplify.cc @@ -0,0 +1,650 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file rewrite_simplify.cc + * \brief Rewrite-rule based simplification. + */ +// Acknowledgement: Most rewrite-rules are from Halide. +#include +#include +#include +#include "const_fold.h" +#include "pattern_match.h" + +namespace tvm { +namespace arith { + +using namespace ir; + +// macro for doing simple rewrite +#define TVM_TRY_REWRITE(SrcExpr, ResExpr) \ + if ((SrcExpr).Match(ret)) { \ + return (ResExpr).Eval(); \ + } + +// macro for rewrite + recursively rewrite ResExpr +#define TVM_TRY_RECURSIVE_REWRITE(SrcExpr, ResExpr) \ + if ((SrcExpr).Match(ret)) { \ + return RecursiveRewrite((ResExpr).Eval()); \ + } + +// macro rewrite only if CondExor is true after match. +#define TVM_TRY_REWRITE_IF(SrcExpr, ResExpr, CondExpr) \ + if ((SrcExpr).Match(ret) && (CondExpr)) { \ + return (ResExpr).Eval(); \ + } + +// macro rewrite + recursive_rewrite only if CondExor is true after match. +#define TVM_TRY_RECURSIVE_REWRITE_IF(SrcExpr, ResExpr, CondExpr) \ + if ((SrcExpr).Match(ret) && (CondExpr)) { \ + return RecursiveRewrite((ResExpr).Eval()); \ + } + + +// NOTE for developers: +// +// We mainly focus on index expression simplification. +// Besides the RewriteSimplifier, some cases can be better +// handled by CanonicalSimplifier. 
+// +class RewriteSimplifier::Impl : public IRMutator { + public: + explicit Impl(Analyzer* parent) + : parent_(parent) {} + + void Update(const Var& var, + const Expr& info, + bool override) { + if (!override) { + CHECK(!var_map_.count(var)); + } + var_map_[var] = info; + } + + // Run simplification in post order + Expr PostOrderSimplify(Expr expr, int max_iter = 2) { + for (int i = 0; i < max_iter; ++i) { + Expr new_expr = this->Mutate(expr); + if (new_expr.same_as(expr)) return expr; + expr = new_expr; + } + return expr; + } + + Expr Mutate_(const Add* op, const Expr& self) final; + Expr Mutate_(const Sub* op, const Expr& self) final; + Expr Mutate_(const Mul* op, const Expr& self) final; + Expr Mutate_(const Div* op, const Expr& self) final; + Expr Mutate_(const Mod* op, const Expr& self) final; + + private: + // reference to the main analyzer + Analyzer* parent_; + // counter to record recursive rewrite depth. + int recur_depth_{0}; + // internal variable map + std::unordered_map var_map_; + // maximum number of recursion allowed during a single pass. + static const constexpr int kMaxRecurDepth = 5; + // Whether x >= val + bool CanProveGreaterEqual(const Expr& x, int64_t val) { + return parent_->CanProveGreaterEqual(x, val); + } + // Whether x == val + bool CanProveEqual(const Expr& x, int64_t val) { + // TODO(tqchen) refer back to super-analyzer. 
+ Expr res = Mutate(x); + if (const auto* ptr = res.as()) { + return ptr->value == val; + } + return false; + } + // Recursive rewrite x + // we limit maximum depth of recursive rewrite allowed to + // avoid infinite loop + Expr RecursiveRewrite(const Expr& x) { + if (recur_depth_ >= kMaxRecurDepth) return x; + ++recur_depth_; + Expr res = Mutate(x); + --recur_depth_; + return res; + } + + template + PConstWithTypeLike ZeroWithTypeLike(const Pattern& pattern) { + return PConstWithTypeLike(pattern.derived(), 0); + } +}; + +Expr RewriteSimplifier::Impl:: +Mutate_(const Add* op, const Expr& self) { + Expr ret = IRMutator::Mutate_(op, self); + op = ret.as(); + Expr const_res = TryConstFold(op->a, op->b); + if (const_res.defined()) return const_res; + // Pattern var to match any expression + PVar x, y, z, b1, b2, s1, s2; + // Pattern var match IntImm + PVar c1, c2, c3; + // Pattern var for lanes in broadcast and ramp + PVar lanes; + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(ramp(b1, s1, lanes) + ramp(b2, s2, lanes), + ramp(b1 + b2, s1 + s2, lanes)); + TVM_TRY_REWRITE(ramp(b1, s1, lanes) + broadcast(x, lanes), + ramp(b1 + x, s1, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) + ramp(b1, s1, lanes), + ramp(x + b1, s1, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) + broadcast(y, lanes), + broadcast(x + y, lanes)); + } + + if (IsIndexType(op->type)) { + // Index rules + // cancelation rules + TVM_TRY_REWRITE((x - y) + y, x); + TVM_TRY_REWRITE(x + (y - x), y); + + TVM_TRY_REWRITE((x - y) + (y - z), x - z); + TVM_TRY_REWRITE((x - y) + (z - x), z - y); + + TVM_TRY_REWRITE(min(x, y - z) + z, min(x + z, y)); + TVM_TRY_REWRITE(min(x - z, y) + z, min(x, y + z)); + TVM_TRY_REWRITE(max(x, y - z) + z, max(x + z, y)); + TVM_TRY_REWRITE(max(x - z, y) + z, max(x, y + z)); + TVM_TRY_REWRITE(max(x, y) + min(x, y), x + y); + TVM_TRY_REWRITE(min(x, y) + max(x, y), x + y); + TVM_TRY_REWRITE(max(x, y) + min(y, x), x + y); + TVM_TRY_REWRITE(min(x, y) + max(y, x), x + y); 
+ + TVM_TRY_REWRITE_IF(min(x, y + c1) + c2, min(x + c2, y), + c1.Eval()->value == -c2.Eval()->value); + TVM_TRY_REWRITE_IF(min(x + c1, y) + c2, min(x, y + c2), + c1.Eval()->value == -c2.Eval()->value); + TVM_TRY_REWRITE_IF(max(x, y + c1) + c2, max(x + c2, y), + c1.Eval()->value == -c2.Eval()->value); + TVM_TRY_REWRITE_IF(max(x + c1, y) + c2, max(x, y + c2), + c1.Eval()->value == -c2.Eval()->value); + + // constant folding + // NOTE: canonicalization might better at this. + TVM_TRY_REWRITE((x + c1) + c2, x + (c1 + c2)); + + // mul co-efficient folding + TVM_TRY_REWRITE(x + x, x * 2); + TVM_TRY_REWRITE(x * y + x, x * (y + 1)); + TVM_TRY_REWRITE(y * x + x, x * (y + 1)); + TVM_TRY_REWRITE(x + y * x, x * (1 + y)); + TVM_TRY_REWRITE(x + x * y, x * (1 + y)); + TVM_TRY_REWRITE(x * y + x * z, x * (y + z)); + TVM_TRY_REWRITE(y * x + x * z, x * (y + z)); + TVM_TRY_REWRITE(x * y + z * x, x * (y + z)); + TVM_TRY_REWRITE(y * x + z * x, x * (y + z)); + + // modular-div simplification + // Always pre-condition on positive integer domain + TVM_TRY_REWRITE_IF( + (x / c1) * c1 + x % c1, x, + CanProveGreaterEqual(x.Eval(), 0) && c1.Eval()->value > 0); + + // canonicalization rule + // will try rewrite again after canonicalization. + TVM_TRY_RECURSIVE_REWRITE(x + (c1 - y), (x - y) + c1); + TVM_TRY_RECURSIVE_REWRITE(x + c1 + y, (x + y) + c1); + TVM_TRY_RECURSIVE_REWRITE(x + (c1 + y), (x + y) + c1); + TVM_TRY_RECURSIVE_REWRITE((y % c1) + x * c1, x * c1 + (y % c1)); + } + + // condition rules. 
+ TVM_TRY_REWRITE(select(x, b1, b2) + select(x, s1, s2), + select(x, b1 + s1, b2 + s2)); + // default value + return ret; +} + +Expr RewriteSimplifier::Impl:: +Mutate_(const Sub* op, const Expr& self) { + Expr ret = IRMutator::Mutate_(op, self); + op = ret.as(); + Expr const_res = TryConstFold(op->a, op->b); + if (const_res.defined()) return const_res; + // Pattern var to match any expression + PVar x, y, z, b1, b2, s1, s2; + // Pattern var match IntImm + PVar c1, c2, c3; + // Pattern var for lanes in broadcast and ramp + PVar lanes; + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(ramp(b1, s1, lanes) - ramp(b2, s2, lanes), + ramp(b1 - b2, s1 - s2, lanes)); + TVM_TRY_REWRITE(ramp(b1, s1, lanes) - broadcast(x, lanes), + ramp(b1 - x, s1, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) - ramp(b1, s1, lanes), + ramp(x - b1, 0 - s1, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) - broadcast(y, lanes), + broadcast(x - y, lanes)); + } + + if (IsIndexType(op->type)) { + // Index rules + // cancelation rules + TVM_TRY_REWRITE((x + y) - y, x); + TVM_TRY_REWRITE((x + y) - x, y); + TVM_TRY_REWRITE(x - (y + x), 0 - y); + TVM_TRY_REWRITE(x - (x + y), 0 - y); + + TVM_TRY_REWRITE(min(x, y) - x, min(0, y - x)); + TVM_TRY_REWRITE(min(x, y) - y, min(x - y, 0)); + TVM_TRY_REWRITE(max(x, y) - x, max(0, y - x)); + TVM_TRY_REWRITE(max(x, y) - y, max(x - y, 0)); + + TVM_TRY_REWRITE(x - max(x, y), min(0, x - y)); + TVM_TRY_REWRITE(y - max(x, y), min(y - x, 0)); + TVM_TRY_REWRITE(x - min(x, y), max(0, x - y)); + TVM_TRY_REWRITE(y - min(x, y), max(y - x, 0)); + + // mul co-efficient folding + TVM_TRY_REWRITE(x - x, ZeroWithTypeLike(x)); + TVM_TRY_REWRITE(x * y - x, x * (y - 1)); + TVM_TRY_REWRITE(y * x - x, x * (y - 1)); + TVM_TRY_REWRITE(x - y * x, x * (1 - y)); + TVM_TRY_REWRITE(x - x * y, x * (1 - y)); + TVM_TRY_REWRITE(x * y - x * z, x * (y - z)); + TVM_TRY_REWRITE(y * x - x * z, x * (y - z)); + TVM_TRY_REWRITE(x * y - z * x, x * (y - z)); + TVM_TRY_REWRITE(y * x - z * 
x, x * (y - z)); + + // constant cancelation + TVM_TRY_REWRITE((x + c1) - c2, x + (c1 - c2)); + TVM_TRY_REWRITE((c1 - x) - (c2 - y), (y - x) + (c1 - c2)); + + // cancelization rule involving 4 operands + TVM_TRY_REWRITE((x + y) - (x + z), y - z); + TVM_TRY_REWRITE((x + y) - (z + x), y - z); + TVM_TRY_REWRITE((y + x) - (z + x), y - z); + TVM_TRY_REWRITE((y + x) - (x + z), y - z); + + TVM_TRY_REWRITE(min(x + y, z) - x, min(y, z - x)); + TVM_TRY_REWRITE(min(y + x, z) - x, min(y, z - x)); + TVM_TRY_REWRITE(min(z, x + y) - x, min(z - x, y)); + TVM_TRY_REWRITE(min(z, y + x) - x, min(z - x, y)); + + TVM_TRY_REWRITE(x - min(x + y, z), max(0 - y, x - z)); + TVM_TRY_REWRITE(x - min(y + x, z), max(0 - y, x - z)); + TVM_TRY_REWRITE(x - min(z, x + y), max(x - z, 0 - y)); + TVM_TRY_REWRITE(x - min(z, y + x), max(x - z, 0 - y)); + + TVM_TRY_REWRITE(min(x, y) - min(y, x), ZeroWithTypeLike(x)); + TVM_TRY_REWRITE(max(x, y) - max(y, x), ZeroWithTypeLike(x)); + + TVM_TRY_REWRITE_IF(min(b1, b2) - min(s1, s2), b1 - s1, + CanProveEqual(((b1 - s1) - (b2 - s2)).Eval(), 0)); + + TVM_TRY_REWRITE_IF(min(b1, b2) - min(s1, s2), b1 - s2, + CanProveEqual(((b1 - s2) - (b2 - s1)).Eval(), 0)); + TVM_TRY_REWRITE_IF(max(b1, b2) - max(s1, s2), b1 - s1, + CanProveEqual(((b1 - s1) - (b2 - s2)).Eval(), 0)); + TVM_TRY_REWRITE_IF(max(b1, b2) - max(s1, s2), b1 - s2, + CanProveEqual(((b1 - s2) - (b2 - s1)).Eval(), 0)); + + // modular-div simplification + // Always pre-condition on positive integer domain + TVM_TRY_REWRITE_IF(x - (x / c1) * c1, x % c1, + CanProveGreaterEqual(x.Eval(), 0) && c1.Eval()->value > 0); + TVM_TRY_REWRITE_IF((x / c1) * c1 - x, 0 - (x % c1), + CanProveGreaterEqual(x.Eval(), 0) && c1.Eval()->value > 0); + TVM_TRY_REWRITE_IF((x + c1) / c3 - (x + c2) / c3, + ((x + (c1 % c3)) % c3 + (c1 - c2)) / c3, + CanProveGreaterEqual(x.Eval(), -c2.Eval()->value) && + c1.Eval()->value >= c2.Eval()->value && + c3.Eval()->value > 0); + TVM_TRY_REWRITE_IF((x + c1) / c3 - x / c3, + ((x + (c1 % c3)) % c3 + 
c1) / c3, + CanProveGreaterEqual(x.Eval(), 0) && + c1.Eval()->value >= 0 && + c3.Eval()->value > 0); + // canonicalization rule + // will try rewrite again after canonicalization. + TVM_TRY_REWRITE(x - c1, x + (0 - c1)); + TVM_TRY_RECURSIVE_REWRITE((x + c1) - y, (x - y) + c1); + TVM_TRY_RECURSIVE_REWRITE(x - (y - z), (x + z) - y); + TVM_TRY_RECURSIVE_REWRITE(x - y * c1, x + y * (0 - c1)); + } + + // condition rules. + TVM_TRY_REWRITE(select(x, b1, b2) - select(x, s1, s2), + select(x, b1 - s1, b2 - s2)); + TVM_TRY_REWRITE(select(x, y, z) - z, + select(x, y - z, ZeroWithTypeLike(z))); + TVM_TRY_REWRITE(select(x, y, z) - y, + select(x, ZeroWithTypeLike(y), z - y)); + return ret; +} + +Expr RewriteSimplifier::Impl:: +Mutate_(const Mul* op, const Expr& self) { + Expr ret = IRMutator::Mutate_(op, self); + op = ret.as(); + Expr const_res = TryConstFold(op->a, op->b); + if (const_res.defined()) return const_res; + // Pattern var to match any expression + PVar x, y, z, b1, b2, s1, s2; + // Pattern var match IntImm + PVar c1, c2; + // Pattern var for lanes in broadcast and ramp + PVar lanes; + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(broadcast(x, lanes) * broadcast(y, lanes), + broadcast(x * y, lanes)); + TVM_TRY_REWRITE(ramp(b1, s1, lanes) * broadcast(x, lanes), + ramp(b1 * x, s1 * x, lanes)); + TVM_TRY_REWRITE(broadcast(x, lanes) * ramp(b1, s1, lanes), + ramp(b1 * x, s1 * x, lanes)); + } + + if (IsIndexType(op->type)) { + // constant simplification rule + TVM_TRY_REWRITE((x + c1) * c2, x * c2 + c1 * c2); + TVM_TRY_REWRITE((x * c1) * c2, x * (c1 * c2)); + TVM_TRY_REWRITE(min(x, y) * max(x, y), x * y); + TVM_TRY_REWRITE(max(x, y) * min(x, y), x * y); + + // canonicalization + TVM_TRY_RECURSIVE_REWRITE(x * (c1 * y), (x * y) * c1); + TVM_TRY_RECURSIVE_REWRITE_IF( + (x - y) * c1, (y - x) * (0 - c1), + c1.Eval()->value < 0); + } + return ret; +} + +Expr RewriteSimplifier::Impl:: +Mutate_(const Div* op, const Expr& self) { + Expr ret = 
IRMutator::Mutate_(op, self); + op = ret.as
(); + Expr const_res = TryConstFold
(op->a, op->b); + if (const_res.defined()) return const_res; + // Pattern var to match any expression + PVar x, y, z, b1; + // Pattern var match IntImm + PVar c1, c2, c3; + // Pattern var for lanes in broadcast and ramp + PVar lanes; + + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(broadcast(x, lanes) / broadcast(y, lanes), + broadcast(x / y, lanes)); + // ramp / bcast + if ((ramp(b1, c1, lanes) / broadcast(c2, lanes)).Match(ret)) { + int64_t c1val = c1.Eval()->value; + int64_t c2val = c2.Eval()->value; + if (c1val % c2val == 0) { + return ramp(b1 / c2, c1 / c2, lanes).Eval(); + } + // If all possible indices in ramp are the same. + if (CanProveGreaterEqual(b1.Eval(), 0)) { + ModularSet bmod = parent_->modular_set(b1.Eval()); + int64_t ramp_min = bmod->base / c2val; + int64_t ramp_max = (bmod->base + (lanes.Eval() - 1) * c1val) / c2val; + if (bmod->coeff % c2val == 0 && ramp_min == ramp_max) { + return broadcast(b1 / c2, lanes).Eval(); + } + } + } + } + + if (IsIndexType(op->type)) { + // Be-aware of the division rules: + // We adopt the default C division uses truncation instead of floordiv. + // This means most rules need to check non-negativeness of the operands. + + // while it is always true for trunc div + // restrict to common case(positive div) + TVM_TRY_REWRITE_IF((x / c1) / c2, x / (c1 * c2), + c1.Eval()->value > 0 && c2.Eval()->value > 0); + + TVM_TRY_REWRITE_IF((x / c1 + c2) / c3, (x + c1 * c2) / (c1 * c3), + c1.Eval()->value > 0 && + c2.Eval()->value >= 0 && + c3.Eval()->value > 0 && + CanProveGreaterEqual(x.Eval(), 0)); + + if (((x * c1) / c2).Match(ret)) { + int64_t c1val = c1.Eval()->value; + int64_t c2val = c2.Eval()->value; + if (c1val > 0 && c2val > 0) { + if (c1val % c2val == 0) return (x * (c1 / c2)).Eval(); + if (c2val % c1val == 0) return (x / (c2 / c1)).Eval(); + } + } + + // Rules involving 2-operands. 
+ TVM_TRY_REWRITE_IF((x * c1 + y) / c2, x * (c1 / c2) + y / c2, + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF(min(x * c1, y) / c2, min(x * (c1 / c2), y / c2), + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF(max(x * c1, y) / c2, max(x * (c1 / c2), y / c2), + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF((y + x * c1) / c2, y / c2 + x * (c1 / c2), + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF(min(y, x * c1) / c2, min(y / c2, x * (c1 / c2)), + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF(max(y, x * c1) / c2, max(y / c2, x * (c1 / c2)), + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + // Rules involving 3-operands. 
+ TVM_TRY_REWRITE_IF((x * c1 + y + z) / c2, x * (c1 / c2) + (y + z)/ c2, + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + + TVM_TRY_REWRITE_IF((x * c1 - y + z) / c2, x * (c1 / c2) + (z - y)/ c2, + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((z - y).Eval(), 0)); + + TVM_TRY_REWRITE_IF((x * c1 + y - z) / c2, x * (c1 / c2) + (y - z)/ c2, + c1.Eval()->value >= 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y - z).Eval(), 0)); + + TVM_TRY_REWRITE_IF((y + x * c1 + z) / c2, x * (c1 / c2) + (y + z) / c2, + c1.Eval()->value > 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + + TVM_TRY_REWRITE_IF((x + c1) / c2, x / c2 + c1 / c2, + c1.Eval()->value > 0 && + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0)); + + TVM_TRY_REWRITE_IF((x + y) / x, y / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + TVM_TRY_REWRITE_IF((y + x) / x, y / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF(((x + y) + z) / x, (y + z) / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + TVM_TRY_REWRITE_IF(((y + x) + z) / x, (y + z) / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + TVM_TRY_REWRITE_IF((y + (z + x)) / x, (y + z) / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual((y + z).Eval(), 0)); + TVM_TRY_REWRITE_IF((y + (x + z)) / x, (y + z) / x + 1, + CanProveGreaterEqual(x.Eval(), 0) && + 
CanProveGreaterEqual((y + z).Eval(), 0)); + + TVM_TRY_REWRITE_IF((x * y) / y, x, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + TVM_TRY_REWRITE_IF((y * x) / y, x, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF((x * z + y) / z, x + y / z, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0) && + CanProveGreaterEqual(z.Eval(), 0)); + TVM_TRY_REWRITE_IF((z * x + y) / z, x + y / z, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0) && + CanProveGreaterEqual(z.Eval(), 0)); + TVM_TRY_REWRITE_IF((y + x * z) / z, y / z + x, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0) && + CanProveGreaterEqual(z.Eval(), 0)); + TVM_TRY_REWRITE_IF((y + z * x) / z, y / z + x, + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0) && + CanProveGreaterEqual(z.Eval(), 0)); + } + return ret; +} + + +Expr RewriteSimplifier::Impl:: +Mutate_(const Mod* op, const Expr& self) { + Expr ret = IRMutator::Mutate_(op, self); + op = ret.as(); + Expr const_res = TryConstFold(op->a, op->b); + if (const_res.defined()) return const_res; + + // Pattern var to match any expression + PVar x, y, z, b1; + // Pattern var match IntImm + PVar c1, c2, c3; + // Pattern var for lanes in broadcast and ramp + PVar lanes; + + // Vector rules + if (op->type.lanes() != 1) { + TVM_TRY_REWRITE(broadcast(x, lanes) % broadcast(y, lanes), + broadcast(x % y, lanes)); + + // ramp % bcast + if ((ramp(b1, c1, lanes) % broadcast(c2, lanes)).Match(ret)) { + int64_t c1val = c1.Eval()->value; + int64_t c2val = c2.Eval()->value; + if (c1val % c2val == 0) { + return broadcast(b1 % c2, lanes).Eval(); + } + // If all possible indices in ramp are the same. 
+ if (CanProveGreaterEqual(b1.Eval(), 0)) { + ModularSet bmod = parent_->modular_set(b1.Eval()); + int64_t ramp_min = bmod->base / c2val; + int64_t ramp_max = (bmod->base + (lanes.Eval() - 1) * c1val) / c2val; + if (bmod->coeff % c2val == 0) { + if (ramp_min == ramp_max) { + return ramp(bmod->base % c2, c1, lanes).Eval(); + } else { + return (ramp(bmod->base % c2, c1, lanes) % broadcast(c2, lanes)).Eval(); + } + } + } + } + } + + if (IsIndexType(op->type)) { + // Be-aware of the division rules: + // We adopt the default C division uses truncation instead of floordiv. + // This means most rules need to check non-negativeness of the operands. + TVM_TRY_REWRITE_IF((x * c1) % c2, ZeroWithTypeLike(x), + c2.Eval()->value != 0 && + c1.Eval()->value % c2.Eval()->value == 0); + + TVM_TRY_REWRITE_IF((x * c1 + y) % c2, y % c2, + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(y.Eval(), 0)); + + TVM_TRY_REWRITE_IF((x + c1) % c2, x % c2, + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0)); + + TVM_TRY_REWRITE_IF((x + y * c1) % c2, x % c2, + c2.Eval()->value > 0 && + c1.Eval()->value % c2.Eval()->value == 0 && + CanProveGreaterEqual(x.Eval(), 0) && + CanProveGreaterEqual(y.Eval(), 0)); + + // try modular analysis + if ((x % c1).Match(ret)) { + ModularSet mod = parent_->modular_set(x.Eval()); + int64_t c1val = c1.Eval()->value; + if (mod->coeff % c1val == 0 && + CanProveGreaterEqual(x.Eval(), 0)) { + return (mod->base % c1).Eval(); + } + } + } + return ret; +} + + +Expr RewriteSimplifier::operator()(const Expr& expr) { + return impl_->PostOrderSimplify(expr); +} + +void RewriteSimplifier::Update(const Var& var, + const Expr& info, + bool override) { + impl_->Update(var, info, override); +} + + +RewriteSimplifier::RewriteSimplifier(Analyzer* parent) + : impl_(new Impl(parent)) { +} + +RewriteSimplifier::~RewriteSimplifier() { + delete impl_; +} + +} // namespace arith +} // 
namespace tvm diff --git a/tests/cpp/pattern_match_test.cc b/tests/cpp/pattern_match_test.cc index 1945339a259c..ea1a8427e61a 100644 --- a/tests/cpp/pattern_match_test.cc +++ b/tests/cpp/pattern_match_test.cc @@ -117,6 +117,7 @@ TEST(Pattern, Integer) { // special case container of Expr CHECK((v * c).Match(tx * 3)); CHECK_EQ(c.Eval()->value, 3); + CHECK((v * 3).Match(tx * 3)); } // cannot match c to ty CHECK(!(v * c).Match(tx * ty)); diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py new file mode 100644 index 000000000000..bbfddddd41da --- /dev/null +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -0,0 +1,252 @@ +import tvm + +class RewriteChecker: + def __init__(self): + self.analyzer = tvm.arith.Analyzer() + + def verify(self, data, expected): + res = self.analyzer.rewrite_simplify(data) + assert tvm.ir_pass.Equal(res, expected), "data={}, res={}, expected={}".format( + data, res, expected) + + +def test_vector_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + # Add rules + ck.verify(tvm.expr.Ramp(x, 1, 4) + tvm.expr.Ramp(y, 2, 4), + tvm.expr.Ramp(x + y, 3, 4)) + ck.verify(tvm.expr.Ramp(x, 1, 2) + y, + tvm.expr.Ramp(x + y, 1, 2)) + ck.verify(y + tvm.expr.Ramp(x, 1, 2) , + tvm.expr.Ramp(y + x, 1, 2)) + ck.verify(y.astype("int32x2") + x.astype("int32x2"), + (y + x).astype("int32x2")) + # Sub rules + ck.verify(tvm.expr.Ramp(x, 4, 4) - tvm.expr.Ramp(y, 2, 4), + tvm.expr.Ramp(x - y, 2, 4)) + ck.verify(tvm.expr.Ramp(x, 1, 2) - y, + tvm.expr.Ramp(x - y, 1, 2)) + ck.verify(y - tvm.expr.Ramp(x, 1, 2) , + tvm.expr.Ramp(y - x, -1, 2)) + ck.verify(y.astype("int32x2") - x.astype("int32x2"), + (y - x).astype("int32x2")) + + # Mul rules + ck.verify(y.astype("int32x2") * x.astype("int32x2"), + (y * x).astype("int32x2")) + ck.verify(tvm.expr.Ramp(x, 4, 4) * 2, + tvm.expr.Ramp(x * 2, 8, 4)) + ck.verify(2 * tvm.expr.Ramp(x, 4, 4), + tvm.expr.Ramp(x * 2, 8, 
4)) + + ## Div rules + ck.verify(y.astype("int32x2") / x.astype("int32x2"), + (y / x).astype("int32x2")) + ck.verify(tvm.expr.Ramp(x, 4, 4) / 2, + tvm.expr.Ramp(x/ 2, 2, 4)) + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.verify(tvm.expr.Ramp(x * 8 + 1, 1, 4) / 8, + (x).astype("int32x4")) + ck.verify(tvm.expr.Ramp(x * 8 + 15, 1, 4) / 8, + tvm.expr.Ramp(x * 8 + 15, 1, 4) / 8) + + ## Mod rules + ck.verify(y.astype("int32x2") % x.astype("int32x2"), + (y % x).astype("int32x2")) + ck.verify(tvm.expr.Ramp(x, 4, 4) % 2, + tvm.expr.Broadcast(x % 2, 4)) + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.verify(tvm.expr.Ramp(x * 8 + 1, 1, 4) % 8, + tvm.expr.Ramp(1, 1, 4)) + ck.verify(tvm.expr.Ramp(x * 8 + 1, 15, 4) % 8, + tvm.expr.Ramp(1, 15, 4) % 8) + + + +def test_select_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + # Add rules + ck.verify(tvm.expr.Select(x > 0, y, 0) + tvm.expr.Select(x > 0, 1, z), + tvm.expr.Select(x > 0, y + 1, z)) + ck.verify(tvm.expr.Select(x > 0, y, 1) - tvm.expr.Select(x > 0, 1, z), + tvm.expr.Select(x > 0, y + (-1), 1 - z)) + ck.verify(tvm.expr.Select(x > 0, y, z) - y, + tvm.expr.Select(x > 0, 0, z - y)) + ck.verify(tvm.expr.Select(x > 0, y, z) - z, + tvm.expr.Select(x > 0, y - z, 0)) + + +def test_add_index_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + + ck.verify(x + (y - x), y) + ck.verify(x - (y + 1) + (y + 1), x) + ck.verify((x - 10) + (10 - z), x - z) + ck.verify((x - y) + (z - x), z - y) + + ck.verify(tvm.min(x, y - z) + z, tvm.min(x + z, y)) + ck.verify(tvm.min(x - z, y) + z, tvm.min(x, y + z)) + ck.verify(tvm.max(x, y - 10) + 10, tvm.max(x + 10, y)) + ck.verify(tvm.max(x - 11, y) + 11, tvm.max(x, y + 11)) + + ck.verify(tvm.max(x, y * 2) + tvm.min(x, y * 2), x + y * 2); + ck.verify(tvm.min(x, y * 2) + tvm.max(x, y * 2), x + y * 2); + + ck.verify(tvm.max(x, y + 2) + (-2), tvm.max(x + (-2), y)); + 
ck.verify(tvm.min(x, y + 2) + (-2), tvm.min(x + (-2), y)); + ck.verify(tvm.min(x + 2, y + 3) + (-2), tvm.min(x, y + 1)); + + ck.verify(x * y + x * 10, x * (y + 10)) + ck.verify(y * x + x * 10, x * (y + 10)) + ck.verify(y * x + 10 * x, x * (y + 10)) + ck.verify(x * y + 10 * x, x * (y + 10)) + + ck.verify(y * (x % 8) + 10 * (x % 8), (x % 8) * (y + 10)) + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.verify((x / 8) * 8 + x % 8, x) + + # canonicalization + ck.verify(x + 2 + 3 + 4 + x, x * 2 + 9); + ck.verify(x + 2 + 3 + 4 + x * 3, x * 4 + 9); + + # conservative bound + try: + ck.analyzer.update(x, tvm.arith.ConstIntBound(-1, 1000), override=True) + ck.verify((x / 8) * 8 + x % 8, x) + raise RuntimeError("bad") + except AssertionError: + pass + + +def test_sub_index_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + + ck.verify(x + y - y, x) + ck.verify(x + y - x, y) + ck.verify(x - (y + x), 0 - y) + ck.verify(x - (x + y), 0 - y) + + ck.verify(tvm.min(x, y) - x, tvm.min(0, y - x)) + ck.verify(tvm.min(x, y) - y, tvm.min(x - y, 0)) + ck.verify(tvm.max(x, y) - x, tvm.max(0, y - x)) + ck.verify(tvm.max(x, y) - y, tvm.max(x - y, 0)) + + ck.verify(x - tvm.min(x, y), tvm.max(0, x - y)) + ck.verify(y - tvm.min(x, y), tvm.max(y - x, 0)) + ck.verify(x - tvm.max(x, y), tvm.min(0, x - y)) + ck.verify(y - tvm.max(x, y), tvm.min(y - x, 0)) + + # mul co-efficient foldng + ck.verify(x - x, 0) + ck.verify(x * y - x, x * (y + (-1))) + ck.verify(x * y - 10 * x, x * (y + (-10))) + ck.verify(y * x - x * z, x * (y - z)) + ck.verify(y * x - z * x, x * (y - z)) + + ck.verify(x + 10 - 20, x + (-10)) + + # 4-operands pattern + ck.verify((x + y) - (x + z), y - z) + ck.verify((y + x) - (x + z), y - z) + ck.verify((x + y) - (z + x), y - z) + ck.verify((y + x) - (z + x), y - z) + + ck.verify(tvm.min(x + y, z) - x, tvm.min(y, z - x)) + ck.verify(tvm.min(y + x, z) - x, tvm.min(y, z - x)) + ck.verify(tvm.min(z, x + y) - x, tvm.min(z - 
x, y)) + ck.verify(tvm.min(z, y + x) - x, tvm.min(z - x, y)) + + ck.verify(x - tvm.min(x + y, z), tvm.max(0 - y, x - z)) + ck.verify(x - tvm.min(y + x, z), tvm.max(0 - y, x - z)) + ck.verify(x - tvm.min(z, x + y), tvm.max(x - z, 0 - y)) + ck.verify(x - tvm.min(z, y + x), tvm.max(x - z, 0 - y)) + + ck.verify(tvm.min(x, y) - tvm.min(y, x), 0) + ck.verify(tvm.max(x, y) - tvm.max(y, x), 0) + ck.verify(tvm.min(x, y) - tvm.min(x + 10, y + 10), -10) + ck.verify(tvm.min(x + 10, y + 1) - tvm.min(x, y - 9), 10) + + # div pattern + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.verify(x - (x / 3) * 3, x % 3) + ck.verify((x + 5) / 3 - x / 3, (((x + 2) % 3) + 5)/ 3) + + +def test_mul_index_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + ck.verify((x + 2) * 3, x * 3 + 6) + ck.verify((x * 2) * 3, x * 6) + ck.verify(tvm.min(x, y) * tvm.max(x, y), x * y) + ck.verify(tvm.max(x, y) * tvm.min(x, y), x * y) + ck.verify((x - y) * (-2), (y - x) * 2) + + +def test_div_index_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.analyzer.update(y, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.analyzer.update(z, tvm.arith.ConstIntBound(0, 1000), override=True) + + ck.verify(x / 2 / 3, x / 6) + ck.verify((x / 2 + 1) / 3, (x + 2) / 6) + ck.verify(x * 2 / 4, x / 2) + ck.verify(x * 4 / 2, x * 2) + + ck.verify((x * 4 + y) / 2, x * 2 + y / 2) + ck.verify(tvm.min(x * 6, y) / 2, tvm.min(x * 3, y / 2)) + ck.verify(tvm.max(x * 6, y) / 2, tvm.max(x * 3, y / 2)) + + ck.verify((y + x * 4) / 2, y / 2 + x * 2) + ck.verify(tvm.min(y, x * 6) / 2, tvm.min(y / 2, x * 3)) + ck.verify(tvm.max(y, x * 6) / 2, tvm.max(y / 2, x * 3)) + + # 3-operands + ck.verify((x * 6 + y + z) / 2, x * 3 + (y + z) / 2) + ck.verify((x * 6 - y + (y + 3)) / 2, x * 3 + 1) + ck.verify((x * 6 + (y + 3) - y) / 2, x * 3 + 1) + ck.verify((y + x * 6 + 
z) / 2, x * 3 + (y + z) / 2) + ck.verify((x + 4) / 2, x / 2 + 2) + + ck.verify((x + y) / x, y / x + 1) + ck.verify((y + x) / x, y / x + 1) + ck.verify(((x + y) + z) / x, (y + z) / x + 1) + ck.verify(((y + x) + z) / x, (y + z) / x + 1) + ck.verify((y + (x + z)) / x, (y + z) / x + 1) + ck.verify((y + (z + x)) / x, (y + z) / x + 1) + + ck.verify((x * y) / y, x) + ck.verify((y * x) / y, x) + + ck.verify((x * z + y) / z, x + y / z) + ck.verify((z * x + y) / z, x + y / z) + ck.verify((y + x * z) / z, y / z + x) + ck.verify((y + z * x) / z, y / z + x) + + +def test_mod_index_simplify(): + ck = RewriteChecker() + x, y, z = tvm.var("x"), tvm.var("y"), tvm.var("z") + ck.analyzer.update(x, tvm.arith.ConstIntBound(0, 1000), override=True) + ck.analyzer.update(y, tvm.arith.ConstIntBound(0, 1000), override=True) + + ck.verify(x * 10 % 2, 0) + ck.verify((x * 10 + y) % 2, y % 2) + ck.verify((x + 10) % 2, x % 2) + ck.verify((x + y * 10) % 2, x % 2) + ck.verify((x* 10 + 1 + y * 2 + 2) % 2, 1) + + +if __name__ == "__main__": + test_mod_index_simplify() + test_vector_simplify() + test_add_index_simplify() + test_sub_index_simplify() + test_mul_index_simplify() + test_div_index_simplify() + test_select_simplify() From c96bd9a3ebe5506da894eca943bb12e2b902470f Mon Sep 17 00:00:00 2001 From: abergeron Date: Sun, 10 Mar 2019 12:32:44 -0400 Subject: [PATCH 87/93] Add the new logical operators to the doc. (#2761) --- docs/langref/relay_op.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index f706be08009d..d6f85c5e9b1a 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -112,6 +112,9 @@ This level enables additional math and transform operators. tvm.relay.greater_equal tvm.relay.less tvm.relay.less_equal + tvm.relay.logical_and + tvm.relay.logical_or + tvm.relay.logical_not tvm.relay.maximum tvm.relay.minimum tvm.relay.power @@ -234,6 +237,9 @@ Level 4 Definitions .. 
autofunction:: tvm.relay.greater_equal .. autofunction:: tvm.relay.less .. autofunction:: tvm.relay.less_equal +.. autofunction:: tvm.relay.logical_and +.. autofunction:: tvm.relay.logical_or +.. autofunction:: tvm.relay.logical_not .. autofunction:: tvm.relay.maximum .. autofunction:: tvm.relay.minimum .. autofunction:: tvm.relay.power From 2fb9f511dd0b44433646d684872eeb1f9036a173 Mon Sep 17 00:00:00 2001 From: Yong Wu <55wuyong@163.com> Date: Sun, 10 Mar 2019 21:06:09 -0700 Subject: [PATCH 88/93] update relay python api doc (#2766) --- docs/api/python/relay/backend.rst | 3 --- docs/api/python/relay/base.rst | 3 +++ docs/api/python/relay/build_module.rst | 2 ++ docs/api/python/relay/expr.rst | 14 +++++++++---- docs/api/python/relay/frontend.rst | 8 ++++++++ docs/api/python/relay/image.rst | 2 +- docs/api/python/relay/op.rst | 28 +++++++++++++++++++++----- python/tvm/relay/op/__init__.py | 5 ++--- 8 files changed, 49 insertions(+), 16 deletions(-) diff --git a/docs/api/python/relay/backend.rst b/docs/api/python/relay/backend.rst index 5cbc250b55ba..a6085c3232ef 100644 --- a/docs/api/python/relay/backend.rst +++ b/docs/api/python/relay/backend.rst @@ -3,9 +3,6 @@ tvm.relay.backend .. automodule:: tvm.relay.backend -Interpreter ------------ - .. automodule:: tvm.relay.backend.interpreter :members: diff --git a/docs/api/python/relay/base.rst b/docs/api/python/relay/base.rst index 72315dca0193..f2d0db409100 100644 --- a/docs/api/python/relay/base.rst +++ b/docs/api/python/relay/base.rst @@ -12,5 +12,8 @@ tvm.relay.base .. autoclass:: tvm.relay.base.Span :members: +.. autoclass:: tvm.relay.base.SourceName + :members: + .. autoclass:: tvm.relay.base.Id :members: diff --git a/docs/api/python/relay/build_module.rst b/docs/api/python/relay/build_module.rst index a278940f0fd5..b33f1870d5a5 100644 --- a/docs/api/python/relay/build_module.rst +++ b/docs/api/python/relay/build_module.rst @@ -5,6 +5,8 @@ tvm.relay.build_module .. autofunction:: tvm.relay.build_module.build +.. 
autofunction:: tvm.relay.build_module.build_config + .. autofunction:: tvm.relay.build_module.optimize .. autofunction:: tvm.relay.build_module.create_executor diff --git a/docs/api/python/relay/expr.rst b/docs/api/python/relay/expr.rst index 540d6bfbab65..c21e583f042b 100644 --- a/docs/api/python/relay/expr.rst +++ b/docs/api/python/relay/expr.rst @@ -39,15 +39,21 @@ tvm.relay.expr .. autoclass:: tvm.relay.expr.TupleGetItem :members: -.. autoclass:: tvm.relay.expr.TempExpr +.. autoclass:: tvm.relay.expr.RefCreate + :members: + +.. autoclass:: tvm.relay.expr.RefRead + :members: + +.. autoclass:: tvm.relay.expr.RefWrite :members: -.. autoclass:: tvm.relay.expr.ExprFunctor +.. autoclass:: tvm.relay.expr.TupleGetItem :members: -.. autoclass:: tvm.relay.expr.ExprMutator +.. autoclass:: tvm.relay.expr.TempExpr :members: .. autoclass:: tvm.relay.expr.TupleWrapper - :members + :members: diff --git a/docs/api/python/relay/frontend.rst b/docs/api/python/relay/frontend.rst index 054d3cecc1c5..2a22982a1cdf 100644 --- a/docs/api/python/relay/frontend.rst +++ b/docs/api/python/relay/frontend.rst @@ -9,3 +9,11 @@ tvm.relay.frontend .. autofunction:: tvm.relay.frontend.from_keras .. autofunction:: tvm.relay.frontend.from_onnx + +.. autofunction:: tvm.relay.frontend.from_tflite + +.. autofunction:: tvm.relay.frontend.from_coreml + +.. autofunction:: tvm.relay.frontend.from_caffe2 + +.. autofunction:: tvm.relay.frontend.from_tensorflow diff --git a/docs/api/python/relay/image.rst b/docs/api/python/relay/image.rst index 223213eca8e3..862dcbbd1fc7 100644 --- a/docs/api/python/relay/image.rst +++ b/docs/api/python/relay/image.rst @@ -5,5 +5,5 @@ tvm.relay.image .. automodule:: tvm.relay.image :members: -.. automodule:: tvm.relay.op.image.image +.. 
automodule:: tvm.relay.op.image :members: diff --git a/docs/api/python/relay/op.rst b/docs/api/python/relay/op.rst index 7413a818f73f..36a7aa00d7b7 100644 --- a/docs/api/python/relay/op.rst +++ b/docs/api/python/relay/op.rst @@ -3,8 +3,27 @@ tvm.relay.op .. automodule:: tvm.relay.op :members: -.. automodule:: tvm.relay.op.op - :members: +.. autofunction:: tvm.relay.op.Op + +.. autofunction:: tvm.relay.op.OpPattern + +.. autofunction:: tvm.relay.op.get + +.. autofunction:: tvm.relay.op.register + +.. autofunction:: tvm.relay.op.register_schedule + +.. autofunction:: tvm.relay.op.register_pattern + +.. autofunction:: tvm.relay.op.register_compute + +.. autofunction:: tvm.relay.op.register_gradient + +.. autofunction:: tvm.relay.op.register_alter_op_layout + +.. autofunction:: tvm.relay.op.schedule_injective + +.. autofunction:: tvm.relay.op.debug .. automodule:: tvm.relay.op.reduce :members: @@ -15,11 +34,10 @@ tvm.relay.op .. automodule:: tvm.relay.op.transform :members: -.. automodule:: tvm.relay.op.nn.nn +.. automodule:: tvm.relay.op.nn :members: .. automodule:: tvm.relay.op.vision.multibox :members: -.. automodule:: tvm.relay.op.vision.nms - :members: +.. 
autofunction:: tvm.relay.vision.nms diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index 84b0ceef8524..9dd2bf88c934 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -1,9 +1,8 @@ #pylint: disable=wildcard-import, redefined-builtin """Relay core operators.""" # operator defs -from .op import get, register, register_schedule, register_compute, register_alter_op_layout, \ - Op -from .op import debug +from .op import get, register, register_schedule, register_compute, register_gradient, \ + register_pattern, register_alter_op_layout, schedule_injective, Op, OpPattern, debug # Operators from .reduce import * From 145698e9d720ca9d096f908a5e99d67d2036b2d8 Mon Sep 17 00:00:00 2001 From: Leyuan Wang Date: Sun, 10 Mar 2019 23:29:51 -0700 Subject: [PATCH 89/93] [Relay/TOPI][Frontend] Add tile and repeat operators in Relay and TOPI (#2720) * tile and repeat operator added in rely * fix pylint * fix make warnings * comments addressed * fix lint error * comment addressed --- docs/api/python/topi.rst | 4 + docs/langref/relay_op.rst | 4 + include/tvm/relay/attrs/transform.h | 22 +++ python/tvm/relay/frontend/mxnet.py | 22 +++ python/tvm/relay/op/_transform.py | 2 + python/tvm/relay/op/transform.py | 69 +++++++++ src/relay/op/tensor/transform.cc | 169 +++++++++++++++++++++++ topi/include/topi/transform.h | 109 +++++++++++++++ topi/python/topi/transform.py | 39 ++++++ topi/src/topi.cc | 10 ++ topi/tests/python/test_topi_transform.py | 56 ++++++++ 11 files changed, 506 insertions(+) diff --git a/docs/api/python/topi.rst b/docs/api/python/topi.rst index f0fc78909258..06f4f0d61f34 100644 --- a/docs/api/python/topi.rst +++ b/docs/api/python/topi.rst @@ -73,6 +73,8 @@ List of operators topi.logical_not topi.arange topi.stack + topi.repeat + topi.tile topi.layout_transform topi.image.resize @@ -132,6 +134,8 @@ topi .. autofunction:: topi.less .. autofunction:: topi.arange .. autofunction:: topi.stack +.. 
autofunction:: topi.repeat +.. autofunction:: topi.tile .. autofunction:: topi.layout_transform topi.nn diff --git a/docs/langref/relay_op.rst b/docs/langref/relay_op.rst index d6f85c5e9b1a..f20c443e8404 100644 --- a/docs/langref/relay_op.rst +++ b/docs/langref/relay_op.rst @@ -97,6 +97,8 @@ This level enables additional math and transform operators. tvm.relay.split tvm.relay.arange tvm.relay.stack + tvm.relay.repeat + tvm.relay.tile **Level 4: Broadcast and Reductions** @@ -225,6 +227,8 @@ Level 3 Definitions .. autofunction:: tvm.relay.split .. autofunction:: tvm.relay.arange .. autofunction:: tvm.relay.stack +.. autofunction:: tvm.relay.repeat +.. autofunction:: tvm.relay.tile Level 4 Definitions diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index fea2c960d032..5382017d8c1c 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -124,6 +124,28 @@ struct StackAttrs : public tvm::AttrsNode { } }; // struct StackAttrs +/*! \brief Attributes used in repeat operators */ +struct RepeatAttrs : public tvm::AttrsNode { + Integer repeats; + Integer axis; + TVM_DECLARE_ATTRS(RepeatAttrs, "relay.attrs.RepeatAttrs") { + TVM_ATTR_FIELD(repeats) + .describe("The number of repetitions for each element."); + TVM_ATTR_FIELD(axis).set_default(NullValue()) + .describe(" The axis along which to repeat values."); + } +}; // struct RepeatAttrs + +/*! \brief Attributes used in tile operators */ +struct TileAttrs : public tvm::AttrsNode { + Array reps; + TVM_DECLARE_ATTRS(TileAttrs, "relay.attrs.TileAttrs") { + TVM_ATTR_FIELD(reps) + .describe("The number of times for repeating the tensor a." + "Each dim sizeof reps must be a positive integer."); + } +}; // struct TileAttrs + /*! \brief Attributes used in squeeze operators */ struct SqueezeAttrs : public tvm::AttrsNode { // use axis to make the name numpy compatible. 
diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 2e0ccd07fdc1..e95d0455f1fb 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -166,6 +166,10 @@ def _mx_dropout(inputs, attrs): return _op.nn.dropout(inputs[0], rate=rate) +def _mx_BlockGrad(inputs, attrs): #pylint: disable=unused-argument + return inputs + + def _mx_batch_norm(inputs, attrs): if attrs.get_bool("output_mean_var", False): raise RuntimeError("batch_norm do not support output_mean_var") @@ -357,6 +361,21 @@ def _mx_arange(inputs, attrs): return _op.arange(**new_attrs) +def _mx_repeat(inputs, attrs): + assert len(inputs) == 1 + new_attrs = {} + new_attrs["repeats"] = attrs.get_int("repeats") + new_attrs["axis"] = attrs.get_int("axis", 0) + return _op.repeat(inputs[0], **new_attrs) + + +def _mx_tile(inputs, attrs): + assert len(inputs) == 1 + new_attrs = {} + new_attrs["reps"] = attrs.get_int_tuple("reps") + return _op.tile(inputs[0], **new_attrs) + + def _mx_roi_align(inputs, attrs): new_attrs = {} new_attrs["pooled_size"] = attrs.get_int_tuple("pooled_size") @@ -490,6 +509,9 @@ def _mx_proposal(inputs, attrs): "batch_dot" : _mx_batch_dot, "LeakyReLU" : _mx_leaky_relu, "_arange" : _mx_arange, + "repeat" : _mx_repeat, + "tile" : _mx_tile, + "BlockGrad" : _mx_BlockGrad, "SoftmaxOutput" : _mx_softmax_output, "SoftmaxActivation" : _mx_softmax_activation, # vision diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 1389f96b8325..2b43c21f8e10 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -19,6 +19,8 @@ _reg.register_schedule("full", schedule_injective) _reg.register_schedule("full_like", schedule_injective) _reg.register_schedule("arange", schedule_injective) +_reg.register_schedule("repeat", schedule_broadcast) +_reg.register_schedule("tile", schedule_broadcast) _reg.register_schedule("cast", schedule_injective) 
_reg.register_schedule("strided_slice", schedule_injective) _reg.register_schedule("slice_like", schedule_injective) diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 845ee02b0582..75f1bdc60174 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -316,6 +316,75 @@ def stack(data, axis): return _make.stack(data, axis) +def repeat(data, repeats, axis): + """Repeats elements of an array. + By default, repeat flattens the input array into 1-D and then repeats the elements. + + repeats : int + The number of repetitions for each element. + + axis: int + The axis along which to repeat values. The negative numbers are interpreted + counting from the backward. By default, use the flattened input array, and + return a flat output array. + + Returns + ------- + ret : relay.Expr + The computed result. + + Examples + -------- + .. code-block:: python + + x = [[1, 2], [3, 4]] + relay.repeat(x, repeats=2) = [1., 1., 2., 2., 3., 3., 4., 4.] + + relay.repeat(x, repeats=2, axis=1) = [[1., 1., 2., 2.], + [3., 3., 4., 4.]] + """ + return _make.repeat(data, repeats, axis) + + +def tile(data, reps): + """Repeats the whole array multiple times. + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + reps : tuple of int + The number of times repeating the tensor data. + + .. note:: + Each dim size of reps must be a positive integer. If reps has length d, + the result will have dimension of max(d, data.ndim); If data.ndim < d, + data is promoted to be d-dimensional by prepending new axes. + If data.ndim >= d, reps is promoted to a.ndim by pre-pending 1's to it. + + Returns + ------- + ret : relay.Expr + The computed result. + + Examples + -------- + .. 
code-block:: python + + x = [[1, 2], [3, 4]] + relay.tile(x, reps=(2,3)) = [[1., 2., 1., 2., 1., 2.], + [3., 4., 3., 4., 3., 4.], + [1., 2., 1., 2., 1., 2.], + [3., 4., 3., 4., 3., 4.]] + + relay.tile(x, reps=(2,)) = [[1., 2., 1., 2.], + [3., 4., 3., 4.]] + """ + + return _make.tile(data, reps) + + def where(condition, x, y): """Selecting elements from either x or y depending on the value of the condition. diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index de3ac03977f4..142a16b0b307 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1035,6 +1035,175 @@ RELAY_REGISTER_OP("arange") .set_attr("FTVMCompute", ArangeCompute) .set_attr("TOpPattern", kInjective); +// repeat operator +TVM_REGISTER_NODE_TYPE(RepeatAttrs); + +bool RepeatRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + // `types` contains: [data, result] + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) { + CHECK(types[0].as()) + << "repeat: expect input type to be TensorType but get " + << types[0]; + return false; + } + const auto* param = attrs.as(); + const int ndim = static_cast(data->shape.size()); + const int repeats = param->repeats; + const int axis = param->axis; + CHECK(repeats >= 1) + << "repeat only accepts `repeats >= 1`" + << ", but got repeats = " << repeats; + CHECK(-ndim - 1 <= axis && axis <= ndim) + << "repeat only accepts `axis` in [-data.ndim - 1, data.ndim]" + << ", but got axis = " << axis + << ", and data.ndim = " << ndim; + const int pivot = axis < 0 ? 
ndim + axis : axis; + std::vector oshape; + oshape.reserve(ndim + repeats); + for (int i = 0; i < pivot; ++i) { + oshape.emplace_back(data->shape[i]); + } + oshape.emplace_back(data->shape[pivot] * repeats); + for (int i = pivot + 1; i < ndim; ++i) { + oshape.emplace_back(data->shape[i]); + } + reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype)); + return true; +} + +Array RepeatCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + const RepeatAttrs *param = attrs.as(); + CHECK(param != nullptr); + return { topi::repeat(inputs[0], param->repeats, param->axis) }; +} + +Expr MakeRepeat(Expr data, + int repeats, + int axis) { + auto attrs = make_node(); + attrs->repeats = repeats; + attrs->axis = axis; + static const Op& op = Op::Get("repeat"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op._make.repeat") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeRepeat, args, rv); +}); + +RELAY_REGISTER_OP("repeat") +.describe(R"code(Repeat elements of an array `repeats` times along axis `axis` + +- **data**: The input data to the operator. 
+ +)code" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_attrs_type_key("relay.attrs.Repeat") +.add_argument("data", "Tensor", "The input tensor.") +.set_support_level(1) +.add_type_rel("Repeat", RepeatRel) +.set_attr("FTVMCompute", RepeatCompute) +.set_attr("TOpPattern", kBroadcast); + +// tile operator +TVM_REGISTER_NODE_TYPE(TileAttrs); + +bool TileRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + // `types` contains: [data, result] + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) { + CHECK(types[0].as()) + << "tile: expect input type to be TensorType but get " + << types[0]; + return false; + } + const auto* param = attrs.as(); + const size_t ndim = data->shape.size(); + const Array& reps = param->reps; + // check dimension match + CHECK(!reps.defined()) + << "repetition array is not defined. data.ndim = " << ndim; + const size_t rndim = reps.size(); + size_t tndim = (ndim > rndim) ? ndim : rndim; + // re-construct data shape or reps shape + std::vector data_shape; + std::vector reps_shape; + data_shape.reserve(tndim); + reps_shape.reserve(tndim); + if (ndim == rndim) { + for (size_t i = 0; i < tndim; ++i) { + data_shape.emplace_back(data->shape[i]); + reps_shape.emplace_back(reps[i]); + } + } else if (ndim > rndim) { + for (size_t i = 0; i < ndim; ++i) + data_shape.emplace_back(data->shape[i]); + for (size_t i = 0; i < (ndim - rndim); ++i) + reps_shape.emplace_back(1); + for (size_t i = 0; i < rndim; ++i) + reps_shape.emplace_back(reps[i]); + } else { + for (size_t i = 0; i < rndim; ++i) + reps_shape.emplace_back(reps[i]); + } + std::vector oshape; + oshape.reserve(tndim); + for (size_t i = 0; i < tndim; ++i) { + oshape.emplace_back(data_shape[i] * reps_shape[i]); + } + reporter->Assign(types[1], TensorTypeNode::make(oshape, data->dtype)); + return true; +} + +Array TileCompute(const Attrs& attrs, + const Array& inputs, + const Type& out_type, + const Target& target) { + 
const TileAttrs *param = attrs.as(); + CHECK(param != nullptr); + return { topi::tile(inputs[0], param->reps) }; +} + +Expr MakeTile(Expr data, + Array reps) { + auto attrs = make_node(); + attrs->reps = reps; + static const Op& op = Op::Get("tile"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_API("relay.op._make.tile") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeTile, args, rv); +}); + +RELAY_REGISTER_OP("tile") +.describe(R"code(Repeat the whole array multiple times. + +- **data**: The input data to the operator. + +)code" TVM_ADD_FILELINE) +.set_num_inputs(1) +.set_attrs_type_key("relay.attrs.Tile") +.add_argument("data", "Tensor", "The input tensor.") +.set_support_level(1) +.add_type_rel("Tile", TileRel) +.set_attr("FTVMCompute", TileCompute) +.set_attr("TOpPattern", kBroadcast); + // where operator bool WhereRel(const Array& types, int num_inputs, diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index fc686f88dba6..06327dac69f4 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -719,6 +719,115 @@ inline Tensor where(const Tensor& condition, return out; } +/*! 
+* \brief Creates an operation to repeat elements of an array +* +* \param x The input tensor +* \param repeats The number of repetitions for each element +* \param axis The axis along which to repeat values (allows +* negative indices as offsets from the last dimension) +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the repeat operation +*/ +inline Tensor repeat(const Tensor& x, + int repeats, + int axis, + std::string name = "tensor", + std::string tag = kBroadcast) { + int ndim = static_cast(x->shape.size()); + CHECK(-ndim - 1 <= axis && axis <= ndim) + << "repeat only accepts `axis` in [-data.ndim - 1, data.ndim]" + << ", but got axis = " << axis + << ", and data.ndim = " << ndim; + CHECK(repeats >= 1) + << "repeat only accepts `repeats >= 1`" + << ", but got repeats = " << repeats; + if (axis < 0) { + // Calculate offset from last dimension + axis += ndim; + } + Array new_shape; + for (size_t i = 0; i < static_cast(axis); ++i) { + new_shape.push_back(x->shape[i]); + } + new_shape.push_back(repeats * x->shape[axis]); + for (size_t i = axis + 1; i < x->shape.size(); ++i) { + new_shape.push_back(x->shape[i]); + } + + return compute( + new_shape, [&](const Array& indices) { + Array idx; + for (size_t i = 0; i < static_cast(axis); ++i) { + idx.push_back(indices[i]); + } + idx.push_back(indices[axis] / repeats); + for (size_t i = axis + 1; i < indices.size(); ++i) { + idx.push_back(indices[i]); + } + return x(idx); + }, name, tag); +} + +/*! 
+* \brief Creates an operation to tile elements of an array +* +* \param x The input tensor +* \param reps The number of times for repeating the tensor +* \param name The name of the operation +* \param tag The tag to mark the operation +* +* \return A Tensor whose op member is the tile operation +*/ +inline Tensor tile(const Tensor& x, + Array reps, + std::string name = "tensor", + std::string tag = kBroadcast) { + size_t ndim = x->shape.size(); + size_t rdim = reps.size(); + size_t tdim = (ndim > rdim) ? ndim : rdim; + Array data_shape; + Array reps_shape; + Array new_shape; + if (ndim == rdim) { + for (size_t i = 0; i < ndim; ++i) { + data_shape.push_back(x->shape[i]); + reps_shape.push_back(reps[i]); + } + } else if (ndim > rdim) { + for (size_t i = 0; i < ndim; ++i) + data_shape.push_back(x->shape[i]); + for (size_t i = 0; i < (ndim - rdim); ++i) + reps_shape.push_back(1); + for (size_t i = 0; i < rdim; ++i) + reps_shape.push_back(reps[i]); + } else { + for (size_t i = 0; i < (rdim - ndim); ++i) + data_shape.push_back(1); + for (size_t i = 0; i < ndim; ++i) + data_shape.push_back(x->shape[i]); + for (size_t i = 0; i < rdim; ++i) + reps_shape.push_back(reps[i]); + } + for (size_t i = 0; i < tdim; ++i) + new_shape.push_back(data_shape[i] * reps_shape[i]); + + return compute( + new_shape, [&](const Array& indices) { + Array idx; + if (ndim >= rdim) { + for (size_t i = 0; i < ndim; ++i) + idx.push_back(indices[i] % x->shape[i]); + } else { + for (size_t i = 0; i < ndim; ++i) + idx.push_back(indices[rdim - ndim + i] % x->shape[i]); + } + return x(idx); + }, name, tag); +} + /*! * \brief Gather elements from a n-dimension array. 
* diff --git a/topi/python/topi/transform.py b/topi/python/topi/transform.py index 2ddfee2806a5..063556852d26 100644 --- a/topi/python/topi/transform.py +++ b/topi/python/topi/transform.py @@ -339,6 +339,45 @@ def arange(start, stop=None, step=1, dtype="float32"): return cpp.arange(start, stop, step, dtype) +def repeat(a, repeats, axis): + """Repeats elements of an array. + + Parameters + ---------- + a : tvm.Tensor + The tensor to be repeated. + + repeats: int, required + Number of repetitions for each element + + axis: int, optional + The axis along which to repeat values + + Returns + ------- + ret : tvm.Tensor + """ + return cpp.repeat(a, repeats, axis) + + +def tile(a, reps): + """Repeats the whole array multiple times. + + Parameters + ---------- + a : tvm.Tensor + The tensor to be tiled. + + reps: tuple of ints, required + The number of times for repeating the tensor + + Returns + ------- + ret : tvm.Tensor + """ + return cpp.tile(a, reps) + + def layout_transform(array, src_layout, dst_layout): """Transform the layout according to src_layout and dst_layout diff --git a/topi/src/topi.cc b/topi/src/topi.cc index 3630c4cf3b85..14f92460fd25 100644 --- a/topi/src/topi.cc +++ b/topi/src/topi.cc @@ -305,6 +305,16 @@ TVM_REGISTER_GLOBAL("topi.arange") *rv = arange(args[0], args[1], args[2], args[3]); }); +TVM_REGISTER_GLOBAL("topi.repeat") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = repeat(args[0], args[1], args[2]); +}); + +TVM_REGISTER_GLOBAL("topi.tile") +.set_body([](TVMArgs args, TVMRetValue *rv) { + *rv = tile(args[0], args[1]); +}); + TVM_REGISTER_GLOBAL("topi.gather_nd") .set_body([](TVMArgs args, TVMRetValue *rv) { *rv = gather_nd(args[0], args[1]); diff --git a/topi/tests/python/test_topi_transform.py b/topi/tests/python/test_topi_transform.py index 66c75854193f..785da6fddbcf 100644 --- a/topi/tests/python/test_topi_transform.py +++ b/topi/tests/python/test_topi_transform.py @@ -359,6 +359,50 @@ def check_device(device): for device in 
get_all_backend(): check_device(device) +def verify_repeat(in_shape, repeats, axis): + A = tvm.placeholder(shape=in_shape, name="A") + B = topi.repeat(A, repeats, axis) + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_broadcast(B) + foo = tvm.build(s, [A, B], device, name="repeat") + data_npy = np.random.uniform(size=in_shape).astype(A.dtype) + out_npy = np.repeat(data_npy, repeats, axis) + data_nd = tvm.nd.array(data_npy, ctx) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) + foo(data_nd, out_nd) + tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in get_all_backend(): + check_device(device) + +def verify_tile(in_shape, reps): + A = tvm.placeholder(shape=in_shape, name="A") + B = topi.tile(A, reps) + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + s = topi.generic.schedule_broadcast(B) + foo = tvm.build(s, [A, B], device, name="tile") + data_npy = np.random.uniform(size=in_shape).astype(A.dtype) + out_npy = np.tile(data_npy, reps) + data_nd = tvm.nd.array(data_npy, ctx) + out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(B.dtype), ctx) + foo(data_nd, out_nd) + tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy) + + for device in get_all_backend(): + check_device(device) + def test_strided_slice(): verify_strided_slice((3, 4, 3), [0, 0, 0], [4, -5, 4], [1, -1, 2]) verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3], [2, 1, 1]) @@ -481,6 +525,16 @@ def test_arange(): verify_arange(20, 1, -1) verify_arange(20, 1, -1.5) +def test_repeat(): + verify_repeat((2,), 1, 0) + verify_repeat((3, 2), 2, 0) + verify_repeat((3, 2, 4), 3, 1) + verify_repeat((1, 3, 2, 4), 4, -1) 
+ +def test_tile(): + verify_tile((3, 2), (2, 3)) + verify_tile((3, 2, 5), (2,)) + verify_tile((3, ), (2, 3, 3)) def test_layout_transform(): in_shape = (1, 32, 8, 8) @@ -525,3 +579,5 @@ def check_device(device): test_gather_nd() test_arange() test_layout_transform() + test_repeat() + test_tile() From d3a8aa9db8fd830ea62797c561cafad91d51cc0a Mon Sep 17 00:00:00 2001 From: Yong Wu <55wuyong@163.com> Date: Mon, 11 Mar 2019 11:40:12 -0700 Subject: [PATCH 90/93] [relay][frontend] TensorFlow saved model support (#2586) * [relay][frontend] TensorFlow saved model support * Add Examples section * keep one copy of tensorflow_parser in relay --- nnvm/python/nnvm/frontend/util/__init__.py | 0 python/tvm/relay/frontend/tensorflow.py | 59 ++++++++++++++----- .../tvm/relay/frontend}/tensorflow_parser.py | 28 +++++---- 3 files changed, 62 insertions(+), 25 deletions(-) delete mode 100644 nnvm/python/nnvm/frontend/util/__init__.py rename {nnvm/python/nnvm/frontend/util => python/tvm/relay/frontend}/tensorflow_parser.py (93%) diff --git a/nnvm/python/nnvm/frontend/util/__init__.py b/nnvm/python/nnvm/frontend/util/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index 1f2ba4eb435f..8d53b003da1e 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -4,6 +4,7 @@ from __future__ import print_function import logging +import warnings # Numpy support import numpy as np @@ -410,7 +411,7 @@ def _impl(inputs, attr, params): def _decode_image(): def _impl(inputs, attr, params): # Image decode wrapper: Expecting user to feed decoded input to next layer drop this layer. 
- print("DecodeJpeg: It's a pass through, please handle preprocessing before input") + warnings.warn("DecodeJpeg: It's a pass through, please handle preprocessing before input") return inputs[0] return _impl @@ -1178,6 +1179,7 @@ class GraphProto(object): def __init__(self): self._nodes = {} self._params = {} + self._input_shapes = {} self._output_shapes = {} self._num_param = 0 self._num_rnn_layer = False @@ -1229,36 +1231,55 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): raise NotImplementedError( \ "The following operators are not implemented: {}".format(missing_operators)) + for node in graph.node: + if node.op == 'Placeholder': + if shape and node.name in shape: + self._input_shapes[node.name] = list(shape[node.name]) + continue + self._input_shapes[node.name] = \ + tensor_util.TensorShapeProtoToList(node.attr['shape'].shape) + for idx, dim in enumerate(self._input_shapes[node.name]): + if dim < 0: + self._input_shapes[node.name][idx] = 1 + warnings.warn("Use 1 instead of -1 in shape of operator %s." + % node.name) + + # Ignore user's input shape for Non placeholder + elif node.op == 'Const': + tensor_value = node.attr['value'].tensor + self._input_shapes[node.name] = \ + tensor_util.TensorShapeProtoToList(tensor_value.tensor_shape) + if shape and node.name in shape: + warnings.warn("Ignore the passed shape. Shape in graphdef " + "will be used for operator %s." % node.name) + # Parse the nodes to re-create TF graph using Relay operators. for node in graph.node: - # Tensorflow doesn't have seperate list for params extraction. + # Tensorflow doesn't have separate list for params extraction. # Operator name 'Const' is treated as a parameter to build params dict. 
input_shapes = {} attr = self._parse_attr(node.attr) - #Variable converted to Const will not have only value attr + # Variable converted to Const will not have only value attr if 'value' in attr and node.op == 'Const': - tensor_value = attr['value'] - self._output_shapes[node.name] = \ - [tensor_util.TensorShapeProtoToList( \ - tensor_value.tensor_shape)] + self._output_shapes[node.name] = [self._input_shapes[node.name]] + elif shape and node.name in shape: + # Give priority to user argument. + self._output_shapes[node.name] = [shape[node.name]] elif '_output_shapes' in attr: self._output_shapes[node.name] = \ [tensor_util.TensorShapeProtoToList(tshape) \ for tshape in attr['_output_shapes']] - elif shape: + else: # Keep the list indexable to avoid key error. # Actual value will be filled after node creation. self._output_shapes[node.name] = [None] - else: - raise NotImplementedError( \ - "Please freeze the graph with add_shapes=True") if node.op == "Placeholder": - self._output_shapes[node.name] = [shape[node.name]] + self._output_shapes[node.name] = [self._input_shapes[node.name]] self._nodes[node.name] = [_expr.var(node.name, - shape=self._output_shapes[node.name][0], + shape=self._input_shapes[node.name], dtype=attr['dtype'].name)] elif node.op == "Const": @@ -1274,7 +1295,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): else: # Pass the parsed shapes instead - attr["_output_shapes"] = self._output_shapes[node.name] + attr["_output_shapes"] = output_shapes = self._output_shapes[node.name] # Pass the node name too in attr attr["_node_name"] = node.name @@ -1301,7 +1322,7 @@ def from_tensorflow(self, graph, layout="NHWC", shape=None, outputs=None): op = self._convert_operator(node.op, inputs, attr, graph) - # Check is op is converted to param + # Check if op is converted to param if isinstance(op, np.ndarray): self._params[node.name] = tvm.nd.array(op) op = [_expr.var(node.name, @@ -1317,6 +1338,14 @@ def from_tensorflow(self, 
graph, layout="NHWC", shape=None, outputs=None): self._nodes[node.name] = op + # Infer shapes even without specifying "add_shapes=True" + if output_shapes == [None]: + out_type = ir_pass.infer_type(self._nodes[node.name][0]) + self._output_shapes[node.name] = [get_const_tuple(out_type.checked_type.shape)] + + if self._output_shapes[node.name] and shape and node.name in shape: + assert self._output_shapes[node.name] == list(shape[node.name]) + # Infer shapes if passed explicitely node_output = self._nodes[node.name] out_type = ir_pass.infer_type(node_output[0]) diff --git a/nnvm/python/nnvm/frontend/util/tensorflow_parser.py b/python/tvm/relay/frontend/tensorflow_parser.py similarity index 93% rename from nnvm/python/nnvm/frontend/util/tensorflow_parser.py rename to python/tvm/relay/frontend/tensorflow_parser.py index ce51f7c2315b..04c232b6d577 100644 --- a/nnvm/python/nnvm/frontend/util/tensorflow_parser.py +++ b/python/tvm/relay/frontend/tensorflow_parser.py @@ -7,16 +7,21 @@ class TFParser(object): - """A Wrapper to handle tensorflow models parsing - TensorFlow is needed - ``` - parser = TfParser(model_dir) - graph = parser.parse() - ``` + """ + A Wrapper to handle tensorflow models parsing, TensorFlow is needed + Parameters ---------- model_dir : tensorflow frozen pb file or a directory that contains saved model or checkpoints. + + Examples + -------- + .. code-block:: python + + parser = TfParser(model_dir) + graph = parser.parse() + # graph is related graphdef of the model """ def __init__(self, model_dir): @@ -115,13 +120,16 @@ def _load_ckpt(self): """TODO: Load checkpoint model.""" raise RuntimeError("InputConfiguration: Loading tf checkpoint model is " "not supported yet.") - # pylint: disable=unreachable - return 0 def parse(self): - """Parse tensorflow models: checkpoints, saved models, and single pb - file. """ + Parse tensorflow models: checkpoints, saved models, and single frozen pb file. 
+ + Returns + ------- + GraphDef of the passed model + """ + graph = None if os.path.isdir(self._model_dir): From 5aa6faad06b4be0b4eee786c71435bceb4199a36 Mon Sep 17 00:00:00 2001 From: Yao Wang Date: Mon, 11 Mar 2019 11:41:46 -0700 Subject: [PATCH 91/93] [Object Detection] Gluoncv SSD support on CPU (#2353) --- include/tvm/relay/attrs/vision.h | 47 ++- nnvm/include/nnvm/top/nn.h | 29 +- nnvm/python/nnvm/frontend/mxnet.py | 6 +- nnvm/python/nnvm/top/vision.py | 23 +- nnvm/src/top/vision/nms.cc | 21 +- nnvm/tests/python/compiler/test_top_level4.py | 16 +- .../python/frontend/mxnet/test_forward.py | 1 - python/tvm/relay/frontend/mxnet.py | 54 ++- python/tvm/relay/op/transform.py | 2 +- python/tvm/relay/op/vision/__init__.py | 2 +- .../op/vision/{_multibox.py => _vision.py} | 36 +- python/tvm/relay/op/vision/nms.py | 60 ++- src/relay/op/tensor/transform.cc | 19 +- src/relay/op/vision/multibox_op.cc | 6 +- src/relay/op/vision/nms.cc | 88 ++++- tests/python/frontend/mxnet/test_forward.py | 8 +- tests/python/relay/test_op_level10.py | 1 + tests/python/relay/test_op_level5.py | 86 ++++- topi/include/topi/nn/l2_normalize.h | 7 +- topi/python/topi/cuda/nms.py | 43 ++- topi/python/topi/cuda/ssd/multibox.py | 4 +- topi/python/topi/cuda/vision.py | 17 + topi/python/topi/generic/vision.py | 17 + topi/python/topi/testing/__init__.py | 1 + topi/python/topi/testing/slice_axis_python.py | 34 ++ topi/python/topi/vision/nms.py | 356 ++++++++++++------ topi/python/topi/vision/ssd/multibox.py | 282 +++++++------- topi/tests/python/test_topi_vision.py | 74 +++- tutorials/frontend/deploy_ssd_gluoncv.py | 104 +++++ .../{deploy_ssd.py => deploy_ssd_mxnet.py} | 2 +- 30 files changed, 1060 insertions(+), 386 deletions(-) rename python/tvm/relay/op/vision/{_multibox.py => _vision.py} (62%) create mode 100644 topi/python/topi/testing/slice_axis_python.py create mode 100644 tutorials/frontend/deploy_ssd_gluoncv.py rename tutorials/nnvm/{deploy_ssd.py => deploy_ssd_mxnet.py} (98%) diff --git 
a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index df059a6238e1..20b80f33a2a3 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -58,19 +58,42 @@ struct MultiBoxTransformLocAttrs } }; -/*! \brief Attributes used in non_maximum_suppression operators */ -struct NMSAttrs : public tvm::AttrsNode{ - double overlap_threshold; +/*! \brief Attributes used in get_valid_counts operator */ +struct GetValidCountsAttrs : public tvm::AttrsNode { + double score_threshold; + + TVM_DECLARE_ATTRS(GetValidCountsAttrs, "relay.attrs.GetValidCountsAttrs") { + TVM_ATTR_FIELD(score_threshold).set_default(0.0) + .describe("Lower limit of score for valid bounding boxes."); + } +}; + +/*! \brief Attributes used in non_maximum_suppression operator */ +struct NonMaximumSuppressionAttrs : public tvm::AttrsNode { + int max_output_size; + double iou_threshold; bool force_suppress; - int topk; - - TVM_DECLARE_ATTRS(NMSAttrs, "relay.attrs.NMSAttrs") { - TVM_ATTR_FIELD(overlap_threshold).set_default(0.5) - .describe("Non-maximum suppression threshold."); - TVM_ATTR_FIELD(force_suppress).set_default(false) - .describe("Suppress all detections regardless of class_id."); - TVM_ATTR_FIELD(topk).set_default(-1) - .describe("Keep maximum top k detections before nms, -1 for no limit."); + int top_k; + int id_index; + bool return_indices; + bool invalid_to_bottom; + + TVM_DECLARE_ATTRS(NonMaximumSuppressionAttrs, "relay.attrs.NonMaximumSuppressionAttrs") { + TVM_ATTR_FIELD(max_output_size).set_default(-1) + .describe("Max number of output valid boxes for each instance." 
+ "By default all valid boxes are returned."); + TVM_ATTR_FIELD(iou_threshold).set_default(0.5) + .describe("Non-maximum suppression threshold."); + TVM_ATTR_FIELD(force_suppress).set_default(false) + .describe("Suppress all detections regardless of class_id."); + TVM_ATTR_FIELD(top_k).set_default(-1) + .describe("Keep maximum top k detections before nms, -1 for no limit."); + TVM_ATTR_FIELD(id_index).set_default(0) + .describe("Axis index of id."); + TVM_ATTR_FIELD(return_indices).set_default(true) + .describe("Whether to return box indices in input data."); + TVM_ATTR_FIELD(invalid_to_bottom).set_default(false) + .describe("Whether to move all invalid bounding boxes to the bottom."); } }; diff --git a/nnvm/include/nnvm/top/nn.h b/nnvm/include/nnvm/top/nn.h index 143a9548f18a..578f928c5b9f 100644 --- a/nnvm/include/nnvm/top/nn.h +++ b/nnvm/include/nnvm/top/nn.h @@ -443,17 +443,30 @@ struct MultiBoxTransformLocParam : public dmlc::Parameter { - float nms_threshold; +struct NonMaximumSuppressionParam : public dmlc::Parameter { + bool return_indices; + float iou_threshold; bool force_suppress; - int nms_topk; - DMLC_DECLARE_PARAMETER(NMSParam) { - DMLC_DECLARE_FIELD(nms_threshold).set_default(0.5) + int top_k; + int id_index; + int max_output_size; + bool invalid_to_bottom; + DMLC_DECLARE_PARAMETER(NonMaximumSuppressionParam) { + DMLC_DECLARE_FIELD(max_output_size).set_default(-1) + .describe("Max number of output valid boxes for each instance." 
+ "By default all valid boxes are returned."); + DMLC_DECLARE_FIELD(iou_threshold).set_default(0.5) .describe("Non-maximum suppression threshold."); DMLC_DECLARE_FIELD(force_suppress).set_default(false) - .describe("Suppress all detections regardless of class_id."); - DMLC_DECLARE_FIELD(nms_topk).set_default(-1) - .describe("Keep maximum top k detections before nms, -1 for no limit."); + .describe("Suppress all detections regardless of class_id."); + DMLC_DECLARE_FIELD(top_k).set_default(-1) + .describe("Keep maximum top k detections before nms, -1 for no limit."); + DMLC_DECLARE_FIELD(id_index).set_default(0) + .describe("Axis index of id."); + DMLC_DECLARE_FIELD(return_indices).set_default(true) + .describe("Whether to return box indices in input data."); + DMLC_DECLARE_FIELD(invalid_to_bottom).set_default(false) + .describe("Whether to move all invalid bounding boxes to the bottom."); } }; diff --git a/nnvm/python/nnvm/frontend/mxnet.py b/nnvm/python/nnvm/frontend/mxnet.py index 0b994861deef..c9f6777e4898 100644 --- a/nnvm/python/nnvm/frontend/mxnet.py +++ b/nnvm/python/nnvm/frontend/mxnet.py @@ -245,11 +245,11 @@ def _contrib_multibox_detection(inputs, attrs): if attrs.get('variances') is not None else (0.1, 0.1, 0.2, 0.2) nms_topk = attrs.get('nms_topk') or -1 new_attrs0 = {'clip': clip, 'threshold': float(threshold), 'variances': variances} - new_attrs1 = {'nms_threshold': float(nms_threshold), 'force_suppress': force_suppress, - 'nms_topk': int(nms_topk)} + new_attrs1 = {'return_indices': False, 'iou_threshold': float(nms_threshold), + 'force_suppress': force_suppress, 'top_k': int(nms_topk)} data, valid_count = _get_nnvm_op('multibox_transform_loc')(inputs[0], inputs[1], inputs[2], **new_attrs0) - return _get_nnvm_op('nms')(data, valid_count, **new_attrs1) + return _get_nnvm_op('non_max_suppression')(data, valid_count, **new_attrs1) def _elemwise_sum(inputs, _): new_attrs = {'num_args':len(inputs)} diff --git a/nnvm/python/nnvm/top/vision.py 
b/nnvm/python/nnvm/top/vision.py index d12c82c1fc88..948f905f1e2b 100644 --- a/nnvm/python/nnvm/top/vision.py +++ b/nnvm/python/nnvm/top/vision.py @@ -65,21 +65,26 @@ def compute_multibox_transform_loc(attrs, inputs, _): reg.register_pattern("multibox_detection", OpPattern.OPAQUE) # non-maximum suppression -@reg.register_schedule("nms") +@reg.register_schedule("non_max_suppression") def schedule_nms(_, outs, target): - """Schedule definition of nms""" + """Schedule definition of non_max_suppression""" with tvm.target.create(target): return topi.generic.schedule_nms(outs) -@reg.register_compute("nms") +@reg.register_compute("non_max_suppression") def compute_nms(attrs, inputs, _): - """Compute definition of nms""" - nms_threshold = attrs.get_float('nms_threshold') + """Compute definition of non_max_suppression""" + return_indices = attrs.get_bool('return_indices') + max_output_size = attrs.get_int('max_output_size') + iou_threshold = attrs.get_float('iou_threshold') force_suppress = attrs.get_bool('force_suppress') - nms_topk = attrs.get_int('nms_topk') + top_k = attrs.get_int('top_k') + id_index = attrs.get_int('id_index') + invalid_to_bottom = attrs.get_bool('invalid_to_bottom') with tvm.target.create(attrs.get_str("target")): - return topi.vision.nms(inputs[0], inputs[1], nms_threshold, - force_suppress, nms_topk) + return topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size, + iou_threshold, force_suppress, top_k, + id_index, return_indices, invalid_to_bottom) -reg.register_pattern("nms", OpPattern.OPAQUE) +reg.register_pattern("non_max_suppression", OpPattern.OPAQUE) diff --git a/nnvm/src/top/vision/nms.cc b/nnvm/src/top/vision/nms.cc index 2680b894255b..e69a7cb2f036 100644 --- a/nnvm/src/top/vision/nms.cc +++ b/nnvm/src/top/vision/nms.cc @@ -19,11 +19,13 @@ using compiler::FTVMCompute; using tvm::Tensor; using tvm::Array; -DMLC_REGISTER_PARAMETER(NMSParam); +DMLC_REGISTER_PARAMETER(NonMaximumSuppressionParam); bool NMSShape(const NodeAttrs& 
attrs, std::vector *in_attrs, std::vector *out_attrs) { + const NonMaximumSuppressionParam& param = + nnvm::get(attrs.parsed); CHECK_EQ(in_attrs->size(), 2U) << "Inputs: [data, valid_count]"; TShape dshape = in_attrs->at(0); TShape vshape = in_attrs->at(1); @@ -33,7 +35,14 @@ bool NMSShape(const NodeAttrs& attrs, "(batch_size, num_anchors, 6)."; CHECK_EQ(dshape[0], vshape[0]) << "batch_size mismatch."; out_attrs->clear(); - NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, dshape); + if (param.return_indices) { + TShape oshape = TShape(2); + oshape[0] = dshape[0]; + oshape[1] = dshape[1]; + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, oshape); + } else { + NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_attrs, 0, dshape); + } return true; } @@ -56,15 +65,15 @@ inline bool NMSInferLayout(const NodeAttrs& attrs, return true; } -NNVM_REGISTER_OP(nms) +NNVM_REGISTER_OP(non_max_suppression) .describe(R"doc("Non-maximum suppression." )doc" NNVM_ADD_FILELINE) .set_num_inputs(2) .set_num_outputs(1) -.set_attr_parser(ParamParser) +.set_attr_parser(ParamParser) .set_attr("FGetAttrDict", - ParamGetAttrDict) -.add_arguments(NMSParam::__FIELDS__()) + ParamGetAttrDict) +.add_arguments(NonMaximumSuppressionParam::__FIELDS__()) .add_argument("data", "Tensor", "Input data.") .add_argument("valid_count", "Tensor", "Number of valid anchor boxes.") .set_attr("FListInputNames", [](const NodeAttrs& attrs) { diff --git a/nnvm/tests/python/compiler/test_top_level4.py b/nnvm/tests/python/compiler/test_top_level4.py index fc4e62fb7156..6a42047151e5 100644 --- a/nnvm/tests/python/compiler/test_top_level4.py +++ b/nnvm/tests/python/compiler/test_top_level4.py @@ -550,7 +550,7 @@ def test_multibox_transform_loc(): anchors = sym.Variable("anchors") transform_loc_data, valid_count = sym.multibox_transform_loc(cls_prob=cls_prob, loc_pred=loc_preds, anchor=anchors) - out = sym.nms(data=transform_loc_data, valid_count=valid_count) + out = sym.non_max_suppression(data=transform_loc_data, 
valid_count=valid_count, return_indices=False) # Manually create test case np_cls_prob = np.array([[[0.2, 0.5, 0.3], [0.25, 0.3, 0.45], [0.7, 0.1, 0.2]]]) @@ -573,22 +573,22 @@ def test_multibox_transform_loc(): out = m.get_output(0, tvm.nd.empty(expected_np_out.shape, dtype)) tvm.testing.assert_allclose(out.asnumpy(), expected_np_out, atol=1e-5, rtol=1e-5) -def test_nms(): +def test_non_max_suppression(): dshape = (1, 5, 6) data = sym.Variable("data") valid_count = sym.Variable("valid_count", dtype="int32") - nms_threshold = 0.7 + iou_threshold = 0.7 force_suppress = True - nms_topk = 2 - out = sym.nms(data=data, valid_count=valid_count, nms_threshold=nms_threshold, - force_suppress=force_suppress, nms_topk=nms_topk) + top_k = 2 + out = sym.non_max_suppression(data=data, valid_count=valid_count, return_indices=False, + iou_threshold=iou_threshold, force_suppress=force_suppress, top_k=top_k) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], [1, 0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) target = "llvm" @@ -726,7 +726,7 @@ def test_argmax(): test_flip() test_multibox_prior() test_multibox_transform_loc() - test_nms() + test_non_max_suppression() test_slice_like() test_where() test_argmax() diff --git a/nnvm/tests/python/frontend/mxnet/test_forward.py b/nnvm/tests/python/frontend/mxnet/test_forward.py index e046f39f02ca..581ae75a4bbc 100644 --- a/nnvm/tests/python/frontend/mxnet/test_forward.py +++ b/nnvm/tests/python/frontend/mxnet/test_forward.py @@ -315,4 +315,3 @@ def test_forward_minimum(): test_forward_slice() test_forward_maximum() test_forward_minimum() - diff --git a/python/tvm/relay/frontend/mxnet.py 
b/python/tvm/relay/frontend/mxnet.py index e95d0455f1fb..1585d55ac1b9 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -328,13 +328,14 @@ def _mx_multibox_detection(inputs, attrs): 0.2, 0.2)) new_attrs1 = {} - new_attrs1["overlap_threshold"] = attrs.get_float("nms_threshold", 0.5) + new_attrs1["return_indices"] = False + new_attrs1["iou_threshold"] = attrs.get_float("nms_threshold", 0.5) new_attrs1["force_suppress"] = attrs.get_bool("force_suppress", False) - new_attrs1["topk"] = attrs.get_int("nms_topk", -1) + new_attrs1["top_k"] = attrs.get_int("nms_topk", -1) ret = _op.vision.multibox_transform_loc(inputs[0], inputs[1], inputs[2], **new_attrs0) - return _op.vision.nms(ret[0], ret[1], **new_attrs1) + return _op.vision.non_max_suppression(ret[0], ret[1], **new_attrs1) def _mx_batch_dot(inputs, attrs): @@ -399,6 +400,49 @@ def _mx_proposal(inputs, attrs): return _op.vision.proposal(inputs[0], inputs[1], inputs[2], **new_attrs) +def _mx_box_nms(inputs, attrs): + force_suppress = attrs.get_bool("force_suppress", False) + iou_thresh = attrs.get_float('overlap_thresh', 0.5) + top_k = attrs.get_int('topk', -1) + valid_thresh = attrs.get_float('valid_thresh', 0) + coord_start = attrs.get_int('coord_start', 2) + score_index = attrs.get_int('score_index', 1) + id_index = attrs.get_int('id_index', -1) + in_format = attrs.get_str('in_format', 'corner') + out_format = attrs.get_str('out_format', 'corner') + if coord_start != 2: + raise RuntimeError('coord_start %s is not supported.' % coord_start) + if score_index != 1: + raise RuntimeError('score_index %s is not supported.' % score_index) + if id_index != -1 and int(id_index) != 0: + raise RuntimeError('id_index %s is not supported.' % id_index) + if in_format != 'corner': + raise RuntimeError('in_format %s is not supported.' % in_format) + if out_format != 'corner': + raise RuntimeError('out_format %s is not supported.' 
% out_format) + + ret = _op.vision.get_valid_counts(inputs[0], score_threshold=valid_thresh) + nms_out = _op.vision.non_max_suppression(ret[1], + ret[0], + iou_threshold=iou_thresh, + force_suppress=force_suppress, + top_k=top_k, + id_index=id_index, + return_indices=False, + invalid_to_bottom=True) + return nms_out + + +def _mx_l2_normalize(inputs, attrs): + new_attrs = {} + mode = attrs.get_str('mode', 'instance') + if mode != 'channel': + raise RuntimeError('mode %s is not supported.' % mode) + new_attrs['eps'] = attrs.get_float('eps', 1e-10) + new_attrs['axis'] = [1] + return _op.nn.l2_normalize(inputs[0], **new_attrs) + + # Note: due to attribute conversion constraint # ops in the identity set must be attribute free _identity_list = [ @@ -497,6 +541,7 @@ def _mx_proposal(inputs, attrs): "BatchNorm" : _mx_batch_norm, "BatchNorm_v1" : _mx_batch_norm, "LRN" : _mx_lrn, + "L2Normalization" : _mx_l2_normalize, "slice" : _mx_slice, "slice_like" : _mx_slice_like, "slice_axis" : _mx_slice_axis, @@ -520,6 +565,7 @@ def _mx_proposal(inputs, attrs): "_contrib_ROIAlign" : _mx_roi_align, "_contrib_Proposal" : _mx_proposal, "_contrib_MultiProposal" : _mx_proposal, + "_contrib_box_nms" : _mx_box_nms, # List of missing operators that are present in NNVMv1 # TODO(tvm-tvm): support all operators. # @@ -662,6 +708,8 @@ def from_mxnet(symbol, params[k] = _nd.array(v.data().asnumpy()) data = mx.sym.Variable("data") sym = symbol(data) + if isinstance(sym, (list, tuple)): + sym = mx.sym.Group(sym) shape, dtype = _update_shape_dtype(shape, dtype, params) sym = _from_mxnet_impl(sym, shape, dtype) elif isinstance(symbol, mx.gluon.Block): diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 75f1bdc60174..b77269843c91 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -525,7 +525,7 @@ def strided_slice(data, begin, end, strides=None): The indices to begin with in the slicing. 
end: list of int - Indicies indicating end of the slice. + Indices indicating end of the slice. strides: list of int, optional Specifies the stride values, it can be negative in that case, diff --git a/python/tvm/relay/op/vision/__init__.py b/python/tvm/relay/op/vision/__init__.py index 10cf6c2fd3ee..0cee4e4faeec 100644 --- a/python/tvm/relay/op/vision/__init__.py +++ b/python/tvm/relay/op/vision/__init__.py @@ -6,6 +6,6 @@ from .nms import * from .rcnn import * from .yolo import * -from . import _multibox from . import _rcnn from . import _yolo +from . import _vision diff --git a/python/tvm/relay/op/vision/_multibox.py b/python/tvm/relay/op/vision/_vision.py similarity index 62% rename from python/tvm/relay/op/vision/_multibox.py rename to python/tvm/relay/op/vision/_vision.py index e9ef43f7e06f..c887076e6af8 100644 --- a/python/tvm/relay/op/vision/_multibox.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -54,24 +54,46 @@ def compute_multibox_transform_loc(attrs, inputs, _, target): reg.register_pattern("vision.multibox_detection", OpPattern.OPAQUE) +# Get counts of valid boxes +@reg.register_schedule("vision.get_valid_counts") +def schedule_get_valid_counts(_, outs, target): + """Schedule definition of get_valid_counts""" + with target: + return topi.generic.schedule_get_valid_counts(outs) + + +@reg.register_compute("vision.get_valid_counts") +def compute_get_valid_counts(attrs, inputs, _, target): + """Compute definition of get_valid_counts""" + score_threshold = get_const_float(attrs.score_threshold) + return topi.vision.get_valid_counts(inputs[0], score_threshold) + +reg.register_pattern("vision.get_valid_counts", OpPattern.OPAQUE) + + # non-maximum suppression -@reg.register_schedule("vision.nms") +@reg.register_schedule("vision.non_max_suppression") def schedule_nms(_, outs, target): """Schedule definition of nms""" with target: return topi.generic.schedule_nms(outs) -@reg.register_compute("vision.nms") +@reg.register_compute("vision.non_max_suppression") 
def compute_nms(attrs, inputs, _, target): """Compute definition of nms""" - overlap_threshold = get_const_float(attrs.overlap_threshold) + return_indices = bool(get_const_int(attrs.return_indices)) + max_output_size = get_const_int(attrs.max_output_size) + iou_threshold = get_const_float(attrs.iou_threshold) force_suppress = bool(get_const_int(attrs.force_suppress)) - topk = get_const_int(attrs.topk) + top_k = get_const_int(attrs.top_k) + id_index = get_const_int(attrs.id_index) + invalid_to_bottom = bool(get_const_int(attrs.invalid_to_bottom)) return [ - topi.vision.nms(inputs[0], inputs[1], overlap_threshold, - force_suppress, topk) + topi.vision.non_max_suppression(inputs[0], inputs[1], max_output_size, + iou_threshold, force_suppress, top_k, + id_index, return_indices, invalid_to_bottom) ] -reg.register_pattern("vision.nms", OpPattern.OPAQUE) +reg.register_pattern("vision.non_max_suppression", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/vision/nms.py b/python/tvm/relay/op/vision/nms.py index 8035e3030b17..0124ee29ab9e 100644 --- a/python/tvm/relay/op/vision/nms.py +++ b/python/tvm/relay/op/vision/nms.py @@ -1,12 +1,41 @@ """Non-maximum suppression operations.""" from __future__ import absolute_import as _abs from . import _make +from ...expr import TupleWrapper -def nms(data, - valid_count, - overlap_threshold=0.5, - force_suppress=False, - topk=-1): +def get_valid_counts(data, + score_threshold): + """Get valid count of bounding boxes given a score threshold. + Also moves valid boxes to the top of input data. + + Parameters + ---------- + data : relay.Expr + Input data. 3-D tensor with shape [batch_size, num_anchors, 6]. + + score_threshold : optional, float + Lower limit of score for valid bounding boxes. + + Returns + ------- + valid_count : relay.Expr + 1-D tensor for valid number of boxes. + + out_tensor : relay.Expr + Rearranged data tensor. 
+ """ + return TupleWrapper(_make.get_valid_counts(data, score_threshold), 2) + + + def non_max_suppression(data, + valid_count, + max_output_size=-1, + iou_threshold=0.5, + force_suppress=False, + top_k=-1, + id_index=0, + return_indices=True, + invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters @@ -19,18 +48,33 @@ def nms(data, valid_count : relay.Expr 1-D tensor for valid number of boxes. - overlap_threshold : float, optional + max_output_size : int, optional + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + + iou_threshold : float, optional Non-maximum suppression threshold. force_suppress : bool, optional Suppress all detections regardless of class_id. - topk : int, optional + top_k : int, optional Keep maximum top k detections before nms, -1 for no limit. + id_index : int, optional + index of the class categories, -1 to disable. + + return_indices : bool, optional + Whether to return box indices in input data. + + invalid_to_bottom : bool, optional + Whether to move all invalid bounding boxes to the bottom. + Returns ------- out : relay.Expr 3-D tensor with shape [batch_size, num_anchors, 6]. """ - return _make.nms(data, valid_count, overlap_threshold, force_suppress, topk) + return _make.non_max_suppression(data, valid_count, max_output_size, + iou_threshold, force_suppress, top_k, + id_index, return_indices, invalid_to_bottom) diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 142a16b0b307..7aa98e3fd87a 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -1516,6 +1516,16 @@ RELAY_REGISTER_OP("broadcast_to_like") .set_attr("TOpPattern", kBroadcast); +// Adapter function to make int array.
+Array GetIntArray(Array arr) { + for (size_t i = 0; i < arr.size(); ++i) { + CHECK(!arr[i].defined() || arr[i].as()) + << "Expect an int array"; + } + return Array(arr.node_); +} + + // strided_slice TVM_REGISTER_NODE_TYPE(StridedSliceAttrs); bool StridedSliceRel(const Array& types, @@ -1870,15 +1880,6 @@ Expr MakeSliceLike(Expr data, return CallNode::make(op, {data, shape_like}, Attrs(attrs), {}); } -// Adapter function to make int array. -Array GetIntArray(Array arr) { - for (size_t i = 0; i < arr.size(); ++i) { - CHECK(!arr[i].defined() || arr[i].as()) - << "Expect an int array"; - } - return Array(arr.node_); -} - Array SliceLikeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type, diff --git a/src/relay/op/vision/multibox_op.cc b/src/relay/op/vision/multibox_op.cc index 55db8862e849..04f105c44744 100644 --- a/src/relay/op/vision/multibox_op.cc +++ b/src/relay/op/vision/multibox_op.cc @@ -70,8 +70,10 @@ RELAY_REGISTER_OP("vision.multibox_prior") TVM_REGISTER_NODE_TYPE(MultiBoxTransformLocAttrs); -bool MultiBoxTransformLocRel(const Array& types, int num_inputs, - const Attrs& attrs, const TypeReporter& reporter) { +bool MultiBoxTransformLocRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { CHECK_EQ(types.size(), 4); const auto* cls_prob = types[0].as(); diff --git a/src/relay/op/vision/nms.cc b/src/relay/op/vision/nms.cc index 3e3f73bc6cb4..6a94da032196 100644 --- a/src/relay/op/vision/nms.cc +++ b/src/relay/op/vision/nms.cc @@ -9,7 +9,54 @@ namespace tvm { namespace relay { -TVM_REGISTER_NODE_TYPE(NMSAttrs); +TVM_REGISTER_NODE_TYPE(GetValidCountsAttrs); + +bool GetValidCountRel(const Array& types, + int num_inputs, + const Attrs& attrs, + const TypeReporter& reporter) { + CHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + const auto& dshape = data->shape; + CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D."; + + std::vector oshape({data->shape[0]}); + std::vector fields; + 
fields.push_back(TensorTypeNode::make(oshape, Int(32))); + fields.push_back(TensorTypeNode::make(data->shape, data->dtype)); + + // assign output type + reporter->Assign(types[1], TupleTypeNode::make(Array(fields))); + return true; +} + +Expr MakeGetValidCounts(Expr data, + double score_threshold) { + auto attrs = make_node(); + attrs->score_threshold = score_threshold; + static const Op& op = Op::Get("vision.get_valid_counts"); + return CallNode::make(op, {data}, Attrs(attrs), {}); +} + + +TVM_REGISTER_API("relay.op.vision._make.get_valid_counts") +.set_body([](const TVMArgs& args, TVMRetValue* rv) { + runtime::detail::unpack_call(MakeGetValidCounts, args, rv); +}); + + +RELAY_REGISTER_OP("vision.get_valid_counts") +.describe(R"doc(Get valid count of bounding boxes given +a score threshold. Also moves valid boxes to the top of +input data. +)doc" TVM_ADD_FILELINE) +.set_num_inputs(1) +.add_argument("data", "Tensor", "Input data.") +.set_support_level(5) +.add_type_rel("GetValidCount", GetValidCountRel); + + +TVM_REGISTER_NODE_TYPE(NonMaximumSuppressionAttrs); bool NMSRel(const Array& types, int num_inputs, @@ -18,39 +65,56 @@ bool NMSRel(const Array& types, CHECK_EQ(types.size(), 3); const auto* data = types[0].as(); const auto* valid_count = types[1].as(); + const NonMaximumSuppressionAttrs* param = + attrs.as(); const auto& dshape = data->shape; const auto& vshape = valid_count->shape; CHECK_EQ(dshape.size(), 3) << "Input data should be 3-D."; CHECK_EQ(vshape.size(), 1) << "Input valid count should be 1-D."; // assign output type - reporter->Assign(types[2], TensorTypeNode::make(dshape, data->dtype)); + if (param->return_indices) { + std::vector oshape({dshape[0], dshape[1]}); + reporter->Assign(types[2], TensorTypeNode::make(oshape, Int(32))); + } else { + reporter->Assign(types[2], TensorTypeNode::make(dshape, data->dtype)); + } return true; } Expr MakeNMS(Expr data, Expr valid_count, - double overlap_threshold, + int max_output_size, + double iou_threshold, 
bool force_suppress, - int topk) { - auto attrs = make_node(); - attrs->overlap_threshold = overlap_threshold; + int top_k, + int id_index, + bool return_indices, + bool invalid_to_bottom) { + auto attrs = make_node(); + attrs->max_output_size = max_output_size; + attrs->iou_threshold = iou_threshold; attrs->force_suppress = force_suppress; - attrs->topk = topk; - static const Op& op = Op::Get("vision.nms"); + attrs->top_k = top_k; + attrs->id_index = id_index; + attrs->return_indices = return_indices; + attrs->invalid_to_bottom = invalid_to_bottom; + static const Op& op = Op::Get("vision.non_max_suppression"); return CallNode::make(op, {data, valid_count}, Attrs(attrs), {}); } -TVM_REGISTER_API("relay.op.vision._make.nms") +TVM_REGISTER_API("relay.op.vision._make.non_max_suppression") .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeNMS, args, rv); + runtime::detail::unpack_call(MakeNMS, args, rv); }); -RELAY_REGISTER_OP("vision.nms") -.describe(R"doc("Non-maximum suppression." +RELAY_REGISTER_OP("vision.non_max_suppression") +.describe(R"doc(Non-maximum suppression. The input boxes should +be in the format of [class_id, score, left, top, right, bottom]. +Set id_index to be -1 to ignore class_id axis. 
)doc" TVM_ADD_FILELINE) .set_num_inputs(2) .add_argument("data", "Tensor", "Input data.") diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index 2dfe20c503e6..4679876c181b 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -374,6 +374,11 @@ def verify(x_shape, y_shape, axes): verify((3, 4), (2, 3), (0)) verify((3, 4), (2, 3), (-1)) +def test_forward_l2_normalize(): + data = mx.sym.var('data') + mx_sym = mx.sym.L2Normalization(data, mode="channel") + verify_mxnet_frontend_impl(mx_sym, (2, 3, 4, 5), (2, 3, 4, 5)) + if __name__ == '__main__': test_forward_mlp() @@ -401,5 +406,6 @@ def verify(x_shape, y_shape, axes): test_forward_broadcast_ops() test_forward_elemwise_ops() test_forward_scalar_ops() - test_forward_slice_axis() test_forward_slice_like() + test_forward_slice_axis() + test_forward_l2_normalize() diff --git a/tests/python/relay/test_op_level10.py b/tests/python/relay/test_op_level10.py index 34285d2b18dd..7237cfbc3b87 100644 --- a/tests/python/relay/test_op_level10.py +++ b/tests/python/relay/test_op_level10.py @@ -2,6 +2,7 @@ """ import numpy as np import tvm +import topi.testing from tvm import relay from tvm.relay.testing import ctx_list import topi diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 003318f01a2f..eceedc760d4b 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -135,56 +135,107 @@ def verify_multibox_prior(x, dshape, ref_res, sizes=(1.0,), verify_multibox_prior(x, dshape, ref_res, clip=False, check_type_only=True) -def test_nms(): - def verify_nms(x0_data, x1_data, dshape, ref_res, valid_count, - overlap_threshold=0.5, force_suppress=False, topk=-1, +def test_get_valid_counts(): + def verify_get_valid_counts(dshape, score_threshold): + dtype = "float32" + batch_size, num_anchor, elem_length = dshape + np_data = 
np.random.uniform(size=dshape).astype(dtype) + np_out1 = np.zeros(shape=(batch_size,)) + np_out2 = np.zeros(shape=dshape).astype(dtype) + for i in range(batch_size): + np_out1[i] = 0 + inter_idx = 0 + for j in range(num_anchor): + score = np_data[i, j, 1] + if score >= score_threshold: + for k in range(elem_length): + np_out2[i, inter_idx, k] = np_data[i, j, k] + np_out1[i] += 1 + inter_idx += 1 + if j >= np_out1[i]: + for k in range(elem_length): + np_out2[i, j, k] = -1 + + x = relay.var("x", relay.ty.TensorType(dshape, dtype)) + z = relay.vision.get_valid_counts(x, score_threshold) + assert "score_threshold" in z.astext() + func = relay.Function([x], z.astuple()) + func = relay.ir_pass.infer_type(func) + ctx_list = [("llvm", tvm.cpu(0))] + for target, ctx in ctx_list: + intrp = relay.create_executor("debug", ctx=ctx, target=target) + out = intrp.evaluate(func)(np_data) + tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3) + tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3) + + verify_get_valid_counts((1, 2500, 6), 0) + verify_get_valid_counts((1, 2500, 6), -1) + verify_get_valid_counts((3, 1000, 6), 0.55) + verify_get_valid_counts((16, 500, 6), 0.95) + + +def test_non_max_suppression(): + def verify_nms(x0_data, x1_data, dshape, ref_res, ref_indices_res, + iou_threshold=0.5, force_suppress=False, top_k=-1, check_type_only=False): x0 = relay.var("x0", relay.ty.TensorType(dshape, "float32")) x1 = relay.var("x1", relay.ty.TensorType((dshape[0],), "int")) - z = relay.vision.nms(x0, x1, overlap_threshold, force_suppress, topk) - assert "overlap_threshold" in z.astext() + z = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, top_k, return_indices=False) + z_indices = relay.vision.non_max_suppression(x0, x1, -1, iou_threshold, force_suppress, top_k) + assert "iou_threshold" in z.astext() + assert "iou_threshold" in z_indices.astext() zz = relay.ir_pass.infer_type(z) + zz_indices = 
relay.ir_pass.infer_type(z_indices) assert zz.checked_type == relay.ty.TensorType(dshape, "float32") + assert zz_indices.checked_type == relay.ty.TensorType((dshape[0], dshape[1]), "int32") if check_type_only: return func = relay.Function([x0, x1], z) func = relay.ir_pass.infer_type(func) + func_indices = relay.Function([x0, x1], z_indices) + func_indices = relay.ir_pass.infer_type(func_indices) ctx_list = [("llvm", tvm.cpu(0))] for target, ctx in ctx_list: intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x0_data, x1_data) + op_indices_res1 = intrp1.evaluate(func_indices)(x0_data, x1_data) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + tvm.testing.assert_allclose(op_indices_res1.asnumpy(), ref_indices_res, rtol=1e-5) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res2 = intrp2.evaluate(func)(x0_data, x1_data) + op_indices_res2 = intrp2.evaluate(func_indices)(x0_data, x1_data) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + tvm.testing.assert_allclose(op_indices_res2.asnumpy(), ref_indices_res, rtol=1e-5) np_data = np.array([[[0, 0.8, 1, 20, 25, 45], [1, 0.7, 30, 60, 50, 80], [0, 0.4, 4, 21, 19, 40], [2, 0.9, 35, 61, 52, 79], [1, 0.5, 100, 60, 70, 110]]]).astype("float32") np_valid_count = np.array([4]).astype("int32") np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) + np_indices_result = np.array([[3, 0, -1, -1, -1]]) num_anchors = 5 dshape = (tvm.var("n"), num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], - force_suppress=True, topk=2, check_type_only=True) + verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, + force_suppress=True, top_k=2, check_type_only=True) dshape = (1, num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, 
np_result, dshape[0], - force_suppress=True, topk=2, check_type_only=False) + verify_nms(np_data, np_valid_count, dshape, np_result, np_indices_result, + force_suppress=True, top_k=2, check_type_only=False) np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [1, 0.7, 30, 60, 50, 80], [-1, 0.9, 35, 61, 52, 79], + [1, 0.7, 30, 60, 50, 80], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) + np_indices_result = np.array([[3, 0, 1, -1, -1]]) dshape = (tvm.var("n"), num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], - check_type_only=True) + verify_nms(np_data, np_valid_count, dshape, np_result, + np_indices_result, check_type_only=True) dshape = (1, num_anchors, 6) - verify_nms(np_data, np_valid_count, dshape, np_result, dshape[0], - topk=3) + verify_nms(np_data, np_valid_count, dshape, np_result, + np_indices_result, top_k=3) def test_multibox_transform_loc(): @@ -226,7 +277,7 @@ def test_default_value(): assert ret.checked_type == ref_type - nms = relay.vision.nms(mtl[0], mtl[1]) + nms = relay.vision.non_max_suppression(mtl[0], mtl[1], return_indices=False) func = relay.Function([cls_prob, loc_pred, anchors], nms) func = relay.ir_pass.infer_type(func) ctx_list = [("llvm", tvm.cpu(0))] @@ -411,8 +462,9 @@ def verify_yolo_reorg(shape, stride): test_resize() test_multibox_prior() test_multibox_transform_loc() - test_nms() + test_get_valid_counts() test_roi_align() test_proposal() test_yolo_reorg_infer_shape() test_yolo_reorg() + test_non_max_suppression() diff --git a/topi/include/topi/nn/l2_normalize.h b/topi/include/topi/nn/l2_normalize.h index a9fd49cbee64..4f9bdb61ab70 100644 --- a/topi/include/topi/nn/l2_normalize.h +++ b/topi/include/topi/nn/l2_normalize.h @@ -30,7 +30,12 @@ inline Tensor l2_normalize(const Tensor& data, const Array& axis, std::string name = "tensor", std::string tag = "l2_normalize") { - CHECK_EQ(data->shape.size(), 4) << "L2 normalization requires 4-D input"; + for (size_t i = 0; i < 
axis.size(); ++i) { + int ax = topi::detail::GetConstInt(axis[i]); + CHECK_LT(ax, data->shape.size()) << + "Axis " << ax << " exceeds input data dim " << + data->shape.size(); + } auto input_shape = data->shape; Tensor dot_value = topi::power(data, static_cast(2.0)); Tensor sum_value = topi::sum(dot_value, axis, true); diff --git a/topi/python/topi/cuda/nms.py b/topi/python/topi/cuda/nms.py index e0d71559f1a0..5f79de25e835 100644 --- a/topi/python/topi/cuda/nms.py +++ b/topi/python/topi/cuda/nms.py @@ -1,10 +1,10 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument """Non-maximum suppression operator""" import math import tvm from tvm import api -from topi.vision import nms +from topi.vision import non_max_suppression from ..util import get_const_tuple def sort_ir(data, index, output): @@ -181,13 +181,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): return body -@nms.register(["cuda", "gpu"]) -def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): +@non_max_suppression.register(["cuda", "gpu"]) +def nms_gpu(data, valid_count, return_indices, iou_threshold=0.5, force_suppress=False, + topk=-1, id_index=0, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. Parameters ---------- - data: tvm.Tensor + data : tvm.Tensor 3-D tensor with shape [batch_size, num_anchors, 6]. The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. @@ -195,15 +196,24 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk valid_count : tvm.Tensor 1-D tensor for valid number of boxes. - nms_threshold : float + return_indices : boolean + Whether to return box indices in input data. 
+ + iou_threshold : optional, float Non-maximum suppression threshold. - force_suppress : boolean + force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. - nms_topk : int + topk : optional, int Keep maximum top k detections before nms, -1 for no limit. + id_index : optional, int + index of the class categories, -1 to disable. + + invalid_to_bottom : optional, boolean + Whether to move all invalid bounding boxes to the bottom. + Returns ------- out : tvm.Tensor @@ -216,14 +226,13 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk # An example to use nms dshape = (1, 5, 6) data = tvm.placeholder(dshape, name="data") - valid_count = tvm.placeholder( - (dshape[0],), dtype="int32", name="valid_count") - nms_threshold = 0.7 + valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") + iou_threshold = 0.7 force_suppress = True - nms_topk = -1 - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) - np_data = np.random.uniform(size=dshape).astype("float32") - np_valid_count = np.array([4]).astype("int32") + topk = -1 + out = non_max_suppression(data, valid_count, iou_threshold, force_suppress, topk) + np_data = np.random.uniform(size=dshape).astype("float32") + np_valid_count = np.array([4]).astype("int32") s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "llvm") ctx = tvm.cpu() @@ -263,8 +272,8 @@ def nms_gpu(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk tvm.extern(data.shape, [data, sort_tensor, valid_count], lambda ins, outs: nms_ir( - ins[0], ins[1], ins[2], outs[0], nms_threshold, - force_suppress, nms_topk), + ins[0], ins[1], ins[2], outs[0], iou_threshold, + force_suppress, topk), dtype="float32", in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], tag="nms") diff --git a/topi/python/topi/cuda/ssd/multibox.py b/topi/python/topi/cuda/ssd/multibox.py index 746be092ebbe..11062824deb0 100644 --- a/topi/python/topi/cuda/ssd/multibox.py +++ 
b/topi/python/topi/cuda/ssd/multibox.py @@ -11,7 +11,7 @@ from topi.vision.ssd import multibox_prior from topi.vision.ssd import multibox_detection from topi.vision.ssd import multibox_transform_loc -from ..nms import nms +from ..nms import non_max_suppression def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): @@ -437,6 +437,6 @@ def multibox_detection_gpu(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = nms( + out = non_max_suppression( inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) return out diff --git a/topi/python/topi/cuda/vision.py b/topi/python/topi/cuda/vision.py index 17497abc0d8b..e3bc0fb9d547 100644 --- a/topi/python/topi/cuda/vision.py +++ b/topi/python/topi/cuda/vision.py @@ -162,3 +162,20 @@ def traverse(op): scheduled_ops.append(op) traverse(outs[0].op) return s + +@generic.schedule_get_valid_counts.register(["cuda", "gpu"]) +def schedule_get_valid_counts(outs): + """Schedule for get_valid_counts operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of get_valid_counts + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs) diff --git a/topi/python/topi/generic/vision.py b/topi/python/topi/generic/vision.py index 76e8545bfc52..bfd6c55d533a 100644 --- a/topi/python/topi/generic/vision.py +++ b/topi/python/topi/generic/vision.py @@ -36,6 +36,23 @@ def schedule_reorg(outs): cpp_target = cpp.TEST_create_target(target.target_name) return cpp.generic.default_schedule(cpp_target, outs, False) +@tvm.target.generic_func +def schedule_get_valid_counts(outs): + """Schedule for get_valid_counts + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of nms + in the format of an array of tensors. 
+ + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + return _default_schedule(outs, False) + @tvm.target.generic_func def schedule_nms(outs): """Schedule for non-maximum suppression diff --git a/topi/python/topi/testing/__init__.py b/topi/python/topi/testing/__init__.py index 0ccc422010c1..1743de13fd85 100644 --- a/topi/python/topi/testing/__init__.py +++ b/topi/python/topi/testing/__init__.py @@ -20,3 +20,4 @@ from .gather_nd_python import gather_nd_python from .strided_slice_python import strided_slice_python from .batch_matmul import batch_matmul +from .slice_axis_python import slice_axis_python diff --git a/topi/python/topi/testing/slice_axis_python.py b/topi/python/topi/testing/slice_axis_python.py new file mode 100644 index 000000000000..589e5914a36c --- /dev/null +++ b/topi/python/topi/testing/slice_axis_python.py @@ -0,0 +1,34 @@ +"""Slice axis in python""" + +def slice_axis_python(data, axis, begin, end=None): + """Slice input array along specific axis. + + Parameters + ---------- + data : numpy.ndarray + The source array to be sliced. + + axis : int + Axis to be sliced. + + begin: int + The index to begin with in the slicing. + + end: int, optional + The index indicating end of the slice. + + Returns + ------- + ret : numpy.ndarray + The computed result. 
+ """ + dshape = data.shape + if axis < 0: + axis += len(dshape) + if begin < 0: + begin += dshape[axis] + if end <= 0: + end += dshape[axis] + slc = [slice(None)] * len(dshape) + slc[axis] = slice(begin, end) + return data[tuple(slc)] diff --git a/topi/python/topi/vision/nms.py b/topi/python/topi/vision/nms.py index a41ee5b50089..169daea2d4d3 100644 --- a/topi/python/topi/vision/nms.py +++ b/topi/python/topi/vision/nms.py @@ -1,118 +1,247 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements """Non-maximum suppression operator""" import tvm -from tvm import api +from tvm import api, hybrid -def nms_ir(data, sort_result, valid_count, out, nms_threshold, force_suppress, nms_topk): - """Low level IR routing for transform location in multibox_detection operator. +@hybrid.script +def hybrid_rearrange_out(data): + """Hybrid routine to rearrange nms output to + move all valid entries to top. Parameters ---------- - data: Buffer - Buffer of output boxes with class and score. + data : tvm.Tensor or numpy NDArray + NMS output. 3-D tensor with shape + [batch_size, num_anchors, 6]. - sort_result : Buffer - Buffer of output box indexes sorted by score. + Returns + ------- + output : tvm.Tensor or numpy NDArray + Transformed NMS output. 3-D tensor with shape + [batch_size, num_anchors, 6]. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + elem_length = data.shape[2] + output = output_tensor((batch_size, + num_anchors, + elem_length), + data.dtype) - valid_count : Buffer - Buffer of number of valid output boxes. 
+ for i in parallel(batch_size): + valid_idx = 0 + for j in range(num_anchors): + if data[i, j, 0] >= 0: + for k in range(elem_length): + output[i, valid_idx, k] = data[i, j, k] + valid_idx += 1 + if j >= valid_idx: + for k in range(elem_length): + output[i, j, k] = -1.0 + return output - out : Buffer - Output buffer. - nms_threshold : float - Non-maximum suppression threshold. +@hybrid.script +def hybrid_get_valid_counts(data, score_threshold): + """Hybrid routine to get valid count of bounding boxes + given a score threshold. Also moves valid boxes to the + top of input data. + + Parameters + ---------- + data : tvm.Tensor or numpy NDArray + Input data. 3-D tensor with shape [batch_size, num_anchors, 6]. + + score_threshold : tvm.const + Lower limit of score for valid bounding boxes. + + Returns + ------- + out_tensor : tvm.Tensor or numpy NDArray + Rearranged data tensor. + + valid_count : tvm.Tensor or numpy NDArray + 1-D tensor for valid number of boxes. + """ + batch_size = data.shape[0] + num_anchors = data.shape[1] + box_data_length = data.shape[2] + valid_count = output_tensor((batch_size,), "int32") + out_tensor = output_tensor((batch_size, + num_anchors, + box_data_length), + data.dtype) + for i in parallel(batch_size): + valid_count[i] = 0 + for j in range(num_anchors): + score = data[i, j, 1] + if score > score_threshold: + for k in range(box_data_length): + out_tensor[i, valid_count[i], k] = data[i, j, k] + valid_count[i] += 1 + if j >= valid_count[i]: + for k in range(box_data_length): + out_tensor[i, j, k] = -1.0 + return valid_count, out_tensor + +@tvm.target.generic_func +def get_valid_counts(data, score_threshold=0): + """Get valid count of bounding boxes given a score threshold. + Also moves valid boxes to the top of input data. + + Parameters + ---------- + data : tvm.Tensor + Input data. 3-D tensor with shape [batch_size, num_anchors, 6]. + + score_threshold : optional, float + Lower limit of score for valid bounding boxes. 
+ + Returns + ------- + out_tensor : tvm.Tensor + Rearranged data tensor. + + valid_count : tvm.Tensor + 1-D tensor for valid number of boxes. + """ + score_threshold_const = tvm.const(score_threshold, "float") + return hybrid_get_valid_counts(data, score_threshold_const) + + +@hybrid.script +def hybrid_nms(data, sorted_index, valid_count, + max_output_size, iou_threshold, force_suppress, + top_k, id_index): + """Hybrid routing for non-maximum suppression. + + Parameters + ---------- + data: tvm.Tensor or numpy NDArray + Bounding boxes with class and score. 3-D tensor with shape + [batch_size, num_anchors, 6]. + + sorted_index : tvm.Tensor or numpy NDArray + Bounding box indexes sorted by score, with shape + [batch_size, num_anchors]. + + valid_count : tvm.Tensor or numpy NDArray + 1-D tensor for valid number of boxes. - force_suppress : boolean + max_output_size : tvm.const + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + + iou_threshold : tvm.const + Overlapping(IoU) threshold to suppress object with smaller score. + + force_suppress : tvm.const Whether to suppress all detections regardless of class_id. - nms_topk : int + top_k : tvm.const Keep maximum top k detections before nms, -1 for no limit. + id_index : tvm.const + index of the class categories, -1 to disable. + Returns ------- - stmt : Stmt - The result IR statement. + output : tvm.Tensor + 3-D tensor with shape [batch_size, num_anchors, 6]. + + box_indices: tvm.Tensor + 2-D tensor with shape [batch_size, num_anchors]. """ - def calculate_overlap(out_tensor, box_a_idx, box_b_idx): - """Calculate overlap of two boxes. 
- """ - w = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 2], out_tensor[box_b_idx + 2]) - - tvm.make.Max(out_tensor[box_a_idx], out_tensor[box_b_idx])) - h = tvm.make.Max(0.0, tvm.make.Min(out_tensor[box_a_idx + 3], out_tensor[box_b_idx + 3]) - - tvm.make.Max(out_tensor[box_a_idx + 1], out_tensor[box_b_idx + 1])) - i = w * h - u = (out_tensor[box_a_idx + 2] - out_tensor[box_a_idx]) * \ - (out_tensor[box_a_idx + 3] - out_tensor[box_a_idx + 1]) + \ - (out_tensor[box_b_idx + 2] - out_tensor[box_b_idx]) * \ - (out_tensor[box_b_idx + 3] - out_tensor[box_b_idx + 1]) - i - return tvm.expr.Select(u <= 0.0, 0.0, i / u) - - ib = tvm.ir_builder.create() - p_data = ib.buffer_ptr(data) - p_sort_result = ib.buffer_ptr(sort_result) - p_valid_count = ib.buffer_ptr(valid_count) - p_out = ib.buffer_ptr(out) - batch_size = out.shape[0] - num_anchors = out.shape[1] - - nms_threshold_node = tvm.make.node("FloatImm", dtype="float32", value=nms_threshold) - nms_topk_node = tvm.make.node("IntImm", dtype="int32", value=nms_topk) - force_suppress_node = tvm.make.node("IntImm", dtype="int32", value=1 if force_suppress else 0) - with ib.for_range(0, batch_size, for_type="parallel", name="n") as n: - with ib.if_scope(tvm.all(nms_threshold_node > 0, nms_threshold_node < 1, - p_valid_count[0] > 0)): - # Reorder output - nkeep = tvm.if_then_else( - tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n]), - nms_topk, p_valid_count[n]) - with ib.for_range(0, nkeep, name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[(n * num_anchors * 6 - + l * 6 + m)] = p_data[(n * num_anchors * 6 - + p_sort_result[n * num_anchors + l] * 6 + m)] - with ib.if_scope(tvm.all(nms_topk_node > 0, nms_topk < p_valid_count[n])): - with ib.for_range(0, p_valid_count[n] - nkeep, name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[(n * num_anchors * 6 - + (l + nkeep) * 6 + m)] = p_data[(n * num_anchors * 6 - + (l + nkeep) * 6 + m)] + batch_size = data.shape[0] + num_anchors = 
data.shape[1] + box_data_length = data.shape[2] + box_indices = output_tensor((batch_size, num_anchors), "int32") + output = output_tensor((batch_size, + num_anchors, + box_data_length,), + data.dtype) + + for i in parallel(batch_size): + if iou_threshold > 0: + if valid_count[i] > 0: + # Reorder output + nkeep = valid_count[i] + if 0 < top_k < nkeep: + nkeep = top_k + for j in range(nkeep): + for k in range(box_data_length): + output[i, j, k] = data[i, sorted_index[i, j], k] + box_indices[i, j] = sorted_index[i, j] + if 0 < top_k < valid_count[i]: + for j in range(valid_count[i] - nkeep): + for k in range(box_data_length): + output[i, j + nkeep, k] = -1.0 + box_indices[i, j + nkeep] = -1 # Apply nms - with ib.for_range(0, p_valid_count[n], name="l") as l: - offset_l = l * 6 - with ib.if_scope(p_out[n * num_anchors * 6 + offset_l] >= 0): - with ib.for_range(0, p_valid_count[n], name="m") as m: - offset_m = m * 6 - with ib.if_scope(tvm.all(m > l, p_out[n * num_anchors * 6 - + offset_m] >= 0)): - with ib.if_scope(tvm.any(force_suppress_node > 0, - p_out[n * num_anchors * 6 + offset_l] == - p_out[n * num_anchors * 6 + offset_m])): - # When force_suppress == True or class_id equals - iou = calculate_overlap(p_out, n * num_anchors * 6 + offset_l + 2, - n * num_anchors * 6 + offset_m + 2) - with ib.if_scope(iou >= nms_threshold): - p_out[n * num_anchors * 6 + offset_m] = -1.0 - with ib.else_scope(): - with ib.for_range(0, p_valid_count[n], name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[(n * num_anchors * 6 - + l * 6 + m)] = p_data[n * num_anchors * 6 + l * 6 + m] + for j in range(valid_count[i]): + if output[i, j, 0] >= 0: + for k in range(valid_count[i]): + check_iou = 0 + if k > j and output[i, k, 0] >= 0: + if force_suppress: + check_iou = 1 + elif id_index < 0 or output[i, j, 0] == output[i, k, 0]: + check_iou = 1 + if check_iou > 0: + batch_idx = i + box_a_idx = j + box_b_idx = k + box_start_idx = 2 + a_t = output[batch_idx, box_a_idx, 
box_start_idx + 1] + a_b = output[batch_idx, box_a_idx, box_start_idx + 3] + a_l = output[batch_idx, box_a_idx, box_start_idx] + a_r = output[batch_idx, box_a_idx, box_start_idx + 2] + b_t = output[batch_idx, box_b_idx, box_start_idx + 1] + b_b = output[batch_idx, box_b_idx, box_start_idx + 3] + b_l = output[batch_idx, box_b_idx, box_start_idx] + b_r = output[batch_idx, box_b_idx, box_start_idx + 2] + w = max(0.0, min(a_r, b_r) - max(a_l, b_l)) + h = max(0.0, min(a_b, b_b) - max(a_t, b_t)) + area = h * w + u = (a_r - a_l) * (a_b - a_t) + (b_r - b_l) * (b_b - b_t) - area + iou = 0.0 if u <= 0.0 else area / u + if iou >= iou_threshold: + output[i, k, 0] = -1.0 + box_indices[i, k] = -1 + else: + for j in range(valid_count[i]): + for k in range(box_data_length): + output[i, j, k] = data[i, j, k] + box_indices[i, j] = j # Set invalid entry to be -1 - with ib.for_range(0, num_anchors - p_valid_count[n], name="l") as l: - with ib.for_range(0, 6, name="m") as m: - p_out[n * num_anchors * 6 + (l + p_valid_count[n]) * 6 + m] = -1.0 - return ib.get() + for j in range(num_anchors - valid_count[i]): + for k in range(box_data_length): + output[i, j + valid_count[i], k] = -1.0 + box_indices[i, j + valid_count[i]] = -1 + # Only return max_output_size valid boxes + num_valid_boxes = 0 + if max_output_size > 0: + for j in range(valid_count[i]): + if output[i, j, 0] >= 0: + if num_valid_boxes == max_output_size: + for k in range(box_data_length): + output[i, j, k] = -1.0 + box_indices[i, j] = -1 + else: + num_valid_boxes += 1 + return output, box_indices @tvm.target.generic_func -def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1): +def non_max_suppression(data, valid_count, max_output_size=-1, + iou_threshold=0.5, force_suppress=False, top_k=-1, + id_index=0, return_indices=True, invalid_to_bottom=False): """Non-maximum suppression operator for object detection. 
Parameters ---------- - data: tvm.Tensor + data : tvm.Tensor 3-D tensor with shape [batch_size, num_anchors, 6]. The last dimension should be in format of [class_id, score, box_left, box_top, box_right, box_bottom]. @@ -120,15 +249,28 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) valid_count : tvm.Tensor 1-D tensor for valid number of boxes. - nms_threshold : float + max_output_size : optional, int + Max number of output valid boxes for each instance. + By default all valid boxes are returned. + + iou_threshold : optional, float Non-maximum suppression threshold. - force_suppress : boolean + force_suppress : optional, boolean Whether to suppress all detections regardless of class_id. - nms_topk : int + top_k : optional, int Keep maximum top k detections before nms, -1 for no limit. + id_index : optional, int + index of the class categories, -1 to disable. + + return_indices : optional, boolean + Whether to return box indices in input data. + + invalid_to_bottom : optional, boolean + Whether to move all valid bounding boxes to the top. + Returns ------- out : tvm.Tensor @@ -138,16 +280,17 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) -------- .. 
code-block:: python - # An example to use nms + # An example to use non_max_suppression dshape = (1, 5, 6) data = tvm.placeholder(dshape, name="data") valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") - nms_threshold = 0.7 + iou_threshold = 0.7 force_suppress = True - nms_topk = -1 - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) - np_data = np.random.uniform(size=dshape).astype("float32") - np_valid_count = np.array([4]).astype("int32") + top_k = -1 + out = non_max_suppression(data, valid_count, iou_threshold=iou_threshold, + force_suppress=force_suppress, top_k=top_k) + np_data = np.random.uniform(dshape) + np_valid_count = np.array([4]) s = topi.generic.schedule_nms(out) f = tvm.build(s, [data, valid_count, out], "llvm") ctx = tvm.cpu() @@ -161,7 +304,6 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) valid_count_dtype = "int32" valid_count_buf = api.decl_buffer(valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4) - data_buf = api.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) score_axis = 1 score_shape = (batch_size, num_anchors) score_tensor = tvm.compute(score_shape, lambda i, j: data[i, j, score_axis]) @@ -180,13 +322,13 @@ def nms(data, valid_count, nms_threshold=0.5, force_suppress=False, nms_topk=-1) in_buffers=[score_tensor_buf, valid_count_buf], out_buffers=sort_tensor_buf, name="nms_sort") - out = \ - tvm.extern(data.shape, - [data, sort_tensor, valid_count], - lambda ins, outs: nms_ir( - ins[0], ins[1], ins[2], outs[0], nms_threshold, - force_suppress, nms_topk), - dtype="float32", - in_buffers=[data_buf, sort_tensor_buf, valid_count_buf], - tag="nms") - return out + out, box_indices = hybrid_nms(data, sort_tensor, valid_count, + tvm.const(max_output_size, dtype="int32"), + tvm.const(iou_threshold, dtype="float32"), + tvm.const(force_suppress, dtype="bool"), + tvm.const(top_k, dtype="int32"), + tvm.const(id_index, 
dtype="int32")) + if not return_indices and invalid_to_bottom: + out = hybrid_rearrange_out(out) + + return box_indices if return_indices else out diff --git a/topi/python/topi/vision/ssd/multibox.py b/topi/python/topi/vision/ssd/multibox.py index f1de42430dd6..2de1723dbd7b 100644 --- a/topi/python/topi/vision/ssd/multibox.py +++ b/topi/python/topi/vision/ssd/multibox.py @@ -1,75 +1,76 @@ -# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments +# pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable """SSD multibox operators""" from __future__ import absolute_import as _abs -import math import tvm -from tvm import api +from tvm import hybrid +from tvm.intrin import exp, sqrt import topi -from ..nms import nms +from ..nms import non_max_suppression -def multibox_prior_ir(data, out, sizes, ratios, steps, offsets): - """Low level IR routing for multibox_prior operator. +@hybrid.script +def hybrid_multibox_prior(data, sizes, ratios, steps, offsets): + """Hybrid routing for multibox_prior operator. Parameters ---------- - data : Buffer - Input data buffer. + data : tvm.Tensor or numpy NDArray + 4-D tensor with shape [batch, channel, height, width]] - out : Buffer - Output buffer. + sizes : tvm ConsExpr + Sizes for anchor boxes. - sizes : tuple of float - Tuple of sizes for anchor boxes. - - ratios : tuple of float - Tuple of ratios for anchor boxes. + ratios : tvm ConsExpr + Ratios for anchor boxes. - steps : Tuple of float + steps : tvm ConsExpr Priorbox step across y and x, -1 for auto calculation. - offsets : tuple of int + offsets : tvm ConsExpr Priorbox center offsets, y and x respectively. Returns ------- - stmt : Stmt - The result IR statement. 
+ output : tvm.Tensor or numpy NDArray + 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ - ib = tvm.ir_builder.create() - p_out = ib.buffer_ptr(out) in_height = data.shape[2] in_width = data.shape[3] num_sizes = len(sizes) num_ratios = len(ratios) - size_ratio_concat = sizes + ratios - steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height - steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width + num_boxes = in_height * in_width * (num_sizes + num_ratios - 1) + output = output_tensor((1, num_boxes, 4), "float32") + steps_h = steps[0] * 1.0 if steps[0] > 0 else 1.0 / in_height + steps_w = steps[1] * 1.0 if steps[1] > 0 else 1.0 / in_width offset_h = offsets[0] offset_w = offsets[1] - with ib.for_range(0, in_height, for_type="parallel", name="i") as i: + # Need to define var out of const_range + if + w = 0.0 + h = 0.0 + + for i in parallel(in_height): center_h = (i + offset_h) * steps_h - with ib.for_range(0, in_width, name="j") as j: + for j in range(in_width): center_w = (j + offset_w) * steps_w - for k in range(num_sizes + num_ratios - 1): - w = tvm.if_then_else(k < num_sizes, - size_ratio_concat[k] * in_height / in_width / 2.0, - size_ratio_concat[0] * in_height / in_width * - math.sqrt(size_ratio_concat[k + 1]) / 2.0) - h = tvm.if_then_else( - k < num_sizes, size_ratio_concat[k] / 2.0, - size_ratio_concat[0] / math.sqrt(size_ratio_concat[k + 1]) / 2.0) - count = (i * in_width * (num_sizes + num_ratios - 1) + - j * (num_sizes + num_ratios - 1) + k) * 4 - p_out[count] = center_w - w - p_out[count + 1] = center_h - h - p_out[count + 2] = center_w + w - p_out[count + 3] = center_h + h - - return ib.get() + for k in const_range(num_sizes + num_ratios - 1): + if k < num_sizes: + w = sizes[k] * in_height / in_width / 2.0 + h = sizes[k] / 2.0 + else: + w = sizes[0] * in_height / in_width \ + * sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 + h = sizes[0] / sqrt(ratios[k - num_sizes + 1] * 1.0) / 2.0 + count = i * in_width * (num_sizes + 
num_ratios - 1) \ + + j * (num_sizes + num_ratios - 1) + k + output[0, count, 0] = center_w - w + output[0, count, 1] = center_h - h + output[0, count, 2] = center_w + w + output[0, count, 3] = center_h + h + + return output @tvm.target.generic_func @@ -101,115 +102,120 @@ def multibox_prior(data, sizes=(1,), ratios=(1,), steps=(-1, -1), offsets=(0.5, out : tvm.Tensor 3-D tensor with shape [1, h_in * w_in * (num_sizes + num_ratios - 1), 4] """ - num_sizes = len(sizes) - num_ratios = len(ratios) - oshape = (1, data.shape[2] * data.shape[3] * (num_sizes + num_ratios - 1), 4) - out = tvm.extern(oshape, [data], lambda ins, outs: - multibox_prior_ir(ins[0], outs[0], sizes, ratios, steps, offsets), - tag="multibox_prior") + out = hybrid_multibox_prior(data, tvm.convert(sizes), tvm.convert(ratios), + tvm.convert(steps), tvm.convert(offsets)) if clip: out = topi.clip(out, 0, 1) return out -def transform_loc_ir(cls_prob, loc_pred, anchor, valid_count, out, clip, threshold, variances): - """Low level IR routing for transform location in multibox_detection operator. +@hybrid.script +def _hybridy_transform_loc(box, pred_loc, variance, clip): + """Transform prior anchor box to output box through location predictions. 
+ """ + al = box[0] + at = box[1] + ar = box[2] + ab = box[3] + + px = pred_loc[0] + py = pred_loc[1] + pw = pred_loc[2] + ph = pred_loc[3] + + vx = variance[0] + vy = variance[1] + vw = variance[2] + vh = variance[3] + + output = output_tensor((4,), pred_loc.dtype) + + aw = ar - al + ah = ab - at + ax = (al + ar) / 2.0 + ay = (at + ab) / 2.0 + ox = px * vx * aw + ax + oy = py * vy * ah + ay + ow = exp(pw * vw) * aw / 2.0 + oh = exp(ph * vh) * ah / 2.0 + output[0] = max(0.0, min(1.0, ox - ow)) if clip else ox - ow + output[1] = max(0.0, min(1.0, oy - oh)) if clip else oy - oh + output[2] = max(0.0, min(1.0, ox + ow)) if clip else ox + ow + output[3] = max(0.0, min(1.0, oy + oh)) if clip else oy + oh + return output + +@hybrid.script +def hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, + clip, threshold, variances): + """Hybrid routing for transform location in multibox_detection operator. Parameters ---------- - cls_prob : Buffer - Buffer of class probabilities. + cls_prob : tvm.Tensor or numpy NDArray + 3-D tensor of class probabilities. - loc_pred : Buffer - Buffer of location regression predictions. + loc_pred : tvm.Tensor or numpy NDArray + 2-D tensor of location regression predictions. - anchor : Buffer - Buffer of prior anchor boxes. + anchor : tvm.Tensor or numpy NDArray + 3-D tensor of prior anchor boxes. - valid_count : Buffer - Buffer of number of valid output boxes. - - out : Buffer - Output buffer. - - clip : boolean + clip : tvm.const Whether to clip out-of-boundary boxes. - threshold : float + threshold : tvm.const Threshold to be a positive prediction. - variances : tuple of float + variances : tvm.ndarray Variances to be decoded from box regression output. Returns ------- - stmt : Stmt - The result IR statement. - """ - def transform_loc(loc, loc_base_idx, anchor, anchor_base_idx, clip, vx, vy, vw, vh): - """Transform prior anchor box to output box through location predictions. 
- """ - al = anchor[anchor_base_idx] - at = anchor[anchor_base_idx + 1] - ar = anchor[anchor_base_idx + 2] - ab = anchor[anchor_base_idx + 3] - aw = ar - al - ah = ab - at - ax = (al + ar) / 2.0 - ay = (at + ab) / 2.0 - px = loc[loc_base_idx] - py = loc[loc_base_idx + 1] - pw = loc[loc_base_idx + 2] - ph = loc[loc_base_idx + 3] - ox = px * vx * aw + ax - oy = py * vy * ah + ay - ow = tvm.exp(pw * vw) * aw / 2.0 - oh = tvm.exp(ph * vh) * ah / 2.0 - return tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox - ow)), ox - ow), \ - tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy - oh)), oy - oh), \ - tvm.if_then_else(clip, tvm.max(0, tvm.min(1, ox + ow)), ox + ow), \ - tvm.if_then_else(clip, tvm.max(0, tvm.min(1, oy + oh)), oy + oh) + out_loc : tvm.Tensor or numpy NDArray + 3-D tensor of transformed location. + valid_count : tvm.Tensor or numpy NDArray + 1_d tensor of valid counts for boxes. + """ batch_size = cls_prob.shape[0] num_classes = cls_prob.shape[1] num_anchors = cls_prob.shape[2] - - ib = tvm.ir_builder.create() - p_cls_prob = ib.buffer_ptr(cls_prob) - p_loc_pred = ib.buffer_ptr(loc_pred) - p_anchor = ib.buffer_ptr(anchor) - p_valid_count = ib.buffer_ptr(valid_count) - p_out = ib.buffer_ptr(out) - with ib.for_range(0, batch_size, for_type="parallel", name="n") as n: - p_valid_count[n] = 0 - with ib.for_range(0, num_anchors, name="i") as i: + box_coord = allocate((4,), loc_pred.dtype) + pred_coord = allocate((4,), loc_pred.dtype) + out_loc = output_tensor((batch_size, num_anchors, 6), + loc_pred.dtype) + valid_count = output_tensor((batch_size,), "int32") + + for i in parallel(batch_size): + valid_count[i] = 0 + for j in range(num_anchors): # Find the predicted class id and probability - score = ib.allocate('float32', (1,), name="score", scope="local") - cls_id = ib.allocate('int32', (1,), name="id", scope="local") - score[0] = -1.0 - cls_id[0] = 0 - with ib.for_range(0, num_classes, name="j") as j: - with ib.if_scope(j > 0): - temp = p_cls_prob[n * num_anchors * 
num_classes + j * num_anchors + i] - cls_id[0] = tvm.if_then_else(temp > score[0], j, cls_id[0]) - score[0] = tvm.max(temp, score[0]) - with ib.if_scope(tvm.all(cls_id[0] > 0, score[0] < threshold)): - cls_id[0] = 0 + score = -1.0 + cls_id = 0 + for k in range(num_classes): + if k > 0: + temp = cls_prob[i, k, j] + cls_id = k if temp > score else cls_id + score = max(temp, score) + if cls_id > 0 and score < threshold: + cls_id = 0 # [id, prob, xmin, ymin, xmax, ymax] # Remove background, restore original id - with ib.if_scope(cls_id[0] > 0): - out_base_idx = n * num_anchors * 6 + p_valid_count[n] * 6 - p_out[out_base_idx] = cls_id[0] - 1.0 - p_out[out_base_idx + 1] = score[0] - offset = i * 4 - p_out[out_base_idx + 2], p_out[out_base_idx + 3], p_out[out_base_idx + 4], \ - p_out[out_base_idx + 5] = transform_loc(p_loc_pred, n * num_anchors * 4 + offset, - p_anchor, offset, clip, variances[0], - variances[1], variances[2], variances[3]) - p_valid_count[n] += 1 - - return ib.get() - + if cls_id > 0: + out_loc[i, valid_count[i], 0] = cls_id - 1.0 + out_loc[i, valid_count[i], 1] = score + for l in range(4): + box_coord[l] = anchor[0, j, l] + pred_coord[l] = loc_pred[i, j * 4 + l] + out_coord = _hybridy_transform_loc(box_coord, pred_coord, + variances, clip) + out_loc[i, valid_count[i], 2] = out_coord[0] + out_loc[i, valid_count[i], 3] = out_coord[1] + out_loc[i, valid_count[i], 4] = out_coord[2] + out_loc[i, valid_count[i], 5] = out_coord[3] + valid_count[i] += 1 + + return out_loc, valid_count @tvm.target.generic_func def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, @@ -240,24 +246,10 @@ def multibox_transform_loc(cls_prob, loc_pred, anchor, clip=True, threshold=0.01 ------- ret : tuple of tvm.Tensor """ - batch_size = cls_prob.shape[0] - num_anchors = anchor.shape[1] - oshape = (batch_size, num_anchors, 6) - # Define data alignment for intermediate buffer - valid_count_dtype = "int32" - valid_count_buf = api.decl_buffer((batch_size,), 
valid_count_dtype, - "valid_count_buf", data_alignment=4) - out_buf = api.decl_buffer(oshape, cls_prob.dtype, "out_buf", data_alignment=8) - valid_count, out = \ - tvm.extern([(batch_size,), oshape], - [cls_prob, loc_pred, anchor], - lambda ins, outs: transform_loc_ir( - ins[0], ins[1], ins[2], outs[0], outs[1], clip, threshold, variances), - dtype=[valid_count_dtype, cls_prob.dtype], - out_buffers=[valid_count_buf, out_buf], - tag="multibox_transform_loc") - return [out, valid_count] - + return hybrid_multibox_transform_loc(cls_prob, loc_pred, anchor, + tvm.const(clip, "bool"), + tvm.const(threshold, "float32"), + tvm.convert(variances)) @tvm.target.generic_func def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nms_threshold=0.5, @@ -300,5 +292,7 @@ def multibox_detection(cls_prob, loc_pred, anchor, clip=True, threshold=0.01, nm """ inter_out = multibox_transform_loc(cls_prob, loc_pred, anchor, clip, threshold, variances) - out = nms(inter_out[0], inter_out[1], nms_threshold, force_suppress, nms_topk) + out = non_max_suppression(inter_out[0], inter_out[1], -1, + nms_threshold, force_suppress, nms_topk, + return_indices=False) return out diff --git a/topi/tests/python/test_topi_vision.py b/topi/tests/python/test_topi_vision.py index 3c0c3aa854d7..02e04212b63e 100644 --- a/topi/tests/python/test_topi_vision.py +++ b/topi/tests/python/test_topi_vision.py @@ -8,11 +8,62 @@ from tvm.contrib.pickle_memoize import memoize from topi.util import get_const_tuple -from topi.vision import ssd, nms +from topi.vision import ssd, non_max_suppression, get_valid_counts + + +def verify_get_valid_counts(dshape, score_threshold): + dtype = "float32" + batch_size, num_anchor, elem_length = dshape + np_data = np.random.uniform(size=dshape).astype(dtype) + np_out1 = np.zeros(shape=(batch_size,)) + np_out2 = np.zeros(shape=dshape).astype(dtype) + for i in range(batch_size): + np_out1[i] = 0 + inter_idx = 0 + for j in range(num_anchor): + score = np_data[i, j, 
1] + if score > score_threshold: + for k in range(elem_length): + np_out2[i, inter_idx, k] = np_data[i, j, k] + np_out1[i] += 1 + inter_idx += 1 + if j >= np_out1[i]: + for k in range(elem_length): + np_out2[i, j, k] = -1.0 + def check_device(device): + ctx = tvm.context(device, 0) + if not ctx.exist: + print("Skip because %s is not enabled" % device) + return + print("Running on target: %s" % device) + with tvm.target.create(device): + data = tvm.placeholder(dshape, name="data", dtype=dtype) + outs = get_valid_counts(data, score_threshold) + s = topi.generic.schedule_multibox_prior(outs) + + tvm_input_data = tvm.nd.array(np_data, ctx) + tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx) + tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx) + f = tvm.build(s, [data, outs[0], outs[1]], device) + f(tvm_input_data, tvm_out1, tvm_out2) + tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) + tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) -def test_nms(): + for device in ['llvm']: + check_device(device) + + +def test_get_valid_counts(): + verify_get_valid_counts((1, 2500, 6), 0) + verify_get_valid_counts((1, 2500, 6), -1) + verify_get_valid_counts((3, 1000, 6), 0.55) + verify_get_valid_counts((16, 500, 6), 0.95) + + +def test_non_max_suppression(): dshape = (1, 5, 6) + indices_dshape = (1, 5) data = tvm.placeholder(dshape, name="data") valid_count = tvm.placeholder((dshape[0],), dtype="int32", name="valid_count") nms_threshold = 0.7 @@ -24,8 +75,9 @@ def test_nms(): [1, 0.5, 100, 60, 70, 110]]]).astype(data.dtype) np_valid_count = np.array([4]).astype(valid_count.dtype) np_result = np.array([[[2, 0.9, 35, 61, 52, 79], [0, 0.8, 1, 20, 25, 45], - [0, 0.4, 4, 21, 19, 40], [-1, 0.9, 35, 61, 52, 79], + [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1], [-1, -1, -1, -1, -1, -1]]]) + np_indices_result = np.array([[3, 0, -1, -1, -1]]) def check_device(device): ctx = tvm.context(device, 0) @@ -35,18 +87,27 @@ 
def check_device(device): print("Running on target: %s" % device) with tvm.target.create(device): if device == 'llvm': - out = nms(data, valid_count, nms_threshold, force_suppress, nms_topk) + out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False) + indices_out = non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk) else: - out = topi.cuda.nms(data, valid_count, nms_threshold, force_suppress, nms_topk) + out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk, return_indices=False) + indices_out = topi.cuda.non_max_suppression(data, valid_count, -1, nms_threshold, force_suppress, nms_topk) s = topi.generic.schedule_nms(out) + indices_s = topi.generic.schedule_nms(indices_out) tvm_data = tvm.nd.array(np_data, ctx) tvm_valid_count = tvm.nd.array(np_valid_count, ctx) + tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) f = tvm.build(s, [data, valid_count, out], device) f(tvm_data, tvm_valid_count, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4) + tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"), ctx) + f = tvm.build(indices_s, [data, valid_count, indices_out], device) + f(tvm_data, tvm_valid_count, tvm_indices_out) + tvm.testing.assert_allclose(tvm_indices_out.asnumpy(), np_indices_result, rtol=1e-4) + for device in ['llvm']: check_device(device) @@ -274,7 +335,8 @@ def test_proposal(): if __name__ == "__main__": - test_nms() + test_get_valid_counts() + test_non_max_suppression() test_multibox_prior() test_multibox_detection() test_roi_align() diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py new file mode 100644 index 000000000000..6a5d63b9f8cf --- /dev/null +++ b/tutorials/frontend/deploy_ssd_gluoncv.py @@ -0,0 +1,104 @@ +""" +Deploy Single Shot Multibox Detector(SSD) model +=============================================== 
+**Author**: `Yao Wang `_ + +This article is an introductory tutorial to deploy SSD models with TVM. +We will use a GluonCV pre-trained SSD model and convert it to Relay IR. +""" +import tvm + +from matplotlib import pyplot as plt +from nnvm import compiler +from nnvm.frontend import from_mxnet +from nnvm.testing.config import ctx_list +from tvm import relay +from tvm.contrib import graph_runtime +from gluoncv import model_zoo, data, utils + + +###################################################################### +# Preliminary and Set parameters +# ------------------------------ +# We need to build TVM with sort support enabled. In the TVM root directory: +# +# .. code-block:: bash +# +# echo "set(USE_SORT ON)" > config.mk +# make -j8 +# +# .. note:: +# +# Currently we support compiling SSD on CPU only. +# GPU support is in progress. +# +# To get the best inference performance on CPU, change the +# target argument according to your device and +# follow the :ref:`tune_relay_x86` tutorial to tune for x86 CPUs and +# :ref:`tune_relay_arm` for ARM CPUs. +# +# SSD with VGG as body network is not supported yet since +# x86 conv2d schedule doesn't support dilation. + +supported_model = [ + 'ssd_512_resnet18_v1_voc', + 'ssd_512_resnet18_v1_coco', + 'ssd_512_resnet50_v1_voc', + 'ssd_512_resnet50_v1_coco', + 'ssd_512_resnet101_v2_voc', + 'ssd_512_mobilenet1_0_voc', + 'ssd_512_mobilenet1_0_coco', +] + +model_name = "ssd_512_resnet50_v1_voc" +dshape = (1, 3, 512, 512) +dtype = "float32" +target_list = ctx_list() + +###################################################################### +# Download and pre-process demo image + +im_fname = utils.download('https://github.com/dmlc/web-data/blob/master/' + + 'gluoncv/detection/street_small.jpg?raw=true', + path='street_small.jpg') +x, img = data.transforms.presets.ssd.load_test(im_fname, short=512) + +###################################################################### +# Convert and compile model for CPU.
+ +block = model_zoo.get_model(model_name, pretrained=True) + +def compile(target): + net, params = relay.frontend.from_mxnet(block, {"data": dshape}) + with relay.build_config(opt_level=3): + graph, lib, params = relay.build(net, target, params=params) + return graph, lib, params + +###################################################################### +# Create TVM runtime and do inference + +def run(graph, lib, params, ctx): + # Build TVM runtime + m = graph_runtime.create(graph, lib, ctx) + tvm_input = tvm.nd.array(x.asnumpy(), ctx=ctx) + m.set_input('data', tvm_input) + m.set_input(**params) + # execute + m.run() + # get outputs + class_IDs, scores, bounding_boxs = m.get_output(0), m.get_output(1), m.get_output(2) + return class_IDs, scores, bounding_boxs + +for target, ctx in target_list: + if target == "cuda": + print("GPU not supported yet, skip.") + continue + graph, lib, params = compile(target) + class_IDs, scores, bounding_boxs = run(graph, lib, params, ctx) + +###################################################################### +# Display result + +ax = utils.viz.plot_bbox(img, bounding_boxs.asnumpy()[0], scores.asnumpy()[0], + class_IDs.asnumpy()[0], class_names=block.classes) +plt.show() diff --git a/tutorials/nnvm/deploy_ssd.py b/tutorials/nnvm/deploy_ssd_mxnet.py similarity index 98% rename from tutorials/nnvm/deploy_ssd.py rename to tutorials/nnvm/deploy_ssd_mxnet.py index eadb8fd28e0c..1a71c96eaa0c 100644 --- a/tutorials/nnvm/deploy_ssd.py +++ b/tutorials/nnvm/deploy_ssd_mxnet.py @@ -61,7 +61,7 @@ image_url = "https://cloud.githubusercontent.com/assets/3307514/20012567/" \ "cbb60336-a27d-11e6-93ff-cbc3f09f5c9e.jpg" inference_symbol_folder = \ -"c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26" + "c1904e900848df4548ce5dfb18c719c7-a28c4856c827fe766aa3da0e35bad41d44f0fb26" inference_symbol_url = "https://gist.github.com/kevinthesun/c1904e900848df4548ce5dfb18c719c7/" \ "archive/a28c4856c827fe766aa3da0e35bad41d44f0fb26.zip" 
From 0128af8c692326fdc0792da2e04d744b37359745 Mon Sep 17 00:00:00 2001 From: Andrew Tulloch Date: Mon, 11 Mar 2019 12:55:01 -0700 Subject: [PATCH 92/93] Implement flop support for int8 models (#2776) --- python/tvm/autotvm/task/task.py | 6 +- .../unittest/test_autotvm_flop_calculator.py | 73 ++++++++++++++++--- 2 files changed, 69 insertions(+), 10 deletions(-) diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index 7c587fe39783..a0c992b07347 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -338,7 +338,7 @@ def _count_flop(exp): expr.Max, expr.Min, expr.EQ, expr.NE, expr.LT, expr.LE, expr.GT, expr.GE, expr.And, expr.Or, expr.Not)): - base = 1 if "float" in exp.a.dtype else 0 + base = 1 if isinstance(exp, expr.Not): # unary return base + _count_flop(exp.a) @@ -348,6 +348,10 @@ def _count_flop(exp): return _count_flop(exp.condition) + max(_count_flop(exp.true_value), _count_flop(exp.false_value)) if isinstance(exp, expr.Call): + if exp.call_type == expr.Call.Halide: + # Ignore flops from indexing expressions. 
+ return 0 + return sum([_count_flop(x) for x in exp.args]) raise FlopCalculationError("Found unsupported operator in the compute expr") diff --git a/tests/python/unittest/test_autotvm_flop_calculator.py b/tests/python/unittest/test_autotvm_flop_calculator.py index 27bd49fe14df..c5c046894f0c 100644 --- a/tests/python/unittest/test_autotvm_flop_calculator.py +++ b/tests/python/unittest/test_autotvm_flop_calculator.py @@ -5,11 +5,17 @@ from tvm.autotvm.task.task import compute_flop +def random_dtypes(): + """Return pair of (input, accumulator) dtypes""" + candidates = [("float32", "float32"), ("float16", "float32"), ("int8", "int32")] + return candidates[np.random.choice(len(candidates))] + def test_conv(): for i in range(5): N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)] - D = tvm.placeholder((N, CI, H, W)) - K = tvm.placeholder((CO, CI, KH, KW)) + (input_dtype, acc_dtype) = random_dtypes() + D = tvm.placeholder((N, CI, H, W), dtype=input_dtype) + K = tvm.placeholder((CO, CI, KH, KW), dtype=input_dtype) KH = min(H, KH) KW = min(W, KW) @@ -22,7 +28,8 @@ def test_conv(): OW = (W - KW) + 1 C = tvm.compute((N, CO, OH, OW), lambda n, co, h, w: - tvm.sum(D[n][ci][h][w] * K[co][ci][h][w], axis=[ci, kh, kw])) + tvm.sum(D[n][ci][h][w].astype(acc_dtype) * K[co][ci][h][w].astype(acc_dtype), + axis=[ci, kh, kw])) s = tvm.create_schedule([C.op]) @@ -31,15 +38,16 @@ def test_conv(): def test_pack_gemm(): for i in range(5): N, L, M = [np.random.randint(10, 128) * 4 for _ in range(3)] - A = tvm.placeholder((N, L)) - B = tvm.placeholder((M, L)) + (input_dtype, acc_dtype) = random_dtypes() + A = tvm.placeholder((N, L), dtype=input_dtype) + B = tvm.placeholder((M, L), dtype=input_dtype) k = tvm.reduce_axis((0, L)) bn = 4 A_pack = tvm.compute((N // bn, L, bn), lambda i, j, k: A[i * bn + k][j]) B_pack = tvm.compute((M // bn, L, bn), lambda i, j, k: B[i * bn + k][j]) C_pack = tvm.compute((N // bn, M // bn, bn, bn), lambda i, j, ii, jj: - tvm.sum(A_pack[i, k, ii] 
* B_pack[j, k, jj], axis=[k])) + tvm.sum(A_pack[i, k, ii].astype(acc_dtype) * B_pack[j, k, jj].astype(acc_dtype), axis=[k])) C = tvm.compute((N, M), lambda i, j: C_pack[i // bn][j // bn][i % bn][j % bn]) s = tvm.create_schedule([C.op]) @@ -48,14 +56,61 @@ def test_pack_gemm(): def test_outer_dot(): for i in range(5): N, M = [np.random.randint(10, 128) * 4 for _ in range(2)] - A = tvm.placeholder((N,)) - B = tvm.placeholder((M,)) + (input_dtype, acc_dtype) = random_dtypes() + A = tvm.placeholder((N,), dtype=input_dtype) + B = tvm.placeholder((M,), dtype=input_dtype) - C = tvm.compute((N, M), lambda i, j: A[i] * B[j]) + C = tvm.compute((N, M), lambda i, j: A[i].astype(acc_dtype) * B[j].astype(acc_dtype)) s = tvm.create_schedule([C.op]) assert compute_flop(s) == N * M +def test_max_pool(): + for i in range(5): + N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)] + (input_dtype, _) = random_dtypes() + D = tvm.placeholder((N, CI, H, W), dtype=input_dtype) + + KH = min(H, KH) + KW = min(W, KW) + + kh = tvm.reduce_axis((0, KH)) + kw = tvm.reduce_axis((0, KW)) + + OH = (H - KH) + 1 + OW = (W - KW) + 1 + + C = tvm.compute( + (N, CO, OH, OW), + lambda n, co, h, w: tvm.max(D[n][co][h + kh][w + kw], axis=[kh, kw])) + + s = tvm.create_schedule([C.op]) + + assert compute_flop(s) == N * CO * OH * OW * KH * KW + +def test_average_pool(): + for i in range(5): + N, H, W, CO, CI, KH, KW = [np.random.randint(10, 32) for _ in range(7)] + (input_dtype, acc_dtype) = random_dtypes() + D = tvm.placeholder((N, CI, H, W), dtype=input_dtype) + + KH = min(H, KH) + KW = min(W, KW) + + kh = tvm.reduce_axis((0, KH)) + kw = tvm.reduce_axis((0, KW)) + + OH = (H - KH) + 1 + OW = (W - KW) + 1 + + C = tvm.compute( + (N, CO, OH, OW), + lambda n, co, h, w: tvm.sum(D[n][co][h + kh][w + kw].astype(acc_dtype) / (KW * KH), axis=[kh, kw])) + + s = tvm.create_schedule([C.op]) + + assert compute_flop(s) == 2 * N * CO * OH * OW * KH * KW + def test_move(): """No float number operation in 
simple move. So the estimator should raise an error """ N = 1024 From cc12f7d5371c89ab51936ce35bd06917d298082f Mon Sep 17 00:00:00 2001 From: SasakiSaki Date: Tue, 12 Mar 2019 05:57:59 +0800 Subject: [PATCH 93/93] [Relay] Improve more operator mxnet frontend importer (#2772) --- python/tvm/relay/frontend/mxnet.py | 72 ++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 8 deletions(-) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index 1585d55ac1b9..93bd8efc6752 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -298,6 +298,51 @@ def _mx_leaky_relu(inputs, attrs): raise RuntimeError("act_type: {} is not supported".format(act_type)) +def _mx_make_power(power): + def _impl(inputs, _): # Note: no attrs + assert len(inputs) == 1 + scalar = _expr.const(power, dtype=None) + # Note: int maps to "int32", float maps to "float32" + return _op.power(inputs[0], scalar) + return _impl + + +def _mx_make_exponent(base): + # exp(b, x) = e^b * e^x + def _impl(inputs, _): # Note: no attrs + assert len(inputs) == 1 + scalar = _op.exp(_expr.const(base, dtype="float32")) + return _op.multiply(inputs[0], scalar) + return _impl + + +def _mx_make_logarithm(base): + # log(b, x) = log(x) / log(b) + def _impl(inputs, _): # Note: no attrs + assert len(inputs) == 1 + scalar = _op.log(_expr.const(base, dtype="float32")) + return _op.divide(inputs[0], scalar) + return _impl + + +def _mx_expm1(): + # exp_minus_1 x = exp(x) - 1 + def _impl(inputs, _): # Note: no attrs + assert len(inputs) == 1 + one = _expr.const(1, dtype="float32") + return _op.log(_op.subtract(inputs[0], one)) + return _impl + + +def _mx_log1p(): + # 1_plus_log x = log(x + 1) + def _impl(inputs, _): # Note: no attrs + assert len(inputs) == 1 + one = _expr.const(1, dtype="float32") + return _op.log(_op.add(inputs[0], one)) + return _impl + + def _mx_lrn(inputs, attrs): new_attrs = {} new_attrs["alpha"] = attrs.get_float("alpha", 0.0001) @@ 
-450,7 +495,6 @@ def _mx_l2_normalize(inputs, attrs): "exp", "sigmoid", "tanh", - "exp", "negative", "reshape_like", "zeros_like", @@ -482,6 +526,20 @@ def _mx_l2_normalize(inputs, attrs): "_minimum" : _rename(_op.minimum), "flatten" : _rename(_op.nn.batch_flatten), "Flatten" : _rename(_op.nn.batch_flatten), + # scalar power + "square" : _mx_make_power(2), + "sqrt" : _mx_make_power(1/2), + "rsqrt" : _mx_make_power(-1/2), + "cbrt" : _mx_make_power(1/3), + "rcbrt" : _mx_make_power(-1/3), + "__pow_scalar__" : _binop_scalar(_op.power), + "_power_scalar" : _binop_scalar(_op.power), + "__rsub_scalar__" : _rbinop_scalar(_op.subtract), + "_rminus_scalar" : _rbinop_scalar(_op.subtract), + "__rdiv_scalar__" : _rbinop_scalar(_op.divide), + "_rdiv_scalar" : _rbinop_scalar(_op.divide), + "__rpow_scalar__" : _rbinop_scalar(_op.power), + # scalar op "__add_scalar__" : _binop_scalar(_op.add), "_plus_scalar" : _binop_scalar(_op.add), "__sub_scalar__" : _binop_scalar(_op.subtract), @@ -490,13 +548,10 @@ def _mx_l2_normalize(inputs, attrs): "_mul_scalar" : _binop_scalar(_op.multiply), "__div_scalar__" : _binop_scalar(_op.divide), "_div_scalar" : _binop_scalar(_op.divide), - "__pow_scalar__" : _binop_scalar(_op.power), - "_power_scalar" : _binop_scalar(_op.power), - "__rsub_scalar__" : _rbinop_scalar(_op.subtract), - "_rminus_scalar" : _rbinop_scalar(_op.subtract), - "__rdiv_scalar__" : _rbinop_scalar(_op.divide), - "_rdiv_scalar" : _rbinop_scalar(_op.divide), - "__rpow_scalar__" : _rbinop_scalar(_op.power), + "log2" : _mx_make_logarithm(2), + "log10" : _mx_make_logarithm(10), + "log1p" : _mx_log1p, + "expm1" : _mx_expm1, "_equal_scalar" : _mx_compare(_op.equal, _binop_scalar), "_not_equal_scalar" : _mx_compare(_op.not_equal, _binop_scalar), "_greater_scalar" : _mx_compare(_op.greater, _binop_scalar), @@ -506,6 +561,7 @@ def _mx_l2_normalize(inputs, attrs): "_maximum_scalar" : _binop_scalar(_op.maximum), "_minimum_scalar" : _binop_scalar(_op.minimum), # reduction ops + "mean" : 
_reduce(_op.mean), "max" : _reduce(_op.max), "min" : _reduce(_op.min), "sum" : _reduce(_op.sum),