From dc06e32c614d7af7bb19bd23db619bc1a29daef9 Mon Sep 17 00:00:00 2001
From: Zhi <5145158+zhiics@users.noreply.github.com>
Date: Mon, 11 Mar 2019 15:43:54 -0700
Subject: [PATCH] [NNVM] Add missed part of annotation (#10)

* add missed part of annotation

* fix check_computation and slice_like

* keep _build as before

* fix vta failure
---
 nnvm/python/nnvm/compiler/build_module.py     | 109 ++---
 nnvm/python/nnvm/top/nn.py                    | 135 +++---
 nnvm/python/nnvm/top/vision.py                |  10 +-
 nnvm/src/compiler/graph_compile.cc            |  38 +-
 nnvm/src/compiler/precompute_prune.cc         |  11 +-
 nnvm/src/pass/device_copy_op.cc               |  59 +--
 nnvm/src/pass/graph_annotate.cc               |  49 ++-
 nnvm/src/pass/graph_annotate.h                |  44 +-
 nnvm/src/pass/insert_copy_op.cc               |  13 +-
 nnvm/src/top/nn/nn.cc                         |  14 +-
 nnvm/src/top/tensor/transform.cc              |  17 +-
 .../python/compiler/test_compiler_cache.py    |   5 +-
 .../python/unittest/test_graph_annotation.py  | 407 +++++++++---------
 topi/python/topi/arm_cpu/conv2d.py            |   8 +
 topi/python/topi/cuda/conv2d_winograd.py      |   8 +
 topi/python/topi/x86/conv2d.py                |   5 +
 vta/python/vta/top/vta_conv2d.py              |  16 +-
 17 files changed, 513 insertions(+), 435 deletions(-)

diff --git a/nnvm/python/nnvm/compiler/build_module.py b/nnvm/python/nnvm/compiler/build_module.py
index 0a0cf4e67692..4083e3adcbf1 100644
--- a/nnvm/python/nnvm/compiler/build_module.py
+++ b/nnvm/python/nnvm/compiler/build_module.py
@@ -25,7 +25,7 @@
 
 class AnnotationType(IntEnum):
     """The purpose of annotation."""
-    TARGET = 1          # Only set target to the node attribute.
+    HOMO_TARGET = 1     # Only set the same target to the node attribute.
     DEVICE_TARGET = 2   # Annotate both device type and target info to a node.
     COPY_INSERTION = 3  # Annotate device type and target. Insert copy node.
 
@@ -44,6 +44,8 @@ class BuildConfig(object):
         "opt_level": 2,
         "add_pass": None,
         "ext_accel": None,
+        "fallback_device": None,
+        "op_name_device": None,
     }
     def __init__(self, **kwargs):
         self._old_scope = None
@@ -105,6 +107,13 @@ def build_config(**kwargs):
     ext_accel: str
         External accelerator for optimizing the operators it supports in the whole graph.
 
+    fallback_device : str or tvm.TVMContext
+        The fallback device. It is also used as the default device for
+        operators without specified device during heterogeneous execution.
+
+    op_name_device : dict of str to str or tvm.TVMContext.
+        A dictionary contains operator name to device context mapping.
+
     Returns
     -------
     config: BuildConfig
@@ -131,11 +140,11 @@ def _lower(sch, inputs, func_name, graph):
         f, (tvm.container.Array, tuple, list)) else [f]
 
 
-@tvm.register_func("nnvm.compiler.build_module")
-def _build(funcs, target_host):
+@tvm.register_func("nnvm.compiler.build_target")
+def _build(funcs, target, target_host):
     if target_host == "":
         target_host = None
-    return tvm.build(funcs, target_host=target_host)
+    return tvm.build(funcs, target=target, target_host=target_host)
 
 
 def _update_shape_dtype(shape, dtype, params):
@@ -203,8 +212,7 @@ def optimize(graph, shape, dtype="float32", layout=None, target=None):
 
 
 def build(graph, target=None, shape=None, dtype="float32",
-          params=None, target_host=None, layout=None, op_name_device=None,
-          fallback_device=None):
+          params=None, target_host=None, layout=None):
     """Build graph into runtime library.
 
     The build function will optimize the graph and do the compilation.
@@ -218,8 +226,9 @@ def build(graph, target=None, shape=None, dtype="float32",
     graph : Graph
         The graph to be used in lowering
 
-    target : str or :any:`tvm.target.Target`, optional
-        The build target
+    target : str, :any:`tvm.target.Target`, or a str to str dict, optional
+       The build target or a dictionay contains the device name to compilation
+       target.
 
     shape : dict of str to tuple, optional
         The input shape to the graph
@@ -244,12 +253,6 @@ def build(graph, target=None, shape=None, dtype="float32",
     layout : dict of str to str or str optional
         The input layout
 
-    op_name_device : dict of str to int.
-        A dictionary contains operator name to device mapping.
-
-    fallback_device : TVMContext.
-        The fallback device.
-
     Returns
     -------
     graph : Graph
@@ -313,20 +316,23 @@ def build(graph, target=None, shape=None, dtype="float32",
         if _all_var_init:
             init_var = initialize_variables(shape, dtype)
 
-        _annotate_graph(graph, device_target, op_name_device, fallback_device)
+        graph = _annotate_graph(graph, device_target,
+                                AnnotationType.DEVICE_TARGET)
         # Apply optimization
-        graph = optimize(graph, shape, dtype, layout, target)
+        graph = optimize(graph, shape, dtype, layout)
 
         # Clear extra params without nodes.
         _remove_noref_params(params, graph)
 
-        _annotate_graph(graph, device_target)
+        graph = _annotate_graph(graph, device_target,
+                                AnnotationType.HOMO_TARGET)
         # Precompute prune
         if params and cfg.pass_enabled("PrecomputePrune"):
             graph, params = precompute_prune(graph, params)
             shape, dtype = _update_shape_dtype(shape, dtype, params)
-        _annotate_graph(graph, device_target, op_name_device, fallback_device,
-                        insert_copy_node=True)
+        graph = _annotate_graph(graph, device_target,
+                                AnnotationType.COPY_INSERTION)
+
         # Operator Fusion and generation
         graph = graph_attr.set_shape_inputs(graph, shape)
         graph = graph.apply("InferShape")
@@ -352,14 +358,8 @@ def build(graph, target=None, shape=None, dtype="float32",
 
 def _annotate_graph(graph,
                     device_target,
-                    op_name_device=None,
-                    fallback_device=None,
-                    insert_copy_node=False):
-    """Helper function to anntoate the graph. Both the target and the device
-    info of a graph node will be annotated if `op_name_device` is set.
-    Otherwise, only the target info will be attached. `insert_copy_node`
-    indicates if we need to insert cross device data copy node. It is only for
-    heterogeneous execution purpose.
+                    annotation_type):
+    """Helper function to anntoate the graph according to the annotation type.
 
     Parameters
     ----------
@@ -370,37 +370,42 @@ def _annotate_graph(graph,
         A dictionary contain device type to compilation target pairs that will
         be used to build the graph.
 
-    op_name_device : dict of str to int.
-        A dictionary contains operator name to device mapping.
-
-    fallback_device : TVMContext.
-        The fallback device.
-
-    insert_copy_node : bool.
-        A bool value indicates wheter or not cross device data copy node is
-        required.
+    annotation_type : AnnotationType.
+        The annotation type. This is used to indicate if we annotate all nodes
+        to the same type (AnnotationType.HOMO_TARGET), attach different target
+        to different nodes (AnnotationType.DEVICE_TARGET), or attach target and
+        insert across device copy nodes (AnnotationType.COPY_INSERTION).
 
     Returns
     -------
     graph : Graph.
         The updated graph.
     """
-    annotation_type = AnnotationType.TARGET
-    if op_name_device:
-        annotation_type = AnnotationType.COPY_INSERTION if insert_copy_node \
-            else AnnotationType.DEVICE_TARGET
-        if not isinstance(op_name_device, dict):
-            raise ValueError("op_name_device must be a dictionary.")
-        fallback_device = fallback_device if fallback_device else tvm.cpu(0)
-        if not isinstance(fallback_device, TVMContext):
-            raise ValueError("fallback_device must be the type of TVMContext.")
-        op_name_device.update((name, tvm.context(dev).device_type)
-                              for name, dev in op_name_device.items())
-        graph._set_json_attr("fallback", fallback_device.device_type, "int")
-        graph._set_json_attr("op_name", list(op_name_device.keys()),
-                             "list_str")
-        graph._set_json_attr("op_device", list(op_name_device.values()),
-                             "list_int")
+    if not isinstance(annotation_type, AnnotationType):
+        raise ValueError("annotation_type must be the type of AnnotationType")
+
+    if annotation_type != AnnotationType.HOMO_TARGET:
+        # Heterogeneous execution.
+        if len(device_target) > 1 or 0 not in device_target:
+            op_name_device = BuildConfig.current.op_name_device
+            op_name_device = op_name_device if op_name_device else {}
+            if not isinstance(op_name_device, dict):
+                raise ValueError("op_name_device must be a dictionary of operator "
+                                 "name to device context.")
+            fallback_device = BuildConfig.current.fallback_device
+            fallback_device = fallback_device if fallback_device else tvm.cpu(0)
+            if not isinstance(fallback_device, TVMContext):
+                raise ValueError("fallback_device must be the type of TVMContext.")
+            op_name_device.update((name, tvm.context(dev).device_type)
+                                  for name, dev in op_name_device.items())
+            graph._set_json_attr("fallback", fallback_device.device_type, "int")
+            graph._set_json_attr("op_name", list(op_name_device.keys()),
+                                 "list_str")
+            graph._set_json_attr("op_device", list(op_name_device.values()),
+                                 "list_int")
+        else:
+            # Homogeneous execution.
+            annotation_type = AnnotationType.HOMO_TARGET
 
     graph._set_json_attr("annotation_type", int(annotation_type), "int")
     graph._set_json_attr("device_type", list(device_target.keys()), "list_int")
diff --git a/nnvm/python/nnvm/top/nn.py b/nnvm/python/nnvm/top/nn.py
index 1235dd34508e..55ab9d29e2bd 100644
--- a/nnvm/python/nnvm/top/nn.py
+++ b/nnvm/python/nnvm/top/nn.py
@@ -61,9 +61,10 @@ def schedule_log_softmax(_, outs, target):
 @reg.register_compute("dense")
 def compute_dense(attrs, inputs, _):
     """Compute definition of dense"""
-    if attrs.get_bool("use_bias"):
-        return topi.nn.dense(inputs[0], inputs[1], bias=inputs[2])
-    return topi.nn.dense(inputs[0], inputs[1])
+    with tvm.target.create(attrs.get_str("target")):
+        if attrs.get_bool("use_bias"):
+            return topi.nn.dense(inputs[0], inputs[1], bias=inputs[2])
+        return topi.nn.dense(inputs[0], inputs[1])
 
 @reg.register_schedule("dense")
 def schedule_dense(_, outs, target):
@@ -95,37 +96,42 @@ def compute_conv2d(attrs, inputs, _):
     if dilation_h < 1 or dilation_w < 1:
         raise ValueError("dilation should be positive value")
 
-    if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
-        # pylint: disable=assignment-from-no-return
-        out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
-                             dilation, layout, out_dtype=out_dtype)
-        # pylint: enable=assignment-from-no-return
-    elif groups == 1:
-        out = topi.nn.conv2d(
-            inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype=out_dtype)
-    elif layout == "NCHW" and \
-         groups == get_const_int(inputs[0].shape[1]) and \
-         groups == channels:
-        out = topi.nn.depthwise_conv2d_nchw(
-            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
-    elif layout in ["NCHW", "NCHW4c"]:
-        out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides, padding, dilation, groups,
-                                        out_dtype=out_dtype)
-    elif layout == "NHWC" and \
-         kernel_layout == "HWOI" and \
-         groups == get_const_int(inputs[0].shape[3]) and \
-         groups == channels:
-        out = topi.nn.depthwise_conv2d_nhwc(
-            inputs[0], inputs[1], strides, padding, dilation, out_dtype=out_dtype)
-    else:
-        raise ValueError("not support arbitrary group number for now")
+    with tvm.target.create(attrs.get_str("target")):
+        if groups == 1 and layout == 'NCHW4c' and inputs[0].dtype == 'int8':
+            # pylint: disable=assignment-from-no-return
+            out = topi.nn.conv2d(inputs[0], inputs[1], strides, padding,
+                                 dilation, layout, out_dtype=out_dtype)
+            # pylint: enable=assignment-from-no-return
+        elif groups == 1:
+            out = topi.nn.conv2d(
+                inputs[0], inputs[1], strides, padding, dilation, layout,
+                out_dtype=out_dtype)
+        elif layout == "NCHW" and \
+             groups == get_const_int(inputs[0].shape[1]) and \
+             groups == channels:
+            out = topi.nn.depthwise_conv2d_nchw(
+                inputs[0], inputs[1], strides, padding, dilation,
+                out_dtype=out_dtype)
+        elif layout in ["NCHW", "NCHW4c"]:
+            out = topi.nn.group_conv2d_nchw(inputs[0], inputs[1], strides,
+                                            padding, dilation, groups,
+                                            out_dtype=out_dtype)
+        elif layout == "NHWC" and \
+             kernel_layout == "HWOI" and \
+             groups == get_const_int(inputs[0].shape[3]) and \
+             groups == channels:
+            out = topi.nn.depthwise_conv2d_nhwc(
+                inputs[0], inputs[1], strides, padding, dilation,
+                out_dtype=out_dtype)
+        else:
+            raise ValueError("not support arbitrary group number for now")
 
-    if attrs.get_bool("use_bias"):
-        bias = inputs[2]
-        expand_axis = 1 if layout == "NCHW" else 0
-        bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
-        out = topi.add(out, bias)
-    return out
+        if attrs.get_bool("use_bias"):
+            bias = inputs[2]
+            expand_axis = 1 if layout == "NCHW" else 0
+            bias = topi.expand_dims(bias, axis=expand_axis, num_newaxis=2)
+            out = topi.add(out, bias)
+        return out
 
 @reg.register_schedule("conv2d")
 def schedule_conv2d(attrs, outs, target):
@@ -171,7 +177,8 @@ def _reshape(*args, **kwargs):
         return raw_reshape(*args, **kwargs)
     sym.reshape = _reshape
 
-    return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, sym)
+    with tvm.target.create(attrs.get_str("target")):
+        return topi.nn.conv2d_alter_layout(attrs, inputs, tinfos, sym)
 
 reg.register_pattern("conv2d", OpPattern.OUT_ELEMWISE_FUSABLE)
 
@@ -194,21 +201,24 @@ def compute_contrib_conv2d_NCHWc(attrs, inputs, _):
         _, in_channel_chunk, _, _, in_channel_block = get_const_tuple(inputs[0].shape)
         in_channel = in_channel_chunk * in_channel_block
     assert dilation == (1, 1), "not support dilate now"
-    if groups == 1:
-        # pylint: disable=assignment-from-no-return
-        out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding, dilation,
-                                   layout, out_layout, out_dtype)
-    elif groups == in_channel and groups == out_channel:
-        out = topi.nn.depthwise_conv2d_NCHWc(inputs[0], inputs[1], strides, padding,
-                                             dilation, layout, out_layout, out_dtype)
-        # pylint: enable=assignment-from-no-return
-    else:
-        raise ValueError("not support arbitrary group number > 1 for now")
-    if attrs.get_bool("use_bias"):
-        bias = inputs[2]
-        bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
-        out = topi.add(out, bias)
-    return out
+
+    with tvm.target.create(attrs.get_str("target")):
+        if groups == 1:
+            # pylint: disable=assignment-from-no-return
+            out = topi.nn.conv2d_NCHWc(inputs[0], inputs[1], strides, padding,
+                                       dilation, layout, out_layout, out_dtype)
+        elif groups == in_channel and groups == out_channel:
+            out = topi.nn.depthwise_conv2d_NCHWc(inputs[0], inputs[1], strides,
+                                                 padding, dilation, layout,
+                                                 out_layout, out_dtype)
+            # pylint: enable=assignment-from-no-return
+        else:
+            raise ValueError("not support arbitrary group number > 1 for now")
+        if attrs.get_bool("use_bias"):
+            bias = inputs[2]
+            bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
+            out = topi.add(out, bias)
+        return out
 
 @reg.register_schedule("_contrib_conv2d_NCHWc")
 def schedule_contrib_conv2d_NCHWc(attrs, outs, target):
@@ -228,7 +238,7 @@ def schedule_contrib_conv2d_NCHWc(attrs, outs, target):
 
 @reg.register_compute("_contrib_conv2d_winograd_weight_transform")
 def compute_contrib_conv2d_winograd_weight_transform(attrs, inputs, _):
-    with tvm.target.create(attrs.get_string("target")):
+    with tvm.target.create(attrs.get_str("target")):
         return topi.nn.conv2d_winograd_weight_transform(
             inputs[0], attrs.get_int('tile_size'))
 
@@ -254,16 +264,17 @@ def compute_contrib_conv2d_winograd_without_weight_transform(attrs, inputs, _):
     assert dilation == (1, 1), "Do not support dilate now"
     assert groups == 1, "Do not supoort arbitrary group number"
 
-    # pylint: disable=assignment-from-no-return
-    out = topi.nn.conv2d_winograd_without_weight_transform(
-        inputs[0], inputs[1], strides, padding, dilation, layout, out_dtype,
-        tile_size)
+    with tvm.target.create(attrs.get_str("target")):
+        # pylint: disable=assignment-from-no-return
+        out = topi.nn.conv2d_winograd_without_weight_transform(
+            inputs[0], inputs[1], strides, padding, dilation, layout,
+            out_dtype, tile_size)
 
-    if attrs.get_bool("use_bias"):
-        bias = inputs[2]
-        bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
-        out = topi.add(out, bias)
-    return out
+        if attrs.get_bool("use_bias"):
+            bias = inputs[2]
+            bias = topi.expand_dims(bias, axis=1, num_newaxis=2)
+            out = topi.add(out, bias)
+        return out
 
 @reg.register_schedule("_contrib_conv2d_winograd_without_weight_transform")
 def schedule_contrib_conv2d_winograd_without_weight_transform(attrs, outs, target):
@@ -290,7 +301,7 @@ def compute_conv2d_transpose(attrs, inputs, _):
     assert dilation == (1, 1), "not support dilate now"
     assert groups == 1, "only support groups == 1 for now"
 
-    with tvm.target.create(attrs.get_string("target")):
+    with tvm.target.create(attrs.get_str("target")):
         out = topi.nn.conv2d_transpose_nchw(inputs[0], inputs[1], strides,
                                             padding, out_dtype)
         if attrs.get_bool("use_bias"):
@@ -388,7 +399,7 @@ def compute_lrn(attrs, inputs, _):
     alpha = attrs.get_float("alpha")
     beta = attrs.get_float("beta")
     bias = attrs.get_float("bias")
-    with tvm.target.create(attrs.get_string("target")):
+    with tvm.target.create(attrs.get_str("target")):
         return topi.nn.lrn(inputs[0], size, axis, alpha, beta, bias)
 
 @reg.register_schedule("lrn")
@@ -404,7 +415,7 @@ def compute_l2_normalize(attrs, inputs, _):
     """Compute definition of l2 normalize"""
     eps = attrs.get_float("eps")
     axis = attrs.get_int_tuple("axis")
-    with tvm.target.create(attrs.get_string("target")):
+    with tvm.target.create(attrs.get_str("target")):
         return topi.nn.l2_normalize(inputs[0], eps, axis)
 
 @reg.register_schedule("l2_normalize")
diff --git a/nnvm/python/nnvm/top/vision.py b/nnvm/python/nnvm/top/vision.py
index ee415c485536..42cb32214abf 100644
--- a/nnvm/python/nnvm/top/vision.py
+++ b/nnvm/python/nnvm/top/vision.py
@@ -10,7 +10,7 @@
 @reg.register_compute("yolo_reorg")
 def compute_reorg(attrs, inputs, _):
     """Compute definition of reorg"""
-    with tvm.target.create(attrs.get_string("target")):
+    with tvm.target.create(attrs.get_str("target")):
         return topi.vision.reorg(inputs[0], attrs.get_int("stride"))
 
 @reg.register_schedule("yolo_reorg")
@@ -29,7 +29,7 @@ def compute_region(attrs, inputs, _):
     coords = attrs.get_int("coords")
     background = attrs.get_int("background")
     softmax = attrs.get_int("softmax")
-    with tvm.target.create(attrs.get_string("target")):
+    with tvm.target.create(attrs.get_str("target")):
         return topi.vision.yolo.region(inputs[0], n, classes, coords,
                                        background, softmax)
 
@@ -58,7 +58,7 @@ def compute_multibox_prior(attrs, inputs, _):
     offsets = attrs.get_float_tuple('offsets')
     clip = attrs.get_bool('clip')
 
-    with tvm.target.create(attrs.get_string("target")):
+    with tvm.target.create(attrs.get_str("target")):
         return topi.vision.ssd.multibox_prior(inputs[0], sizes, ratios,
                                               steps, offsets, clip)
 
@@ -78,7 +78,7 @@ def compute_multibox_transform_loc(attrs, inputs, _):
     threshold = attrs.get_float('threshold')
     variance = attrs.get_float_tuple('variances')
 
-    with tvm.target.create(attrs.get_string("target")):
+    with tvm.target.create(attrs.get_str("target")):
         return topi.vision.ssd.multibox_transform_loc(
             inputs[0], inputs[1], inputs[2], clip, threshold, variance)
 
@@ -99,7 +99,7 @@ def compute_nms(attrs, inputs, _):
     force_suppress = attrs.get_bool('force_suppress')
     nms_topk = attrs.get_int('nms_topk')
 
-    with tvm.target.create(attrs.get_string("target")):
+    with tvm.target.create(attrs.get_str("target")):
         return topi.vision.nms(inputs[0], inputs[1], nms_threshold,
                                force_suppress, nms_topk)
 
diff --git a/nnvm/src/compiler/graph_compile.cc b/nnvm/src/compiler/graph_compile.cc
index c384f9799bde..742de0386abc 100644
--- a/nnvm/src/compiler/graph_compile.cc
+++ b/nnvm/src/compiler/graph_compile.cc
@@ -287,28 +287,28 @@ nnvm::Graph GraphCompile(const nnvm::Graph& g) {
   ret.attrs["dtype"] = std::make_shared<any>(std::move(new_dtype_vec));
   ret.attrs["dltype"] = std::make_shared<any>(std::move(new_dltype_vec));
 
-  tvm::runtime::Module module;
-  if (tar_func_map.size() >= 1) {
-    // Setup device assignment for heterogeneous execution.
-    if (tar_func_map.size() > 1) {
-      DeviceVector device_vec(new_idx.num_node_entries(), 0);
-      for (size_t i = 0; i < new_idx.num_nodes(); i++) {
-        device_vec[new_idx.entry_id(i, 0)] = new_idx[i].source->attrs.device_type;
-      }
-      for (uint32_t nid = 0; nid < new_idx.num_nodes(); nid++) {
-        const auto& inode = new_idx[nid];
-        for (const auto& e : inode.inputs) {
-          device_vec[new_idx.entry_id(e)] =
-              new_idx[e.node_id].source->attrs.device_type;
-        }
+  // Setup device assignment for heterogeneous execution.
+  if (tar_func_map.size() > 1) {
+    DeviceVector device_vec(new_idx.num_node_entries(), 0);
+    for (size_t i = 0; i < new_idx.num_nodes(); i++) {
+      device_vec[new_idx.entry_id(i, 0)] = new_idx[i].source->attrs.device_type;
+    }
+    for (uint32_t nid = 0; nid < new_idx.num_nodes(); nid++) {
+      const auto& inode = new_idx[nid];
+      for (const auto& e : inode.inputs) {
+        device_vec[new_idx.entry_id(e)] =
+            new_idx[e.node_id].source->attrs.device_type;
       }
-      ret.attrs["device_index"] = std::make_shared<any>(std::move(device_vec));
     }
-    // Setup module.
-    static const PackedFunc& fbuild = GetPackedFunc("nnvm.compiler.build_module");
-    module = fbuild(tvm::Map<std::string, Array<tvm::LoweredFunc>>(
-          tar_func_map.begin(), tar_func_map.end()), target_host);
+    ret.attrs["device_index"] = std::make_shared<any>(std::move(device_vec));
   }
+  // Setup module.
+  static const PackedFunc& fbuild = GetPackedFunc("nnvm.compiler.build_target");
+  tvm::runtime::Module module =
+      fbuild(tvm::Map<std::string, Array<tvm::LoweredFunc>>(
+                 tar_func_map.begin(), tar_func_map.end()),
+             "", target_host);
+
   ret.attrs["module"] = std::make_shared<any>(std::move(module));
   ret = nnvm::ApplyPass(ret, "PlanMemory");
   ret = DecorateMemoryPlan(ret, assign_flag);
diff --git a/nnvm/src/compiler/precompute_prune.cc b/nnvm/src/compiler/precompute_prune.cc
index cb07c0193a86..c0c0c4b4c0ec 100644
--- a/nnvm/src/compiler/precompute_prune.cc
+++ b/nnvm/src/compiler/precompute_prune.cc
@@ -26,6 +26,7 @@ nnvm::Graph PrecomputePrune(nnvm::Graph src) {
   std::unordered_set<std::string> unique_name;
   // number of edges that are not variable
   int non_var_edge = 0;
+  std::unordered_map<Node*, std::vector<NodeEntry> > version_hist;
 
   auto replace_pruned_entry = [&] (const NodeEntry& e) {
     if (!entry_var.count(e)) {
@@ -34,8 +35,9 @@ nnvm::Graph PrecomputePrune(nnvm::Graph src) {
       }
       nnvm::NodePtr var = nnvm::Node::Create();
       var->attrs.name = e.node->attrs.name;
-      if (e.version) {
-          var->attrs.name += "_" + std::to_string(e.version);
+      if (e.version && version_hist.count(e.node.get()) == 0) {
+        var->attrs.name += "_" + std::to_string(e.version);
+        version_hist[e.node.get()] = std::vector<NodeEntry>{};
       }
       if (e.node->num_outputs() != 1) {
         var->attrs.name += "_output" + std::to_string(e.index);
@@ -75,6 +77,11 @@ nnvm::Graph PrecomputePrune(nnvm::Graph src) {
     }
   });
 
+  // nothing being pruned.
+  if (non_var_edge == 0 && version_hist.size() == 0) {
+    return src;
+  }
+
   for (auto& e : src.outputs) {
     if (pruned.count(e.node.get())) {
       e = replace_pruned_entry(e);
diff --git a/nnvm/src/pass/device_copy_op.cc b/nnvm/src/pass/device_copy_op.cc
index aafd2c2c069f..d0ad107f31f6 100644
--- a/nnvm/src/pass/device_copy_op.cc
+++ b/nnvm/src/pass/device_copy_op.cc
@@ -3,18 +3,8 @@
  * \file device_copy_op.h
  * \brief Register an operator to perform data copy across different devices.
  */
-#include <nnvm/compiler/op_attr_types.h>
-#include <nnvm/graph_attr_types.h>
 #include <nnvm/op.h>
 #include <nnvm/op_attr_types.h>
-#include <nnvm/pass_functions.h>
-#include <nnvm/symbolic.h>
-#include <tvm/tensor.h>
-
-#include <memory>
-#include <set>
-#include <string>
-#include <vector>
 
 #include "../top/elemwise_op_common.h"
 #include "../top/op_common.h"
@@ -22,42 +12,21 @@
 namespace nnvm {
 namespace op {
 
-inline bool DeviceCopyOpInferShape(const nnvm::NodeAttrs& attrs,
-                                   std::vector<TShape>* in_shapes,
-                                   std::vector<TShape>* out_shapes) {
-  CHECK_EQ(in_shapes->size(), 1U)
-      << "Cross device copy op can only have one input.";
-  CHECK_EQ(out_shapes->size(), 1U)
-      << "Cross device copy op can only have one output.";
-
-  if (out_shapes->at(0).ndim() != 0) return true;
-  SHAPE_ASSIGN(out_shapes->at(0), in_shapes->at(0));
-  return true;
-}
-
-inline bool DeviceCopyOpInferType(const nnvm::NodeAttrs& attrs,
-                                  std::vector<int>* in_types,
-                                  std::vector<int>* out_types) {
-  CHECK_EQ(in_types->size(), 1U)
-      << "Cross device copy op can only have one input.";
-  CHECK_EQ(out_types->size(), 1U)
-      << "Cross device copy op can only have one output.";
-
-  out_types->back() = in_types->at(0);
-  return true;
-}
-
 NNVM_REGISTER_OP(device_copy_op)
-    .describe(
-        R"code(Copy data from one tensor to antoher.
-               The source and destination might be \
-               one different devices.)code" NNVM_ADD_FILELINE)
-    .set_num_inputs(1)
-    .set_num_outputs(1)
-    .set_attr<nnvm::FInferShape>("FInferShape", DeviceCopyOpInferShape)
-    .set_attr<nnvm::FInferType>("FInferType", DeviceCopyOpInferType)
-    .set_attr<nnvm::FCorrectLayout>(
-        "FCorrectLayout", nnvm::top::ElemwiseFixedLayoutCopyToOut<1, 1>);
+.describe(R"code(
+Copy data from one tensor to another. The source and destination might be
+on different devices.
+)code" NNVM_ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::FInferShape>("FInferShape", nnvm::top::ElemwiseShape<1, 1>)
+.set_attr<nnvm::FInferType>("FInferType", nnvm::top::ElemwiseType<1, 1>)
+.set_attr<FInplaceOption>("FInplaceOption",
+  [](const NodeAttrs& attrs){
+    return std::vector<std::pair<int, int> >{{0, 0}};
+  })
+.set_attr<nnvm::FCorrectLayout>(
+  "FCorrectLayout", nnvm::top::ElemwiseArbitraryLayout<1, 1>);
 
 }  // namespace op
 }  // namespace nnvm
diff --git a/nnvm/src/pass/graph_annotate.cc b/nnvm/src/pass/graph_annotate.cc
index c26cac1875a7..2698f3deb744 100644
--- a/nnvm/src/pass/graph_annotate.cc
+++ b/nnvm/src/pass/graph_annotate.cc
@@ -9,6 +9,7 @@
 #include <nnvm/graph.h>
 #include <nnvm/pass.h>
 #include <nnvm/pass_functions.h>
+#include <dlpack/dlpack.h>
 
 #include <algorithm>
 #include <queue>
@@ -21,7 +22,7 @@ using StringVector = std::vector<std::string>;
 using IntVector = std::vector<int>;
 
 enum class AnnotationType : int {
-  kTarget = 1,        // Only set target to the node attribute.
+  kHomoTarget = 1,        // Only set target to the node attribute.
   kDeivceTarget = 2,  // Annotate both device type and target info to a node.
   kCopyInsertion = 3  // Annotate device type and target. Insert copy node.
 };
@@ -34,8 +35,17 @@ enum class AnnotationType : int {
 nnvm::Graph AnnotateTarget(nnvm::Graph&& g, const StringVector& targets,
                            const IntVector& device_types) {
   DFSVisit(g.outputs, [&](const nnvm::NodePtr& node) {
-    node->attrs.device_type = device_types[0];
-    node->attrs.dict["target"] = targets[0];
+    if (device_types.size() == 1) {
+        node->attrs.device_type = device_types[0];
+        node->attrs.dict["target"] = targets[0];
+    } else {
+        const auto& it = std::find(device_types.begin(), device_types.end(),
+                                   static_cast<int>(kDLCPU));
+        CHECK(it != device_types.end()) << "No cpu target is found";
+        node->attrs.device_type = static_cast<int>(kDLCPU);;
+        node->attrs.dict["target"] =
+            targets[std::distance(device_types.begin(), it)];
+    }
   });
   return g;
 }
@@ -76,6 +86,14 @@ nnvm::Graph AnnotateGraph(nnvm::Graph g) {
   AnnotationType annotation_type =
       static_cast<AnnotationType>(g.MoveCopyAttr<int>("annotation_type"));
 
+  const auto& targets = g.GetAttr<StringVector>("target");
+  const auto& device_types = g.GetAttr<IntVector>("device_type");
+
+  if (annotation_type == AnnotationType::kHomoTarget) {
+    g = AnnotateTarget(std::move(g), targets, device_types);
+    return g;
+  }
+
   const StringVector& op_names = g.HasAttr("op_name")
                                      ? g.MoveCopyAttr<StringVector>("op_name")
                                      : StringVector();
@@ -84,26 +102,21 @@ nnvm::Graph AnnotateGraph(nnvm::Graph g) {
                                     : IntVector();
   CHECK_EQ(op_names.size(), op_devices.size())
       << "The number of op names doesn't match the number of assigned device.";
-  const auto& targets = g.GetAttr<StringVector>("target");
-  const auto& device_types = g.GetAttr<IntVector>("device_type");
 
   int fallback_device = 0;
   nnvm::ManualAnnotatorPtr annotate = nullptr;
-  if (!op_names.empty()) {
-    CHECK(g.HasAttr("fallback"))
-        << "The fallback device is not attached to the graph.";
-    fallback_device = g.MoveCopyAttr<int>("fallback");
-    std::unordered_map<std::string, int> op_name_dev_map;
-    for (size_t i = 0; i < op_names.size(); i++) {
-      op_name_dev_map.emplace(std::make_pair(op_names[i], op_devices[i]));
-    }
-    annotate = std::make_shared<nnvm::ManualAnnotator>(op_name_dev_map,
-                                                       fallback_device);
+  CHECK(g.HasAttr("fallback"))
+      << "The fallback device is not attached to the graph.";
+  fallback_device = g.MoveCopyAttr<int>("fallback");
+
+  std::unordered_map<std::string, int> op_name_dev_map;
+  for (size_t i = 0; i < op_names.size(); i++) {
+    op_name_dev_map.emplace(std::make_pair(op_names[i], op_devices[i]));
   }
+  annotate = std::make_shared<nnvm::ManualAnnotator>(op_name_dev_map,
+                                                     fallback_device);
 
-  if (annotation_type == AnnotationType::kTarget) {
-    g = AnnotateTarget(std::move(g), targets, device_types);
-  } else if (annotation_type == AnnotationType::kDeivceTarget) {
+  if (annotation_type == AnnotationType::kDeivceTarget) {
     g = AnnotateDeviceTarget(std::move(g), targets, device_types, annotate);
   } else if (annotation_type == AnnotationType::kCopyInsertion) {
     g = AnnotateDeviceTarget(std::move(g), targets, device_types, annotate);
diff --git a/nnvm/src/pass/graph_annotate.h b/nnvm/src/pass/graph_annotate.h
index eba958eff915..58e464355e8a 100644
--- a/nnvm/src/pass/graph_annotate.h
+++ b/nnvm/src/pass/graph_annotate.h
@@ -1,4 +1,4 @@
-/*!	
+/*!
  * Copyright (c) 2018 by Contributors
  * \file graph_annotate.h
  * \brief Define rules to annotate a graph.
@@ -14,20 +14,20 @@
 namespace nnvm {
 
 class ManualAnnotator;
- /*
-  * This class is an abstract class that can be derived by other classes to
-  * implement how a node should be selected.
-  */
+/*
+ * This class is an abstract class that can be derived by other classes to
+ * implement how a node should be selected.
+ */
 class GraphAnnotator {
  public:
   explicit GraphAnnotator(int fallback_device)
-       : fallback_device_(fallback_device) {}
+      : fallback_device_(fallback_device) {}
   virtual ~GraphAnnotator() = default;
   // A virtual function that is implemented by different annotation methods.
   virtual int AnnotateNode(const nnvm::Node* n) const = 0;
 
   int GetFallbackDevice() const {
-     return fallback_device_;
+    return fallback_device_;
   }
 
  private:
@@ -36,28 +36,28 @@ class GraphAnnotator {
   int fallback_device_;
 };
 
- /*
-  * This class defines a manual way to annotate a graph node. In this method,
-  * users are expected to provide the node name and also the device type that it
-  * should be assigned to. However, if the operator contained in the graph node
-  * is registered with a fallback property or the operator name has not been
-  * saved, this node will be annotated with the fallback device.
-  */
+/*
+ * This class defines a manual way to annotate a graph node. In this method,
+ * users are expected to provide the node name and also the device type that it
+ * should be assigned to. However, if the operator contained in the graph node
+ * is registered with a fallback property or the operator name has not been
+ * saved, this node will be annotated with the fallback device.
+ */
 class ManualAnnotator : public GraphAnnotator {
   using OpNameDeviceMap = std::unordered_map<std::string, int>;
  public:
   explicit ManualAnnotator(const OpNameDeviceMap& op_name_dev_map,
-                            int fallback_device)
-       : GraphAnnotator(fallback_device),
-         op_name_dev_map_(new OpNameDeviceMap(op_name_dev_map)) {}
+                           int fallback_device)
+      : GraphAnnotator(fallback_device),
+        op_name_dev_map_(new OpNameDeviceMap(op_name_dev_map)) {}
 
   int AnnotateNode(const nnvm::Node* n) const final {
-     if (n->is_variable()) return 0;
-     if (n->op()->fallback) return fallback_device_;
+    if (n->is_variable()) return 0;
+    if (n->op()->fallback) return fallback_device_;
 
-     return op_name_dev_map_->count(n->op()->name)
-                ? op_name_dev_map_->at(n->op()->name)
-                : fallback_device_;
+    return op_name_dev_map_->count(n->op()->name)
+               ? op_name_dev_map_->at(n->op()->name)
+               : fallback_device_;
   }
 
  private:
diff --git a/nnvm/src/pass/insert_copy_op.cc b/nnvm/src/pass/insert_copy_op.cc
index 225d95bbf102..e54b37c5290a 100644
--- a/nnvm/src/pass/insert_copy_op.cc
+++ b/nnvm/src/pass/insert_copy_op.cc
@@ -1,6 +1,6 @@
 /*!
  * Copyright (c) 2018 by Contributors
- * \file place_copy_op.cc
+ * \file insert_copy_op.cc
  * \brief Place corss device data copy nodes on entries where two nodes are
  * assigned to different devices.
  */
@@ -17,7 +17,6 @@
 #include <vector>
 
 namespace nnvm {
-namespace pass {
 
 nnvm::Graph InsertDataCopy(nnvm::Graph g) {
   const nnvm::Op* copy_op = nnvm::Op::Get("device_copy_op");
@@ -59,11 +58,9 @@ nnvm::Graph InsertDataCopy(nnvm::Graph g) {
 }
 
 NNVM_REGISTER_PASS(InsertDataCopy)
-    .describe(
-        "Insert cross device data copy nodes to transfer data between "
-        "opertors that are executed on different devices.")
-    .set_body(InsertDataCopy)
-    .set_change_graph(true);
+.describe("Insert cross device data copy nodes to transfer data between "
+"opertors that are executed on different devices.")
+.set_body(InsertDataCopy)
+.set_change_graph(true);
 
-}  // namespace pass
 }  // namespace nnvm
diff --git a/nnvm/src/top/nn/nn.cc b/nnvm/src/top/nn/nn.cc
index eba8ec5ba2d9..e9a556281ff0 100644
--- a/nnvm/src/top/nn/nn.cc
+++ b/nnvm/src/top/nn/nn.cc
@@ -42,7 +42,9 @@ inline bool DenseInferShape(const nnvm::NodeAttrs& attrs,
   }
   CHECK_EQ(out_shape->size(), 1U);
   const TShape& dshape = (*in_shape)[DenseParam::kData];
-  CHECK(!shape_is_none(dshape));
+  if (shape_is_none(dshape)) {
+    return false;
+  }
   if ((*in_shape)[DenseParam::kData].ndim() != 0) {
     NNVM_ASSIGN_OUTPUT_SHAPE(attrs, *out_shape, 0, TShape({dshape[0], param.units}));
   }
@@ -52,6 +54,16 @@ inline bool DenseInferShape(const nnvm::NodeAttrs& attrs,
   if (param.use_bias) {
     NNVM_ASSIGN_INPUT_SHAPE(attrs, *in_shape, DenseParam::kBias, TShape({param.units}));
   }
+  for (const auto& shape : *in_shape) {
+    if (shape_is_none(shape)) {
+      return false;
+    }
+  }
+  for (const auto& shape : *out_shape) {
+    if (shape_is_none(shape)) {
+      return false;
+    }
+  }
   return true;
 }
 
diff --git a/nnvm/src/top/tensor/transform.cc b/nnvm/src/top/tensor/transform.cc
index 31a2c5057ea2..d12fa0a06063 100644
--- a/nnvm/src/top/tensor/transform.cc
+++ b/nnvm/src/top/tensor/transform.cc
@@ -1180,7 +1180,6 @@ inline bool SliceLikeShape(const nnvm::NodeAttrs& attrs,
   CHECK_EQ(in_attrs->size(), 2U);
   CHECK_EQ(out_attrs->size(), 1U);
   const SliceLikeParam& param = nnvm::get<SliceLikeParam>(attrs.parsed);
-  CHECK_EQ(param.axis.ndim(), param.offset.ndim());
   const TShape& src_shape = in_attrs->at(0);
   const TShape& target_shape = in_attrs->at(1);
   Tuple<dim_t> end_idx;
@@ -1195,18 +1194,22 @@ inline bool SliceLikeShape(const nnvm::NodeAttrs& attrs,
       }
     }
   } else {
+    const bool has_offset = param.offset.ndim() != 0;
+    if (has_offset) {
+      CHECK_EQ(param.axis.ndim(), param.offset.ndim());
+    }
     for (uint32_t i = 0; i < param.axis.ndim(); ++i) {
       int axis = param.axis[i];
       if (axis < 0) {
-        axis = src_shape.ndim() + i;
+        axis += src_shape.ndim();
       }
       CHECK_LT(axis, target_shape.ndim())
         << "Axis " << axis << " exceeds dimension "
         << target_shape.ndim()<< " of target_shape.";
       end_idx[axis] = target_shape[axis];
-      CHECK_LE(end_idx[axis] + param.offset[i], src_shape[axis])
+      CHECK_LE(end_idx[axis] + (has_offset? param.offset[i] : 0), src_shape[axis])
         << "End index of axis " << axis << " + offset" << " exceeds input shape: "
-        << end_idx[axis] << " + " << param.offset[i] << " vs " << src_shape[axis];
+        << end_idx[axis] << " + " << (has_offset? param.offset[i] : 0) << " vs " << src_shape[axis];
     }
   }
   TShape out_shape = TShape(std::move(end_idx));
@@ -1261,13 +1264,15 @@ NNVM_REGISTER_OP(slice_like)
         }
       }
     } else {
+      const bool has_offset = param.offset.ndim() != 0;
       for (uint32_t i = 0; i < param.axis.ndim(); ++i) {
         int axis = param.axis[i];
         if (axis < 0) {
           axis = static_cast<int>(src_shape.size()) + axis;
         }
-        begin_idx.Set(static_cast<size_t>(axis), param.offset[i]);
-        end_idx.Set(static_cast<size_t>(axis), target_shape[axis] + param.offset[i]);
+        begin_idx.Set(static_cast<size_t>(axis), has_offset? param.offset[i] : 0);
+        end_idx.Set(static_cast<size_t>(axis),
+            target_shape[axis] + (has_offset? param.offset[i] : 0));
         CHECK_LE(topi::GetConstInt(end_idx[axis]),
                  topi::GetConstInt(src_shape[axis]))
           << "End index of axis " << axis << " exceeds input shape: "
diff --git a/nnvm/tests/python/compiler/test_compiler_cache.py b/nnvm/tests/python/compiler/test_compiler_cache.py
index dff5d76cfbaa..d50100741533 100644
--- a/nnvm/tests/python/compiler/test_compiler_cache.py
+++ b/nnvm/tests/python/compiler/test_compiler_cache.py
@@ -3,6 +3,8 @@
 from tvm.contrib import graph_runtime
 import nnvm.symbol as sym
 import nnvm.compiler
+import nnvm.compiler.build_module as build_module
+from nnvm.compiler.build_module import AnnotationType 
 
 def test_compile_cache():
     x = sym.Variable("x")
@@ -26,7 +28,8 @@ def verify(graph, lib):
     graph, lib, _ = nnvm.compiler.build(z, "llvm", shape_dict)
     inputs = [tvm.placeholder((10,)), tvm.placeholder((10,))]
     new_graph = nnvm.graph.create(z)
-    new_graph = nnvm.compiler.build_module._annotate_graph(new_graph, {1: "llvm"})
+    new_graph = build_module._annotate_graph(new_graph, {1: "llvm"},
+                                             AnnotationType.HOMO_TARGET)
 
     gkey = nnvm.compiler.graph_key(new_graph, inputs, "llvm")
     gkey2 = nnvm.compiler.graph_key(new_graph, inputs + inputs, "llvm")
diff --git a/nnvm/tests/python/unittest/test_graph_annotation.py b/nnvm/tests/python/unittest/test_graph_annotation.py
index 48d9c04057f9..a96c36d1c547 100644
--- a/nnvm/tests/python/unittest/test_graph_annotation.py
+++ b/nnvm/tests/python/unittest/test_graph_annotation.py
@@ -1,5 +1,4 @@
 """Unit tests for graph annotation."""
-
 import time
 import zipfile
 import os
@@ -35,14 +34,14 @@ def execute_original_graph(sym, target, shape, dtype, params):
 
 def check_annotated_graph(sym, target, op_name_device, expected_num_nodes,
                           fallback_device, data_shape, params):
-    deploy_graph, _, params = nnvm.compiler.build(
-        sym,
-        target=target,
-        shape=data_shape,
-        dtype="float32",
-        params=params,
-        op_name_device=op_name_device,
-        fallback_device=fallback_device)
+    with nnvm.compiler.build_config(fallback_device=fallback_device,
+                                    op_name_device=op_name_device):
+        deploy_graph, _, params = nnvm.compiler.build(
+            sym,
+            target=target,
+            shape=data_shape,
+            dtype="float32",
+            params=params)
 
     new_sym = deploy_graph.symbol()
     assert len(new_sym.list_input_names()) == len(sym.list_input_names())
@@ -50,7 +49,7 @@ def check_annotated_graph(sym, target, op_name_device, expected_num_nodes,
     assert deploy_graph.index.num_nodes == expected_num_nodes
 
 
-def test_conv_network(device, target):
+def test_conv_network():
     R""" The network is as following:
             data1       data2
               |           |
@@ -60,42 +59,47 @@ def test_conv_network(device, target):
                     |
                   conv2d
     """
-    if not tvm.module.enabled(device):
-        print("Skip test because %s is not enabled." % device)
-        return
-
-    out_channels = 16
-    data1 = symbol.Variable(name="data1")
-    data2 = symbol.Variable(name="data2")
-    simple_net1 = symbol.conv2d(data=data1, kernel_size=(3, 3),
-                                channels=out_channels, padding=(1, 1),
-                                use_bias=True)
-
-    simple_net2 = symbol.conv2d(data=data2, kernel_size=(3, 3),
-                                channels=out_channels, padding=(1, 1),
-                                use_bias=True)
-    ret = symbol.elemwise_add(simple_net1, simple_net2)
-    ret = symbol.conv2d(ret, kernel_size=(3, 3),
-                        channels=out_channels, padding=(1, 1),
-                        use_bias=True)
-
-    batch_size = 1
-    data_shape = (batch_size, 3, 224, 224)
-    shape_dict = {"data1": data_shape, "data2": data_shape}
-    params = {}
-    params["data1"] = np.random.uniform(-1, 1,
-                                        size=data_shape).astype("float32")
-    params["data2"] = np.random.uniform(-1, 1,
-                                        size=data_shape).astype("float32")
-    op_name_device = {"elemwise_add": "cpu", "conv2d": device}
-    fallback_device = tvm.context("cpu")
-    target = {"cpu": "llvm", device: target}
-    # No op will be fused. 3 additional device copy nodes are required.
-    check_annotated_graph(ret, target, op_name_device, 15, fallback_device,
-                          shape_dict, params)
-
-
-def test_fusible_network(device, target):
+    def compile_run_graph(device, target):
+        if not tvm.module.enabled(device):
+            print("Skip test because %s is not enabled." % device)
+            return
+
+        out_channels = 16
+        data1 = symbol.Variable(name="data1")
+        data2 = symbol.Variable(name="data2")
+        simple_net1 = symbol.conv2d(data=data1, kernel_size=(3, 3),
+                                    channels=out_channels, padding=(1, 1),
+                                    use_bias=True)
+
+        simple_net2 = symbol.conv2d(data=data2, kernel_size=(3, 3),
+                                    channels=out_channels, padding=(1, 1),
+                                    use_bias=True)
+        ret = symbol.elemwise_add(simple_net1, simple_net2)
+        ret = symbol.conv2d(ret, kernel_size=(3, 3),
+                            channels=out_channels, padding=(1, 1),
+                            use_bias=True)
+
+        batch_size = 1
+        data_shape = (batch_size, 3, 224, 224)
+        shape_dict = {"data1": data_shape, "data2": data_shape}
+        params = {}
+        params["data1"] = np.random.uniform(-1, 1,
+                                            size=data_shape).astype("float32")
+        params["data2"] = np.random.uniform(-1, 1,
+                                            size=data_shape).astype("float32")
+        op_name_device = {"elemwise_add": "cpu", "conv2d": device}
+        fallback_device = tvm.context("cpu")
+        target = {"cpu": "llvm", device: target}
+        # No op will be fused. 3 additional device copy nodes are required.
+        check_annotated_graph(ret, target, op_name_device, 15, fallback_device,
+                              shape_dict, params)
+
+    for dev, tar in [("opencl", "opencl"), ("cuda", "cuda"),
+                     ("opencl", str(tvm.target.intel_graphics()))]:
+        compile_run_graph(dev, tar)
+
+
+def test_fusible_network():
     R""" The network is as following:
                 data
                   |
@@ -107,99 +111,110 @@ def test_fusible_network(device, target):
                   |
                 tanh
     """
-    if not tvm.module.enabled(device):
-        print("Skip test because %s is not enabled." % device)
-        return
-
-    batch_size = 1
-    data_shape = (batch_size, 3, 224, 224)
-    data = symbol.Variable('data', shape=data_shape, dtype="float32")
-    shape_dict = {"data": data_shape}
-    params = {}
-    params["data"] = np.random.uniform(-1, 1,
-                                       size=data_shape).astype("float32")
-
-    exp = symbol.exp(data, name='exp')
-    sqrt = symbol.sqrt(exp, name='sqrt')
-    log = symbol.log(exp, name='log')
-    ret = sqrt + log
-    ret = symbol.tanh(ret)
-
-    fallback_device = tvm.context("cpu")
-    target = {"cpu": "llvm", device: target}
-
-    # Fuse log and broadcast_add.
-    op_name_device = {
-        "exp": "cpu",
-        "log": "cpu",
-        "broadcast_add": "cpu",
-        "sqrt": device,
-        "elemwise_add": device,
-        "tanh": device
-    }
-    check_annotated_graph(ret, target, op_name_device, 8, fallback_device,
-                          shape_dict, params)
-
-    # Fuse log, broadcast_add, and tanh
-    op_name_device = {
-        "exp": "cpu",
-        "log": device,
-        "broadcast_add": device,
-        "sqrt": "cpu",
-        "elemwise_add": "cpu",
-        "tanh": device
-    }
-    check_annotated_graph(ret, target, op_name_device, 6, fallback_device,
-                          shape_dict, params)
-
-    # No operator will be fused.
-    op_name_device = {
-        "exp": device,
-        "log": "cpu",
-        "broadcast_add": device,
-        "sqrt": "cpu",
-        "elemwise_add": device,
-        "tanh": "cpu"
-    }
-    check_annotated_graph(ret, target, op_name_device, 11, fallback_device,
-                          shape_dict, params)
-
-    # All operators will be fused.
-    op_name_device = {
-        "exp": device,
-        "log": device,
-        "broadcast_add": device,
-        "sqrt": device,
-        "elemwise_add": device,
-        "tanh": device
-    }
-    check_annotated_graph(ret, target, op_name_device, 2, fallback_device,
-                          shape_dict, params)
-
-    # All operators will be fused since all of them are annotated to the
-    # same device.
-    op_name_device = {
-        "exp": "cpu",
-        "log": "cpu",
-        "broadcast_add": "cpu",
-        "sqrt": "cpu",
-        "elemwise_add": "cpu",
-        "tanh": "cpu"
-    }
-    check_annotated_graph(ret, target, op_name_device, 2, fallback_device,
-                          shape_dict, params)
-
-    # Fuse exp, sqrt, log, and boradcast_add
-    op_name_device = {
-        "exp": device,
-        "log": device,
-        "broadcast_add": device,
-        "sqrt": device,
-        "elemwise_add": device,
-        "tanh": "cpu"
-    }
-    check_annotated_graph(ret, target, op_name_device, 4, fallback_device,
-                          shape_dict, params)
+    def compile_run_graph(device, target):
+        if not tvm.module.enabled(device):
+            print("Skip test because %s is not enabled." % device)
+            return
+
+        batch_size = 1
+        data_shape = (batch_size, 3, 224, 224)
+        data = symbol.Variable('data', shape=data_shape, dtype="float32")
+        shape_dict = {"data": data_shape}
+        params = {}
+        params["data"] = np.random.uniform(-1, 1,
+                                           size=data_shape).astype("float32")
+
+        exp = symbol.exp(data, name='exp')
+        sqrt = symbol.sqrt(exp, name='sqrt')
+        log = symbol.log(exp, name='log')
+        ret = sqrt + log
+        ret = symbol.tanh(ret)
+
+        fallback_device = tvm.context("cpu")
+        target = {"cpu": "llvm", device: target}
+
+        # Fuse log and broadcast_add.
+        op_name_device = {
+            "exp": "cpu",
+            "log": "cpu",
+            "broadcast_add": "cpu",
+            "sqrt": device,
+            "elemwise_add": device,
+            "tanh": device
+        }
+        check_annotated_graph(ret, target, op_name_device, 8, fallback_device,
+                              shape_dict, params)
+
+        # Fuse log, broadcast_add, and tanh
+        op_name_device = {
+            "exp": "cpu",
+            "log": device,
+            "broadcast_add": device,
+            "sqrt": "cpu",
+            "elemwise_add": "cpu",
+            "tanh": device
+        }
+        check_annotated_graph(ret, target, op_name_device, 6, fallback_device,
+                              shape_dict, params)
+
+        # No operator will be fused.
+        op_name_device = {
+            "exp": device,
+            "log": "cpu",
+            "broadcast_add": device,
+            "sqrt": "cpu",
+            "elemwise_add": device,
+            "tanh": "cpu"
+        }
+        check_annotated_graph(ret, target, op_name_device, 11,
+                              tvm.context(device),
+                              shape_dict, params)
+
+        # All operators will be fused.
+        op_name_device = {
+            "exp": device,
+            "log": device,
+            "broadcast_add": device,
+            "sqrt": device,
+            "elemwise_add": device,
+            "tanh": device
+        }
+        check_annotated_graph(ret, target, op_name_device, 2, fallback_device,
+                              shape_dict, params)
+
+        # All operators will be fuesed and fallback to the device context.
+        op_name_device = None
+        check_annotated_graph(ret, target, op_name_device, 2, fallback_device,
+                              shape_dict, params)
+
+        # All operators will be fused since all of them are annotated to the
+        # same device.
+        op_name_device = {
+            "exp": "cpu",
+            "log": "cpu",
+            "broadcast_add": "cpu",
+            "sqrt": "cpu",
+            "elemwise_add": "cpu",
+            "tanh": "cpu"
+        }
+        check_annotated_graph(ret, target, op_name_device, 2, fallback_device,
+                              shape_dict, params)
+
+        # Fuse exp, sqrt, log, and boradcast_add
+        op_name_device = {
+            "exp": device,
+            "log": device,
+            "broadcast_add": device,
+            "sqrt": device,
+            "elemwise_add": device,
+            "tanh": "cpu"
+        }
+        check_annotated_graph(ret, target, op_name_device, 4, fallback_device,
+                              shape_dict, params)
+
+    for dev, tar in [("opencl", "opencl"), ("cuda", "cuda"),
+                     ("opencl", str(tvm.target.intel_graphics()))]:
+        compile_run_graph(dev, tar)
 
 
 def check_graph(sym, target, op_name_device, fallback_device, data_shape,
@@ -213,14 +228,14 @@ def check_graph(sym, target, op_name_device, fallback_device, data_shape,
                                       dtype=dtype, params=params1)
 
     # annotate and compile the graph
-    deploy_graph, libmod, params = nnvm.compiler.build(
-        sym,
-        target=target,
-        shape=data_shape,
-        dtype=dtype,
-        params=params,
-        op_name_device=op_name_device,
-        fallback_device=fallback_device)
+    with nnvm.compiler.build_config(fallback_device=fallback_device,
+                                    op_name_device=op_name_device):
+        deploy_graph, libmod, params = nnvm.compiler.build(
+            sym,
+            target=target,
+            shape=data_shape,
+            dtype=dtype,
+            params=params)
     contexts = [tvm.context(dev) for dev in target.keys()]
 
     def check_load_module():
@@ -267,52 +282,58 @@ def check_inmemory_module():
     check_inmemory_module()
 
 
-def test_duplex_data_transfer(device, target):
-    R""" This unittest tests duplex communication between the host and
-    accelerator device. The network is as following:
-                data
-                  |
-                conv2d  (acc)
-                  |
-             batch_norm (cpu)
-                  |
-                conv2d  (acc)
-    """
-    if not tvm.module.enabled(device):
-        print("Skip test because %s is not enabled." % device)
-        return
-
-    out_channels = 16
-    data = symbol.Variable(name="data")
-    simple_net = symbol.conv2d(data=data, kernel_size=(3, 3),
-                               channels=out_channels, padding=(1, 1),
-                               use_bias=False)
-    simple_net = symbol.batch_norm(simple_net)
-    simple_net = symbol.conv2d(data=simple_net, kernel_size=(3, 3),
-                               channels=out_channels, padding=(1, 1),
-                               use_bias=False)
-
-    batch_size = 1
-    data_shape = (batch_size, 3, 224, 224)
-    shape_dict = {"data": data_shape}
-    net, params = utils.create_workload(simple_net, batch_size,
-                                        data_shape[1:])
-    params["data"] = data = np.random.uniform(-1, 1,
-                                              size=data_shape).astype(
-        "float32")
-
-    target = {"cpu": "llvm", device: target}
-    op_name_device = {"conv2d": device, "batch_norm": "cpu",
-                      "broadcast_add": "cpu", "elemwise_mul": "cpu"}
-    fallback_device = tvm.context("cpu")
-    check_graph(net, target, op_name_device, fallback_device, shape_dict,
-                params)
+# FIXME: comment out the following test for now. Uncomment it after we rebased
+# to upstream where
+# https://github.com/neo-ai/tvm/blob/unstable/python/tvm/_ffi/ndarray.py#L114
+# returns _make_array(handle, False, False))
+# def test_duplex_data_transfer():
+#     R""" This unittest tests duplex communication between the host and
+#     accelerator device. The network is as following:
+#                 data
+#                   |
+#                 conv2d  (acc)
+#                   |
+#              batch_norm (cpu)
+#                   |
+#                 conv2d  (acc)
+#     """
+#     def compile_run_graph(device, target):
+#         if not tvm.module.enabled(device):
+#             print("Skip test because %s is not enabled." % device)
+#             return
+# 
+#         out_channels = 16
+#         data = symbol.Variable(name="data")
+#         simple_net = symbol.conv2d(data=data, kernel_size=(3, 3),
+#                                    channels=out_channels, padding=(1, 1),
+#                                    use_bias=False)
+#         simple_net = symbol.batch_norm(simple_net)
+#         simple_net = symbol.conv2d(data=simple_net, kernel_size=(3, 3),
+#                                    channels=out_channels, padding=(1, 1),
+#                                    use_bias=False)
+# 
+#         batch_size = 1
+#         data_shape = (batch_size, 3, 224, 224)
+#         shape_dict = {"data": data_shape}
+#         net, params = utils.create_workload(simple_net, batch_size,
+#                                             image_shape=data_shape[1:])
+#         params["data"] = data = np.random.uniform(-1, 1,
+#                                                   size=data_shape).astype(
+#             "float32")
+# 
+#         target = {"cpu": "llvm", device: target}
+#         op_name_device = {"conv2d": device, "batch_norm": "cpu",
+#                           "broadcast_add": "cpu", "elemwise_mul": "cpu"}
+#         fallback_device = tvm.context("cpu")
+#         check_graph(net, target, op_name_device, fallback_device, shape_dict,
+#                     params)
+# 
+#     for dev, tar in [("opencl", "opencl"), ("cuda", "cuda"),
+#                      ("opencl", str(tvm.target.intel_graphics()))]:
+#         compile_run_graph(dev, tar)
 
 
 if __name__ == "__main__":
-    for dev, tar in [("opencl", "opencl"), ("cuda", "cuda"),
-                     ("opencl", str(tvm.target.intel_graphics()))]:
-        test_conv_network(dev, tar)
-        test_fusible_network(dev, tar)
-        test_duplex_data_transfer(dev, tar)
-
+    test_conv_network()
+    test_fusible_network()
+    # test_duplex_data_transfer()
diff --git a/topi/python/topi/arm_cpu/conv2d.py b/topi/python/topi/arm_cpu/conv2d.py
index 605749d460f7..e402d808096a 100644
--- a/topi/python/topi/arm_cpu/conv2d.py
+++ b/topi/python/topi/arm_cpu/conv2d.py
@@ -548,6 +548,14 @@ def _alter_conv2d_layout_arm(attrs, inputs, tinfos, F):
     new_attrs = {k: attrs[k] for k in attrs.keys()}
 
     dilation = attrs.get_int_tuple("dilation")
+    assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
+                                                      "when alter_op_layout is enabled"
+
+    # Remove attached compilation target because conv2d_NCHWc needs to create
+    # a conv2d_nchwc op and target is not one of conv2d's parameters.
+    if "target" in new_attrs:
+        del new_attrs["target"]
+
     strides = attrs.get_int_tuple("strides")
     padding = attrs.get_int_tuple("padding")
     groups = attrs.get_int('groups')
diff --git a/topi/python/topi/cuda/conv2d_winograd.py b/topi/python/topi/cuda/conv2d_winograd.py
index 0c2ea3db6f80..2f2d0deab69d 100644
--- a/topi/python/topi/cuda/conv2d_winograd.py
+++ b/topi/python/topi/cuda/conv2d_winograd.py
@@ -355,6 +355,14 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, F):
     copy_inputs = [s for s in inputs]
     new_attrs = {k: attrs[k] for k in attrs.keys()}
 
+    assert attrs.get_int_tuple("dilation") == (1, 1), "Does not support dilation " \
+                                                      "when alter_op_layout is enabled"
+
+     # Remove attached compilation target because `transform` needs to
+     # a conv2d_nchwc op and target is not one of conv2d's parameters.
+    if "target" in new_attrs:
+        del new_attrs["target"]
+
     strides = attrs.get_int_tuple("strides")
     padding = attrs.get_int_tuple("padding")
     dilation = attrs.get_int_tuple("dilation")
diff --git a/topi/python/topi/x86/conv2d.py b/topi/python/topi/x86/conv2d.py
index 720b35a1134b..eefa5fec80df 100644
--- a/topi/python/topi/x86/conv2d.py
+++ b/topi/python/topi/x86/conv2d.py
@@ -328,6 +328,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfo, F):
     new_attrs[layout_name] = 'NCHW%dc' % ic_bn
     new_attrs['out_layout'] = 'NCHW%dc' % oc_bn
 
+    # Remove attached compilation target because conv2d_NCHWc needs to create
+    # a conv2d_nchwc op and target is not one of conv2d's parameters.
+    if "target" in new_attrs:
+        del new_attrs["target"]
+
     new_data = tvm.placeholder((batch_size, in_channel//ic_bn, height, width, ic_bn),
                                dtype=data.dtype)
     if is_depthwise:
diff --git a/vta/python/vta/top/vta_conv2d.py b/vta/python/vta/top/vta_conv2d.py
index e7d584a791fc..ab06cadf8247 100644
--- a/vta/python/vta/top/vta_conv2d.py
+++ b/vta/python/vta/top/vta_conv2d.py
@@ -188,12 +188,26 @@ def packed_conv2d(data,
         name="res", tag="packed_conv2d")
     return res
 
+
 @tvm.register_func("nnvm.compiler.build_target", override=True)
 def _build(funcs, target, target_host):
+    if isinstance(funcs, (dict, tvm.container.Map)):
+        new_funcs = {}
+        for key, val in funcs.items():
+            tvm_t = tvm.target.create(key)
+            print(tvm_t.device_name)
+            if tvm_t.device_name == "vta":
+                new_funcs["ext_dev"] = val
+            elif tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu":
+                new_funcs[target_host] = val
+            else:
+                new_funcs[key] = val
+        return tvm.build(new_funcs, target=target, target_host=target_host)
+
     tvm_t = tvm.target.create(target)
     if tvm_t.device_name == "vta":
         return tvm.build(funcs, target="ext_dev", target_host=target_host)
-    elif tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu":
+    if tvm_t.device_name == "rasp" or tvm_t.device_name == "vtacpu":
         return tvm.build(funcs, target=target_host)
     return tvm.build(funcs, target=target)