From 1aa40b2eb7aa8045c71bfc881b6299bc03756385 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Wed, 23 Mar 2022 16:27:07 +0900
Subject: [PATCH 1/9] [ARM] Support NCHWc alter layout in the fallback mode

---
 python/tvm/relay/op/strategy/arm_cpu.py    |  2 ++
 python/tvm/topi/arm_cpu/conv2d_alter_op.py |  8 +++++---
 python/tvm/topi/x86/conv2d_int8.py         | 15 ++++++++++++---
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 44c46ae988af..0fb74507de6d 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -93,6 +93,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                     wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
                     wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
                     name="conv2d_nchw_spatial_pack.arm_cpu",
+                    plevel=10,
                 )

                 if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
@@ -100,6 +101,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                         wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_int8),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_int8),
                         name="conv2d_nchw_int8.arm_cpu",
+                        plevel=15,
                     )
                 else:
                     strategy.add_implementation(
diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 409768fc8f75..2ec4d412635d 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -27,6 +27,7 @@
 from ..nn import conv2d_alter_layout, conv2d_legalize
 from ..utils import get_const_tuple
 from ..x86.conv2d import _get_default_config as _get_x86_default_config
+from ..x86.conv2d_int8 import _get_default_config_int8
 from .conv2d_int8 import is_int8_hw_support
 from .arm_utils import get_tiling_B_interleaved_t
 from ..generic.conv2d import conv2d_alter_int8_common
@@ -101,9 +102,9 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         # we then assume it's not necessary to alter this op.
         return None
     cfg = dispatch_ctx.query(target, workload)
-    if cfg.is_fallback:  # if is fallback, clear query cache and return None
-        autotvm.task.clear_fallback_cache(target, workload)
-        return None
+    # if cfg.is_fallback:  # if is fallback, clear query cache and return None
+    #     autotvm.task.clear_fallback_cache(target, workload)
+    #     return None

     topi_tmpl = workload[0]
     new_attrs = {k: attrs[k] for k in attrs.keys()}
@@ -357,6 +358,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 out_dtype,
                 False,
                 data_layout,
+                int32_lanes=32
             )

         batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py
index 075723303841..8a20dfa6974c 100644
--- a/python/tvm/topi/x86/conv2d_int8.py
+++ b/python/tvm/topi/x86/conv2d_int8.py
@@ -34,7 +34,16 @@


 def _get_default_config_int8(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW"
+    cfg,
+    data,
+    kernel,
+    strides,
+    padding,
+    dilation,
+    out_dtype,
+    is_depthwise=False,
+    layout="NCHW",
+    int32_lanes=16,
 ):
     """
     Get default schedule config for the workload
@@ -50,11 +59,11 @@ def _get_default_config_int8(
         is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1
         if is_kernel_1x1:
             conv2d_generic.fallback_schedule_cpu_1x1_int8(
-                cfg, wkl, int32_lanes=16, num_int8_elements=4
+                cfg, wkl, int32_lanes=int32_lanes, num_int8_elements=4
             )
         else:
             conv2d_generic.fallback_schedule_cpu_common_int8(
-                cfg, wkl, int32_lanes=16, num_int8_elements=4
+                cfg, wkl, int32_lanes=int32_lanes, num_int8_elements=4
             )
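[Note on PATCH 1/9]

The two plevel values set the priority among the registered conv2d
implementations: higher wins when several apply, so conv2d_nchw_int8 at
plevel=15 now outranks the spatial-pack schedule at plevel=10 whenever
is_int8_hw_support detects dot-product capable hardware. With the early
return disabled, the fallback config must supply usable tile_ic/tile_oc
splits, which is what threading int32_lanes through
_get_default_config_int8 is for. The snippet below is a simplified,
hypothetical sketch of the kind of channel-block choice the fallback
helpers make; pick_fallback_blocks is not a TVM API:

def pick_fallback_blocks(in_channel, out_channel, int32_lanes=4, num_int8_elements=4):
    """Pick the largest channel blocks that evenly divide the workload."""

    def largest_divisor(n, cap):
        # Scan downward from the vector-width cap for an exact divisor.
        for bn in range(cap, 0, -1):
            if n % bn == 0:
                return bn
        return 1

    # One output block per int32 accumulator lane of the dot-product unit.
    oc_bn = largest_divisor(out_channel, int32_lanes)
    # Each lane reduces num_int8_elements int8 values at a time, so prefer
    # an input block that is a multiple of that.
    ic_bn = largest_divisor(in_channel, int32_lanes * num_int8_elements)
    return ic_bn, oc_bn

print(pick_fallback_blocks(64, 32))  # -> (16, 4) with the NEON-style defaults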
From 45946ab525ee777e710363b9cf34ef2fbcb167a5 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Wed, 23 Mar 2022 16:33:29 +0900
Subject: [PATCH 2/9] remove fallback path

---
 python/tvm/topi/arm_cpu/conv2d_alter_op.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 2ec4d412635d..2892a381fb9b 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -102,9 +102,6 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         # we then assume it's not necessary to alter this op.
         return None
     cfg = dispatch_ctx.query(target, workload)
-    # if cfg.is_fallback:  # if is fallback, clear query cache and return None
-    #     autotvm.task.clear_fallback_cache(target, workload)
-    #     return None

     topi_tmpl = workload[0]
     new_attrs = {k: attrs[k] for k in attrs.keys()}
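[Note on PATCH 2/9]

The deleted comment block is the remnant of the old early exit for untuned
workloads, commented out in PATCH 1/9 and removed here. For reference, a
"fallback" config is simply what autotvm's dispatch context answers with
when no tuning records match the workload. A minimal check, assuming
FallbackConfigEntity is importable from tvm.autotvm.task.space as in TVM of
this era:

from tvm.autotvm.task.space import FallbackConfigEntity

# Untuned workloads are answered with a FallbackConfigEntity; its
# is_fallback flag is the condition the removed early return used to test.
cfg = FallbackConfigEntity()
assert cfg.is_fallback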
From fc2d9efd7503cd8b7463802fde1fd69ac0704c32 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Wed, 23 Mar 2022 16:45:59 +0900
Subject: [PATCH 3/9] add test

---
 python/tvm/topi/arm_cpu/conv2d_alter_op.py |  2 +-
 .../test_meta_schedule_integration.py      | 46 +++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 2892a381fb9b..0c0e3f695b95 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -355,7 +355,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 out_dtype,
                 False,
                 data_layout,
-                int32_lanes=32
+                int32_lanes=32,
             )

         batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py
index 68ee840d15ea..36dd02068b36 100644
--- a/tests/python/unittest/test_meta_schedule_integration.py
+++ b/tests/python/unittest/test_meta_schedule_integration.py
@@ -16,9 +16,11 @@
 # under the License.
 import sys
 from typing import List
+import numpy as np

 import pytest
 import tvm
+from tvm import relay
 from tvm import meta_schedule as ms
 from tvm.ir.module import IRModule
 from tvm.meta_schedule.database import PyDatabase, TuningRecord, Workload
@@ -149,5 +151,49 @@ def extract_task_qbert():
         assert "vnni" in annotations["schedule_rule"]


+def extract_task_arm_conv2d_nchwc():
+    data_shape = (1, 32, 128, 128)
+    weight_shape = (32, 32, 1, 1)
+    bias_shape = (weight_shape[0],)
+    padding = (1, 1)
+
+    data = relay.var("data", shape=data_shape, dtype="int8")
+    weight = relay.var("weight", shape=weight_shape, dtype="int8")
+    bias = relay.var("bias", shape=bias_shape, dtype="int32")
+    conv2d = relay.nn.conv2d(
+        data=data,
+        weight=weight,
+        kernel_size=weight_shape[2:],
+        channels=weight_shape[0],
+        padding=padding,
+        strides=(1, 1),
+        out_dtype="int32",
+    )
+    bias_add = relay.nn.bias_add(conv2d, bias)
+    relay_mod = tvm.IRModule.from_expr(bias_add)
+
+    weight_np = np.random.uniform(1, 10, size=weight_shape).astype("int8")
+    bias_np = np.random.uniform(1, 10, size=bias_shape).astype("int32")
+
+    params = {"weight": weight_np, "bias": bias_np}
+
+    target = "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon"
+    extracted_tasks = extract_task_from_relay(relay_mod, target, params)
+    tune_tasks = list(
+        filter(
+            lambda task: "conv2d" in task.task_name,
+            extracted_tasks,
+        )
+    )
+
+    assert len(tune_tasks) == 1
+
+    relay_func = list(tune_tasks[0].mod.functions.values())[0]
+    out_type = relay_func.body.checked_type
+
+    # Check that the output is in NCHWc layout
+    assert list(out_type.shape) == [1, 1, 130, 130, 32]
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__] + sys.argv[1:]))

From 88a2e0250a2cd1540c402114275f4641e6d5a708 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 05:58:25 +0900
Subject: [PATCH 4/9] fixed int32_lanes and add channel check

---
 python/tvm/relay/op/strategy/arm_cpu.py    |  5 ++++-
 python/tvm/topi/arm_cpu/conv2d_alter_op.py | 14 ++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 0fb74507de6d..9f34134f14d5 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -96,7 +96,10 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                     plevel=10,
                 )

-                if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
+                if (
+                    topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype)
+                    and kernel.shape[1] >= 64
+                ):
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_int8),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_int8),
diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 0c0e3f695b95..19fd3725f364 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -344,6 +344,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
     if topi_tmpl == "conv2d_NCHWc_int8.arm_cpu":
         assert data_layout == "NCHW" and kernel_layout == "OIHW"

+        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
+        out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
+
+        n_elems = 8
+
         if cfg.is_fallback:
             _get_default_config_int8(
                 cfg,
@@ -355,13 +360,14 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 out_dtype,
                 False,
                 data_layout,
-                int32_lanes=32,
+                int32_lanes=4,
             )

-        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-        out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
         ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-        n_elems = 8
+
+        if cfg.is_fallback:
+            # ic_bn needs to be devided by n_elems below
+            ic_bn = max(ic_bn, n_elems)

         # update new attrs
         new_attrs["channels"] = out_channel
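[Note on PATCH 4/9]

The int32_lanes=32 value from PATCH 1/9 was wrong for ARM: a 128-bit NEON
register holds four int32 accumulators, hence the correction to 4. The
ic_bn = max(ic_bn, n_elems) bump exists because the NCHWc int8 kernel
tensorizes over an inner block of n_elems int8 values per accumulator, so
ic_bn must be divisible by n_elems. For the power-of-two blocks the
fallback splitter produces, bumping small blocks up to n_elems restores
that invariant. A standalone check of the arithmetic (plain Python, no TVM
needed):

# The tensorized int8 kernel consumes n_elems int8 values per accumulator,
# so the inner channel block must satisfy ic_bn % n_elems == 0. For
# power-of-two fallback blocks, max(ic_bn, n_elems) guarantees this.
n_elems = 8
for ic_bn in (2, 4, 8, 16):
    fixed = max(ic_bn, n_elems)
    assert fixed % n_elems == 0
    print(f"fallback ic_bn={ic_bn} -> ic_bn={fixed}")

The kernel.shape[1] >= 64 guard in the strategy presumably keeps the int8
NCHWc path away from workloads with few input channels, where bumping
ic_bn past the real channel count would break the layout transform.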
From d6f35747478233a995dab198a814263adaf78883 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 11:17:39 +0900
Subject: [PATCH 5/9] fixed schedule dispatch bug

---
 python/tvm/relay/op/strategy/arm_cpu.py |  8 ++++++--
 python/tvm/topi/arm_cpu/conv2d_int8.py  | 12 ++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 9f34134f14d5..862377887fec 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -388,12 +388,16 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ
     if layout == "NHWC" and data.dtype in ["int8", "uint8"]:
         strategy.add_implementation(
             wrap_compute_conv2d_gemm(native_compute),
-            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native),
+            wrap_topi_schedule(
+                topi.arm_cpu.schedule_conv2d_NHWC_quantized_native_without_transform
+            ),
             name="conv2d_NHWC_quantized_native_without_transform.arm_cpu",
         )
         strategy.add_implementation(
             wrap_compute_conv2d_gemm(interleaved_compute),
-            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved),
+            wrap_topi_schedule(
+                topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved_without_transform
+            ),
             name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
         )
     else:
diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py
index 8d9c47966113..d09433b16a78 100644
--- a/python/tvm/topi/arm_cpu/conv2d_int8.py
+++ b/python/tvm/topi/arm_cpu/conv2d_int8.py
@@ -297,6 +297,12 @@ def schedule_conv2d_NHWC_quantized_interleaved(cfg, outs):
     return _schedule_conv2d_NHWC_quantized(cfg, outs, True)


+@autotvm.register_topi_schedule("conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu")
+def schedule_conv2d_NHWC_quantized_interleaved_without_transform(cfg, outs):
+    """Interface for interleaved schedule_conv2d_NHWC_quantized_interleaved"""
+    return _schedule_conv2d_NHWC_quantized(cfg, outs, True)
+
+
 # Native schedules: those schedule won't interleave A (which is left in its native form).
 # The weights are interleaved and transposed
 @autotvm.register_topi_compute("conv2d_NHWC_quantized_native.arm_cpu")
@@ -330,3 +336,9 @@ def compute_conv2d_NHWC_quantized_native_without_transform(
 def schedule_conv2d_NHWC_quantized_native(cfg, outs):
     """Interface for native schedule_conv2d_NHWC_quantized"""
     return _schedule_conv2d_NHWC_quantized(cfg, outs, False)
+
+
+@autotvm.register_topi_schedule("conv2d_NHWC_quantized_native_without_transform.arm_cpu")
+def schedule_conv2d_NHWC_quantized_native_without_transform(cfg, outs):
+    """Interface for native schedule_conv2d_NHWC_quantized"""
+    return _schedule_conv2d_NHWC_quantized(cfg, outs, False)
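[Note on PATCH 5/9]

autotvm dispatches schedules by workload name, so the *_without_transform
compute definitions need schedules registered under exactly those names;
pointing the strategy at the schedules registered for the original names is
what made dispatch miss. A toy analogue of the registration pattern, where
register_topi_schedule is a plain-dict stand-in rather than TVM's decorator:

SCHEDULES = {}

def register_topi_schedule(name):
    # Record the schedule function under its workload name, as autotvm does.
    def wrap(func):
        SCHEDULES[name] = func
        return func
    return wrap

@register_topi_schedule("conv2d_NHWC_quantized_native.arm_cpu")
def schedule_native(cfg, outs):
    return "native schedule"

@register_topi_schedule("conv2d_NHWC_quantized_native_without_transform.arm_cpu")
def schedule_native_without_transform(cfg, outs):
    # Same underlying schedule, but retrievable under the post-alter-layout
    # workload name; without this entry the lookup has no match.
    return schedule_native(cfg, outs)

print(SCHEDULES["conv2d_NHWC_quantized_native_without_transform.arm_cpu"](None, None))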
From c8b14e15c47f39a40ea250a3847f6dac646c2914 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 11:29:11 +0900
Subject: [PATCH 6/9] add workaround fallback path for NHWC im2col based GEMM schedule

---
 python/tvm/topi/arm_cpu/conv2d_alter_op.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 19fd3725f364..ba2c59916ced 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -400,6 +400,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)

     if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu":
+        # TODO(masahi): This schedule can easily result in a tensorization error
+        # if used in the fallback mode
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(target, workload)
+            return None
+
         assert data_layout == "NHWC" and kernel_layout == "HWIO"
         KH, KW, _, OC = get_const_tuple(kernel.shape)
         new_workload_name = "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu"
@@ -416,6 +422,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
             inputs[0], new_kernel_expr, **new_attrs
         )
     if topi_tmpl == "conv2d_NHWC_quantized_native.arm_cpu":
+        # TODO(masahi): This schedule can easily result in a tensorization error
+        # if used in the fallback mode
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(target, workload)
+            return None
+
         assert data_layout == "NHWC" and kernel_layout == "HWIO"
         KH, KW, _, OC = get_const_tuple(kernel.shape)
         new_workload_name = "conv2d_NHWC_quantized_native_without_transform.arm_cpu"

From 0beacdf340f078c0b158400593fb077d0ea30245 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 11:33:07 +0900
Subject: [PATCH 7/9] int32_lanes=4 by default

---
 python/tvm/topi/x86/conv2d_alter_op.py | 1 +
 python/tvm/topi/x86/conv2d_int8.py     | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py
index 9234581f1d5b..032c0e2e236b 100644
--- a/python/tvm/topi/x86/conv2d_alter_op.py
+++ b/python/tvm/topi/x86/conv2d_alter_op.py
@@ -159,6 +159,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 out_dtype,
                 False,
                 data_layout,
+                int32_lanes=16,
             )

         batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
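[Note on PATCH 7/9]

The int32_lanes default mirrors how many int32 accumulators fit in one
vector register of the target's int8 instruction set: four in a 128-bit
NEON register versus sixteen in a 512-bit AVX-512 register. The x86
conv2d_NCHWc_int8 template therefore keeps passing int32_lanes=16
explicitly, while the shared default drops to the ARM-friendly 4. The
arithmetic, as a quick standalone illustration:

# int32 lanes per vector register, and the int8 values one dot-product
# instruction reduces (4 int8 values per int32 lane on both ISAs).
for isa, vector_bits in [("neon", 128), ("avx512", 512)]:
    int32_lanes = vector_bits // 32
    num_int8_elements = 4
    print(isa, "int32_lanes =", int32_lanes,
          "int8 values per instruction =", int32_lanes * num_int8_elements)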
diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py
index 8a20dfa6974c..b0edb02b0804 100644
--- a/python/tvm/topi/x86/conv2d_int8.py
+++ b/python/tvm/topi/x86/conv2d_int8.py
@@ -43,7 +43,7 @@ def _get_default_config_int8(
     out_dtype,
     is_depthwise=False,
     layout="NCHW",
-    int32_lanes=16,
+    int32_lanes=4,
 ):
     """
     Get default schedule config for the workload
@@ -172,6 +172,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out
            padding,
            dilation,
            out_dtype,
+           int32_lanes=16,
        )

     # Pack data if raw 4-D data is provided.

From f43aabd9ea82d2fc87131847805985e5365dd50e Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 11:36:37 +0900
Subject: [PATCH 8/9] typo

---
 python/tvm/topi/arm_cpu/conv2d_alter_op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index ba2c59916ced..eb719dd66777 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -366,7 +366,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

         if cfg.is_fallback:
-            # ic_bn needs to be devided by n_elems below
+            # ic_bn needs to be divided by n_elems below
             ic_bn = max(ic_bn, n_elems)

         # update new attrs

From 77d169420c7a13913931fa9b79c898482198b383 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 11:39:54 +0900
Subject: [PATCH 9/9] update test

---
 tests/python/unittest/test_meta_schedule_integration.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py
index 36dd02068b36..8186d3c178d6 100644
--- a/tests/python/unittest/test_meta_schedule_integration.py
+++ b/tests/python/unittest/test_meta_schedule_integration.py
@@ -152,8 +152,8 @@ def extract_task_qbert():


 def extract_task_arm_conv2d_nchwc():
-    data_shape = (1, 32, 128, 128)
-    weight_shape = (32, 32, 1, 1)
+    data_shape = (1, 64, 128, 128)
+    weight_shape = (32, 64, 1, 1)
     bias_shape = (weight_shape[0],)
     padding = (1, 1)

@@ -192,7 +192,7 @@ def extract_task_arm_conv2d_nchwc():
     out_type = relay_func.body.checked_type

     # Check that the output is in NCHWc layout
-    assert list(out_type.shape) == [1, 1, 130, 130, 32]
+    assert list(out_type.shape) == [1, 8, 130, 130, 4]


 if __name__ == "__main__":
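
[Note on PATCH 9/9]

The input channels move from 32 to 64 so the workload passes the
kernel.shape[1] >= 64 guard added in PATCH 4/9, and the updated expectation
follows directly from the fallback blocking: with oc_bn = 4, the 32 output
channels pack into 32 / 4 = 8 chunks, and a 1x1 kernel with padding (1, 1)
and stride 1 maps a 128x128 input to 130x130. A standalone check of that
shape arithmetic, where nchwc_out_shape is a hypothetical helper and not a
TVM API:

def nchwc_out_shape(n, out_channel, h, w, kernel, padding, stride, oc_bn):
    # Standard conv output size, then channels split into out_channel/oc_bn
    # chunks of oc_bn, giving the 5-D NCHWc layout.
    out_h = (h + 2 * padding - kernel) // stride + 1
    out_w = (w + 2 * padding - kernel) // stride + 1
    return [n, out_channel // oc_bn, out_h, out_w, oc_bn]

assert nchwc_out_shape(1, 32, 128, 128, 1, 1, 1, oc_bn=4) == [1, 8, 130, 130, 4]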