From 1aa40b2eb7aa8045c71bfc881b6299bc03756385 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Wed, 23 Mar 2022 16:27:07 +0900
Subject: [PATCH 1/9] [ARM] Support NCHWc alter layout in the fallback mode

---
 python/tvm/relay/op/strategy/arm_cpu.py    |  2 ++
 python/tvm/topi/arm_cpu/conv2d_alter_op.py |  8 +++++---
 python/tvm/topi/x86/conv2d_int8.py         | 15 ++++++++++++---
 3 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 44c46ae988af..0fb74507de6d 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -93,6 +93,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                     wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_spatial_pack),
                     wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_spatial_pack),
                     name="conv2d_nchw_spatial_pack.arm_cpu",
+                    plevel=10,
                 )

                 if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
@@ -100,6 +101,7 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                         wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_int8),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_int8),
                         name="conv2d_nchw_int8.arm_cpu",
+                        plevel=15,
                     )
                 else:
                     strategy.add_implementation(
diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 409768fc8f75..2ec4d412635d 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -27,6 +27,7 @@
 from ..nn import conv2d_alter_layout, conv2d_legalize
 from ..utils import get_const_tuple
 from ..x86.conv2d import _get_default_config as _get_x86_default_config
+from ..x86.conv2d_int8 import _get_default_config_int8
 from .conv2d_int8 import is_int8_hw_support
 from .arm_utils import get_tiling_B_interleaved_t
 from ..generic.conv2d import conv2d_alter_int8_common
@@ -101,9 +102,9 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         # we then assume it's not necessary to alter this op.
         return None
     cfg = dispatch_ctx.query(target, workload)
-    if cfg.is_fallback:  # if is fallback, clear query cache and return None
-        autotvm.task.clear_fallback_cache(target, workload)
-        return None
+    # if cfg.is_fallback:  # if is fallback, clear query cache and return None
+    #     autotvm.task.clear_fallback_cache(target, workload)
+    #     return None

     topi_tmpl = workload[0]
     new_attrs = {k: attrs[k] for k in attrs.keys()}
@@ -357,6 +358,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 out_dtype,
                 False,
                 data_layout,
+                int32_lanes=32
             )

         batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py
index 075723303841..8a20dfa6974c 100644
--- a/python/tvm/topi/x86/conv2d_int8.py
+++ b/python/tvm/topi/x86/conv2d_int8.py
@@ -34,7 +34,16 @@


 def _get_default_config_int8(
-    cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW"
+    cfg,
+    data,
+    kernel,
+    strides,
+    padding,
+    dilation,
+    out_dtype,
+    is_depthwise=False,
+    layout="NCHW",
+    int32_lanes=16,
 ):
     """
     Get default schedule config for the workload
@@ -50,11 +59,11 @@ def _get_default_config_int8(
         is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1
         if is_kernel_1x1:
             conv2d_generic.fallback_schedule_cpu_1x1_int8(
-                cfg, wkl, int32_lanes=16, num_int8_elements=4
+                cfg, wkl, int32_lanes=int32_lanes, num_int8_elements=4
             )
         else:
             conv2d_generic.fallback_schedule_cpu_common_int8(
-                cfg, wkl, int32_lanes=16, num_int8_elements=4
+                cfg, wkl, int32_lanes=int32_lanes, num_int8_elements=4
             )
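[Note on PATCH 1/9]

The two plevel values set the priority among the registered conv2d
implementations: higher wins when several apply, so conv2d_nchw_int8 at
plevel=15 now outranks the spatial-pack schedule at plevel=10 whenever
is_int8_hw_support detects dot-product capable hardware. With the early
return disabled, the fallback config must supply usable tile_ic/tile_oc
splits, which is what threading int32_lanes through
_get_default_config_int8 is for. The snippet below is a simplified,
hypothetical sketch of the kind of channel-block choice the fallback
helpers make; pick_fallback_blocks is not a TVM API:

def pick_fallback_blocks(in_channel, out_channel, int32_lanes=4, num_int8_elements=4):
    """Pick the largest channel blocks that evenly divide the workload."""

    def largest_divisor(n, cap):
        # Scan downward from the vector-width cap for an exact divisor.
        for bn in range(cap, 0, -1):
            if n % bn == 0:
                return bn
        return 1

    # One output block per int32 accumulator lane of the dot-product unit.
    oc_bn = largest_divisor(out_channel, int32_lanes)
    # Each lane reduces num_int8_elements int8 values at a time, so prefer
    # an input block that is a multiple of that.
    ic_bn = largest_divisor(in_channel, int32_lanes * num_int8_elements)
    return ic_bn, oc_bn

print(pick_fallback_blocks(64, 32))  # -> (16, 4) with the NEON-style defaults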
From 45946ab525ee777e710363b9cf34ef2fbcb167a5 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Wed, 23 Mar 2022 16:33:29 +0900
Subject: [PATCH 2/9] remove fallback path

---
 python/tvm/topi/arm_cpu/conv2d_alter_op.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 2ec4d412635d..2892a381fb9b 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -102,9 +102,6 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         # we then assume it's not necessary to alter this op.
         return None
     cfg = dispatch_ctx.query(target, workload)
-    # if cfg.is_fallback:  # if is fallback, clear query cache and return None
-    #     autotvm.task.clear_fallback_cache(target, workload)
-    #     return None

     topi_tmpl = workload[0]
     new_attrs = {k: attrs[k] for k in attrs.keys()}
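[Note on PATCH 2/9]

The deleted comment block is the remnant of the old early exit for untuned
workloads, commented out in PATCH 1/9 and removed here. For reference, a
"fallback" config is simply what autotvm's dispatch context answers with
when no tuning records match the workload. A minimal check, assuming
FallbackConfigEntity is importable from tvm.autotvm.task.space as in TVM of
this era:

from tvm.autotvm.task.space import FallbackConfigEntity

# Untuned workloads are answered with a FallbackConfigEntity; its
# is_fallback flag is the condition the removed early return used to test.
cfg = FallbackConfigEntity()
assert cfg.is_fallback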
From fc2d9efd7503cd8b7463802fde1fd69ac0704c32 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Wed, 23 Mar 2022 16:45:59 +0900
Subject: [PATCH 3/9] add test

---
 python/tvm/topi/arm_cpu/conv2d_alter_op.py |  2 +-
 .../test_meta_schedule_integration.py      | 46 +++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 2892a381fb9b..0c0e3f695b95 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -355,7 +355,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 out_dtype,
                 False,
                 data_layout,
-                int32_lanes=32
+                int32_lanes=32,
             )

         batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py
index 68ee840d15ea..36dd02068b36 100644
--- a/tests/python/unittest/test_meta_schedule_integration.py
+++ b/tests/python/unittest/test_meta_schedule_integration.py
@@ -16,9 +16,11 @@
 # under the License.
 import sys
 from typing import List
+import numpy as np

 import pytest
 import tvm
+from tvm import relay
 from tvm import meta_schedule as ms
 from tvm.ir.module import IRModule
 from tvm.meta_schedule.database import PyDatabase, TuningRecord, Workload
@@ -149,5 +151,49 @@ def extract_task_qbert():
         assert "vnni" in annotations["schedule_rule"]


+def extract_task_arm_conv2d_nchwc():
+    data_shape = (1, 32, 128, 128)
+    weight_shape = (32, 32, 1, 1)
+    bias_shape = (weight_shape[0],)
+    padding = (1, 1)
+
+    data = relay.var("data", shape=data_shape, dtype="int8")
+    weight = relay.var("weight", shape=weight_shape, dtype="int8")
+    bias = relay.var("bias", shape=bias_shape, dtype="int32")
+    conv2d = relay.nn.conv2d(
+        data=data,
+        weight=weight,
+        kernel_size=weight_shape[2:],
+        channels=weight_shape[0],
+        padding=padding,
+        strides=(1, 1),
+        out_dtype="int32",
+    )
+    bias_add = relay.nn.bias_add(conv2d, bias)
+    relay_mod = tvm.IRModule.from_expr(bias_add)
+
+    weight_np = np.random.uniform(1, 10, size=weight_shape).astype("int8")
+    bias_np = np.random.uniform(1, 10, size=bias_shape).astype("int32")
+
+    params = {"weight": weight_np, "bias": bias_np}
+
+    target = "llvm -device arm_cpu -mtriple aarch64-linux-gnu -mattr=+neon"
+    extracted_tasks = extract_task_from_relay(relay_mod, target, params)
+    tune_tasks = list(
+        filter(
+            lambda task: "conv2d" in task.task_name,
+            extracted_tasks,
+        )
+    )
+
+    assert len(tune_tasks) == 1
+
+    relay_func = list(tune_tasks[0].mod.functions.values())[0]
+    out_type = relay_func.body.checked_type
+
+    # Check that the output is in NCHWc layout
+    assert list(out_type.shape) == [1, 1, 130, 130, 32]
+
+
 if __name__ == "__main__":
     sys.exit(pytest.main([__file__] + sys.argv[1:]))

From 88a2e0250a2cd1540c402114275f4641e6d5a708 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 05:58:25 +0900
Subject: [PATCH 4/9] fixed int32_lanes and add channel check

---
 python/tvm/relay/op/strategy/arm_cpu.py    |  5 ++++-
 python/tvm/topi/arm_cpu/conv2d_alter_op.py | 14 ++++++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 0fb74507de6d..9f34134f14d5 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -96,7 +96,10 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                     plevel=10,
                 )

-                if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
+                if (
+                    topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype)
+                    and kernel.shape[1] >= 64
+                ):
                     strategy.add_implementation(
                         wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_int8),
                         wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_int8),
diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 0c0e3f695b95..19fd3725f364 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -344,6 +344,11 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
     if topi_tmpl == "conv2d_NCHWc_int8.arm_cpu":
         assert data_layout == "NCHW" and kernel_layout == "OIHW"

+        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
+        out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
+
+        n_elems = 8
+
         if cfg.is_fallback:
             _get_default_config_int8(
                 cfg,
@@ -355,13 +360,14 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 out_dtype,
                 False,
                 data_layout,
-                int32_lanes=32,
+                int32_lanes=4,
             )

-        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
-        out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
         ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
-        n_elems = 8
+
+        if cfg.is_fallback:
+            # ic_bn needs to be devided by n_elems below
+            ic_bn = max(ic_bn, n_elems)

         # update new attrs
         new_attrs["channels"] = out_channel
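[Note on PATCH 4/9]

The int32_lanes=32 value from PATCH 1/9 was wrong for ARM: a 128-bit NEON
register holds four int32 accumulators, hence the correction to 4. The
ic_bn = max(ic_bn, n_elems) bump exists because the NCHWc int8 kernel
tensorizes over an inner block of n_elems int8 values per accumulator, so
ic_bn must be divisible by n_elems. For the power-of-two blocks the
fallback splitter produces, bumping small blocks up to n_elems restores
that invariant. A standalone check of the arithmetic (plain Python, no TVM
needed):

# The tensorized int8 kernel consumes n_elems int8 values per accumulator,
# so the inner channel block must satisfy ic_bn % n_elems == 0. For
# power-of-two fallback blocks, max(ic_bn, n_elems) guarantees this.
n_elems = 8
for ic_bn in (2, 4, 8, 16):
    fixed = max(ic_bn, n_elems)
    assert fixed % n_elems == 0
    print(f"fallback ic_bn={ic_bn} -> ic_bn={fixed}")

The kernel.shape[1] >= 64 guard in the strategy presumably keeps the int8
NCHWc path away from workloads with few input channels, where bumping
ic_bn past the real channel count would break the layout transform.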
From d6f35747478233a995dab198a814263adaf78883 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 11:17:39 +0900
Subject: [PATCH 5/9] fixed schedule dispatch bug

---
 python/tvm/relay/op/strategy/arm_cpu.py |  8 ++++++--
 python/tvm/topi/arm_cpu/conv2d_int8.py  | 12 ++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
index 9f34134f14d5..862377887fec 100644
--- a/python/tvm/relay/op/strategy/arm_cpu.py
+++ b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -388,12 +388,16 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ
     if layout == "NHWC" and data.dtype in ["int8", "uint8"]:
         strategy.add_implementation(
             wrap_compute_conv2d_gemm(native_compute),
-            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native),
+            wrap_topi_schedule(
+                topi.arm_cpu.schedule_conv2d_NHWC_quantized_native_without_transform
+            ),
             name="conv2d_NHWC_quantized_native_without_transform.arm_cpu",
         )
         strategy.add_implementation(
             wrap_compute_conv2d_gemm(interleaved_compute),
-            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved),
+            wrap_topi_schedule(
+                topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved_without_transform
+            ),
             name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu",
         )
     else:
diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py
index 8d9c47966113..d09433b16a78 100644
--- a/python/tvm/topi/arm_cpu/conv2d_int8.py
+++ b/python/tvm/topi/arm_cpu/conv2d_int8.py
@@ -297,6 +297,12 @@ def schedule_conv2d_NHWC_quantized_interleaved(cfg, outs):
     return _schedule_conv2d_NHWC_quantized(cfg, outs, True)


+@autotvm.register_topi_schedule("conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu")
+def schedule_conv2d_NHWC_quantized_interleaved_without_transform(cfg, outs):
+    """Interface for interleaved schedule_conv2d_NHWC_quantized_interleaved"""
+    return _schedule_conv2d_NHWC_quantized(cfg, outs, True)
+
+
 # Native schedules: those schedule won't interleave A (which is left in its native form).
 # The weights are interleaved and transposed
 @autotvm.register_topi_compute("conv2d_NHWC_quantized_native.arm_cpu")
@@ -330,3 +336,9 @@ def compute_conv2d_NHWC_quantized_native_without_transform(
 def schedule_conv2d_NHWC_quantized_native(cfg, outs):
     """Interface for native schedule_conv2d_NHWC_quantized"""
     return _schedule_conv2d_NHWC_quantized(cfg, outs, False)
+
+
+@autotvm.register_topi_schedule("conv2d_NHWC_quantized_native_without_transform.arm_cpu")
+def schedule_conv2d_NHWC_quantized_native_without_transform(cfg, outs):
+    """Interface for native schedule_conv2d_NHWC_quantized"""
+    return _schedule_conv2d_NHWC_quantized(cfg, outs, False)
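[Note on PATCH 5/9]

autotvm dispatches schedules by workload name, so the *_without_transform
compute definitions need schedules registered under exactly those names;
pointing the strategy at the schedules registered for the original names is
what made dispatch miss. A toy analogue of the registration pattern, where
register_topi_schedule is a plain-dict stand-in rather than TVM's decorator:

SCHEDULES = {}

def register_topi_schedule(name):
    # Record the schedule function under its workload name, as autotvm does.
    def wrap(func):
        SCHEDULES[name] = func
        return func
    return wrap

@register_topi_schedule("conv2d_NHWC_quantized_native.arm_cpu")
def schedule_native(cfg, outs):
    return "native schedule"

@register_topi_schedule("conv2d_NHWC_quantized_native_without_transform.arm_cpu")
def schedule_native_without_transform(cfg, outs):
    # Same underlying schedule, but retrievable under the post-alter-layout
    # workload name; without this entry the lookup has no match.
    return schedule_native(cfg, outs)

print(SCHEDULES["conv2d_NHWC_quantized_native_without_transform.arm_cpu"](None, None))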
From c8b14e15c47f39a40ea250a3847f6dac646c2914 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 11:29:11 +0900
Subject: [PATCH 6/9] add workaround fallback path for NHWC im2col based GEMM schedule

---
 python/tvm/topi/arm_cpu/conv2d_alter_op.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index 19fd3725f364..ba2c59916ced 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -400,6 +400,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)

     if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu":
+        # TODO(masahi): This schedule can easily result in a tensorization error
+        # if used in the fallback mode
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(target, workload)
+            return None
+
         assert data_layout == "NHWC" and kernel_layout == "HWIO"
         KH, KW, _, OC = get_const_tuple(kernel.shape)
         new_workload_name = "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu"
@@ -416,6 +422,12 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
             inputs[0], new_kernel_expr, **new_attrs
         )
     if topi_tmpl == "conv2d_NHWC_quantized_native.arm_cpu":
+        # TODO(masahi): This schedule can easily result in a tensorization error
+        # if used in the fallback mode
+        if cfg.is_fallback:  # if is fallback, clear query cache and return None
+            autotvm.task.clear_fallback_cache(target, workload)
+            return None
+
         assert data_layout == "NHWC" and kernel_layout == "HWIO"
         KH, KW, _, OC = get_const_tuple(kernel.shape)
         new_workload_name = "conv2d_NHWC_quantized_native_without_transform.arm_cpu"

From 0beacdf340f078c0b158400593fb077d0ea30245 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 11:33:07 +0900
Subject: [PATCH 7/9] int32_lanes=4 by default

---
 python/tvm/topi/x86/conv2d_alter_op.py | 1 +
 python/tvm/topi/x86/conv2d_int8.py     | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py
index 9234581f1d5b..032c0e2e236b 100644
--- a/python/tvm/topi/x86/conv2d_alter_op.py
+++ b/python/tvm/topi/x86/conv2d_alter_op.py
@@ -159,6 +159,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
                 out_dtype,
                 False,
                 data_layout,
+                int32_lanes=16,
             )

         batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
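[Note on PATCH 7/9]

The int32_lanes default mirrors how many int32 accumulators fit in one
vector register of the target's int8 instruction set: four in a 128-bit
NEON register versus sixteen in a 512-bit AVX-512 register. The x86
conv2d_NCHWc_int8 template therefore keeps passing int32_lanes=16
explicitly, while the shared default drops to the ARM-friendly 4. The
arithmetic, as a quick standalone illustration:

# int32 lanes per vector register, and the int8 values one dot-product
# instruction reduces (4 int8 values per int32 lane on both ISAs).
for isa, vector_bits in [("neon", 128), ("avx512", 512)]:
    int32_lanes = vector_bits // 32
    num_int8_elements = 4
    print(isa, "int32_lanes =", int32_lanes,
          "int8 values per instruction =", int32_lanes * num_int8_elements)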
diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py
index 8a20dfa6974c..b0edb02b0804 100644
--- a/python/tvm/topi/x86/conv2d_int8.py
+++ b/python/tvm/topi/x86/conv2d_int8.py
@@ -43,7 +43,7 @@ def _get_default_config_int8(
     out_dtype,
     is_depthwise=False,
     layout="NCHW",
-    int32_lanes=16,
+    int32_lanes=4,
 ):
     """
     Get default schedule config for the workload
@@ -172,6 +172,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out
            padding,
            dilation,
            out_dtype,
+           int32_lanes=16,
        )

     # Pack data if raw 4-D data is provided.

From f43aabd9ea82d2fc87131847805985e5365dd50e Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 11:36:37 +0900
Subject: [PATCH 8/9] typo

---
 python/tvm/topi/arm_cpu/conv2d_alter_op.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
index ba2c59916ced..eb719dd66777 100644
--- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py
+++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -366,7 +366,7 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
         ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]

         if cfg.is_fallback:
-            # ic_bn needs to be devided by n_elems below
+            # ic_bn needs to be divided by n_elems below
             ic_bn = max(ic_bn, n_elems)

         # update new attrs

From 77d169420c7a13913931fa9b79c898482198b383 Mon Sep 17 00:00:00 2001
From: Masahiro Masuda
Date: Thu, 24 Mar 2022 11:39:54 +0900
Subject: [PATCH 9/9] update test

---
 tests/python/unittest/test_meta_schedule_integration.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/python/unittest/test_meta_schedule_integration.py b/tests/python/unittest/test_meta_schedule_integration.py
index 36dd02068b36..8186d3c178d6 100644
--- a/tests/python/unittest/test_meta_schedule_integration.py
+++ b/tests/python/unittest/test_meta_schedule_integration.py
@@ -152,8 +152,8 @@ def extract_task_qbert():


 def extract_task_arm_conv2d_nchwc():
-    data_shape = (1, 32, 128, 128)
-    weight_shape = (32, 32, 1, 1)
+    data_shape = (1, 64, 128, 128)
+    weight_shape = (32, 64, 1, 1)
     bias_shape = (weight_shape[0],)
     padding = (1, 1)

@@ -192,7 +192,7 @@ def extract_task_arm_conv2d_nchwc():
     out_type = relay_func.body.checked_type

     # Check that the output is in NCHWc layout
-    assert list(out_type.shape) == [1, 1, 130, 130, 32]
+    assert list(out_type.shape) == [1, 8, 130, 130, 4]


 if __name__ == "__main__":
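
[Note on PATCH 9/9]

The input channels move from 32 to 64 so the workload passes the
kernel.shape[1] >= 64 guard added in PATCH 4/9, and the updated expectation
follows directly from the fallback blocking: with oc_bn = 4, the 32 output
channels pack into 32 / 4 = 8 chunks, and a 1x1 kernel with padding (1, 1)
and stride 1 maps a 128x128 input to 130x130. A standalone check of that
shape arithmetic, where nchwc_out_shape is a hypothetical helper and not a
TVM API:

def nchwc_out_shape(n, out_channel, h, w, kernel, padding, stride, oc_bn):
    # Standard conv output size, then channels split into out_channel/oc_bn
    # chunks of oc_bn, giving the 5-D NCHWc layout.
    out_h = (h + 2 * padding - kernel) // stride + 1
    out_w = (w + 2 * padding - kernel) // stride + 1
    return [n, out_channel // oc_bn, out_h, out_w, oc_bn]

assert nchwc_out_shape(1, 32, 128, 128, 1, 1, 1, oc_bn=4) == [1, 8, 130, 130, 4]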