[TOPI] Add dense schedule for fp16 and fp32 using gemm

Eirene Pandi · Eirini Vlassi Pandi · commit b7d35df46fa1 · 2024-06-12T17:11:54.000+01:00
Add a new schedule for the dense operator
based on the gemm algorithm.

Change-Id: Iaf4423d21d20b5813c77a0a27c4751f8cbd1d8b8
diff --git a/cmake/config.cmake b/cmake/config.cmake
diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py
@@ -729,6 +729,17 @@ def schedule_dense_arm_cpu(attrs, inputs, out_type, target):
             plevel=12,
         )
 
+    if (
+        data.dtype in ["float16", "float32"]
+        and weight.dtype in ["float16", "float32"]
+        and out_type.dtype in ["float16", "float32"]
+    ):
+        strategy.add_implementation(
+            wrap_compute_dense(topi.arm_cpu.dense_gemm),
+            wrap_topi_schedule(topi.arm_cpu.schedule_dense_gemm),
+            name="dense_gemm.arm_cpu",
+            plevel=11,
+        )
     # Fallback to x86 schedules as there is currently no arm_cpu schedule for dense
     strategy.add_implementation(
         wrap_compute_dense(topi.x86.dense_nopack),
@@ -773,6 +784,18 @@ def matmul_strategy_arm_cpu(attrs, inputs, out_type, target):
             lambda: None,
             name="matmul.arm_cpu.sme",
         )
+    elif (
+        data.dtype in ["float16", "float32"]
+        and weight.dtype in ["float16", "float32"]
+        and out_type.dtype in ["float16", "float32"]
+        and not (attrs.transpose_a or attrs.transpose_b)
+        and len(data.shape) == 2
+    ):
+        strategy.add_implementation(
+            wrap_compute_matmul(topi.arm_cpu.dense_gemm),
+            wrap_topi_schedule(topi.arm_cpu.schedule_dense_gemm),
+            name="matmul.arm_cpu.neon",
+        )
         return strategy
 
     logger.warning("matmul is not optimized for arm cpu.")
diff --git a/python/tvm/topi/arm_cpu/dense.py b/python/tvm/topi/arm_cpu/dense.py
@@ -16,20 +16,29 @@
 # under the License.
 """Dense schedule for ARM CPU"""
 from tvm import autotvm
-
-from .mprofile.dsp.dense import (
-    dense_dsp_schedule,
-    dense_dsp_compute,
-)
+from .mprofile.dsp.dense import dense_dsp_schedule, dense_dsp_compute
+from .dense_gemm import dense_gemm_compute, dense_gemm_schedule
 
 
 @autotvm.register_topi_compute("dense_dsp.arm_cpu")
 def dense_dsp(cfg, data, weight, bias, out_dtype):
-    """Compute dense_dsp with v7e-m DSP instructions."""
+    """Compute dense with DSP instructions."""
     return dense_dsp_compute(cfg, data, weight, bias=bias, out_dtype=out_dtype)
 
 
 @autotvm.register_topi_schedule("dense_dsp.arm_cpu")
 def schedule_dense_dsp(cfg, outs):
     """Create schedule for dense_dsp"""
     return dense_dsp_schedule(cfg, outs)
+
+
+@autotvm.register_topi_compute("dense_gemm.arm_cpu")
+def dense_gemm(cfg, data, weight, bias, out_dtype, transpose_a=False, transpose_b=True):
+    """Compute dense using GeMM."""
+    return dense_gemm_compute(cfg, data, weight, bias, out_dtype, transpose_a, transpose_b)
+
+
+@autotvm.register_topi_schedule("dense_gemm.arm_cpu")
+def schedule_dense_gemm(cfg, outs):
+    """Create schedule for dense using GeMM."""
+    return dense_gemm_schedule(cfg, outs)
diff --git a/python/tvm/topi/arm_cpu/dense_alter_op.py b/python/tvm/topi/arm_cpu/dense_alter_op.py
@@ -47,6 +47,7 @@ def _alter_dense(attrs, inputs, tinfos, out_type):
 
     cfg = dispatch_ctx.query(target, workload)
     topi_impl = workload[0]
+
     if topi_impl == "matmul.arm_cpu.sme":
         # Pre-compute transposed weights and convert to a matmul
         assert isinstance(
@@ -82,6 +83,31 @@ def _alter_dense(attrs, inputs, tinfos, out_type):
             False,
             transpose_b,
         )
+    elif topi_impl == "dense_gemm.arm_cpu":
+        # Pre-compute transposed weights and convert to a matmul
+        assert isinstance(
+            inputs[1], relay.Constant
+        ), "dense_gemm.arm_cpu requires weights be a Relay Constant"
+
+        weight_dtype = tinfos[1].dtype
+        weight_data = inputs[1].data.numpy()
+        interleaved = weight_data.transpose()
+        encoded_weight = relay.const(interleaved, weight_dtype)
+
+        new_weight = te.placeholder((weight_data.shape), dtype=weight_dtype)
+        new_workload = autotvm.task.args_to_workload(
+            [tinfos[0], new_weight, None, out_type.dtype], topi_impl
+        )
+        dispatch_ctx.update(target, new_workload, cfg)
+
+        return relay.nn.matmul(
+            inputs[0],
+            encoded_weight,
+            units=attrs.units,
+            out_dtype=attrs.out_dtype,
+            transpose_a=False,
+            transpose_b=False,
+        )
 
     # x86 schedules are used as a fallback
     return tvm.topi.x86.dense_alter_op._alter_dense_layout(attrs, inputs, tinfos, out_type)
diff --git a/python/tvm/topi/arm_cpu/dense_gemm.py b/python/tvm/topi/arm_cpu/dense_gemm.py
@@ -0,0 +1,157 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=invalid-name, unused-variable, too-many-locals
+"""GEMM dense schedule on AArch64"""
+import tvm
+from tvm.target import Target
+from tvm import te
+from tvm.topi import nn
+from tvm.topi.arm_cpu.arm_utils import get_tiling_A, get_tiling_B_transformed
+from ..utils import get_const_tuple, traverse_inline
+from ..nn.utils import get_pad_tuple
+from .. import tag
+
+# Compute function
+def dense_gemm_compute(
+    cfg, data, weight, bias=None, out_dtype=None, transpose_a=False, transpose_b=True
+):
+    """
+    Compute dense using GeMM.
+
+    transpose_b : Optional[bool] = True
+    Whether the weight tensor is in transposed format.
+    """
+
+    if out_dtype is None:
+        out_dtype = data.dtype
+    M, K = get_const_tuple(data.shape)  # batch, in_dim
+    if bool(transpose_b):  # out_dim
+        (N, _) = get_const_tuple(weight.shape)
+    else:
+        (_, N) = get_const_tuple(weight.shape)
+
+    in_dtype = data.dtype
+
+    tile_M, tile_K_A = get_tiling_A(False, in_dtype)
+    tile_N, tile_K_B = get_tiling_B_transformed(False, out_dtype, False)
+
+    pad_M = 0
+    pad_K = 0
+    pad_N = 0
+
+    if M % tile_M != 0:
+        pad_M = tile_M - (M % tile_M)
+
+    if K % tile_K_A != 0:
+        pad_K = tile_K_A - (K % tile_K_A)
+
+    M_padded = M + pad_M
+    K_padded = K + pad_K
+    k = te.reduce_axis((0, K_padded), name="k")
+
+    pad_before = (0, 0)
+    pad_after = (pad_M, pad_K)
+
+    if pad_K != 0:
+        data = nn.pad(data, pad_before=pad_before, pad_after=pad_after, name="A_padded_K")
+    elif pad_M != 0:
+        data = nn.pad(data, pad_before=pad_before, pad_after=pad_after, name="A_padded_M")
+
+    if N % tile_N != 0:
+        pad_N = tile_N - (N % tile_N)
+    N_padded = N + pad_N
+
+    if bool(transpose_b):
+        weight = te.compute(
+            (K_padded, N_padded), lambda x, y: weight[y, x], name="weight_transposed"
+        )
+
+    if pad_K != 0 or pad_N != 0:
+        weight = nn.pad(weight, pad_before=(0, 0), pad_after=(pad_N, pad_K), name="weight_padded")
+
+    C = te.compute(
+        (M_padded, N_padded),
+        lambda x, y: te.sum(
+            data[x, k].astype(out_dtype) * weight[k, y].astype(out_dtype),
+            axis=k,
+        ).astype(out_dtype),
+        name="C",
+    )
+
+    if bias is not None:
+        C = te.compute(
+            (M_padded, N_padded),
+            lambda i, j: C[i, j] + bias[j].astype(out_dtype),
+            tag=tag.BROADCAST,
+            name="dense_biased_output",
+        )
+
+    zero = (
+        tvm.tir.const(1, C.dtype) * C[0, N_padded - 1]
+        - tvm.tir.const(1, C.dtype) * C[0, N_padded - 1]
+    )
+
+    out = te.compute(
+        (M, N), lambda x, y: (C[x, y] + zero).astype(out_dtype), name="dense_gemm_output"
+    )
+
+    return out
+
+
+def _dense_gemm_schedule_template(s, out):
+    C = out.op.input_tensors[0]
+    A = C.op.input_tensors[0]
+    in_type = A.dtype
+    y_tile_size, _ = get_tiling_B_transformed(False, in_type)
+    if C.op.name == "dense_biased_output":
+        s[C].compute_inline()
+        C = C.op.input_tensors[0]
+    x, y = s[C].op.axis
+    (k,) = s[C].op.reduce_axis
+    k_outer, k_inner = s[C].split(k, factor=4)
+    x_outer, x_inner = s[C].split(x, factor=4)
+    y_outer, y_inner = s[C].split(y, factor=y_tile_size)
+    s[C].parallel(x_outer)
+    s[C].reorder(
+        x_outer,
+        y_outer,
+        k_outer,
+        k_inner,
+        x_inner,
+        y_inner,
+    )
+    s[C].unroll(x_inner)
+    s[C].vectorize(y_inner)
+
+    return s
+
+
+def dense_gemm_schedule(cfg, outs):
+    """Schedule the dense_gemm strategy"""
+    s = te.create_schedule([x.op for x in outs])
+    out = outs[0]
+    x, y = out.op.axis
+    _, inner = s[out].split(y, 4)
+    s[out].parallel(x)
+    s[out].vectorize(inner)
+
+    def _callback(op):
+        if "dense_gemm_output" in op.name:
+            _dense_gemm_schedule_template(s, op.output(0))
+
+    traverse_inline(s, out.op, _callback)
+    return s
diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py
@@ -70,6 +70,7 @@ def matmul(
     assert (
         len(tensor_a.shape) >= 2 and len(tensor_b.shape) >= 2
     ), "1-dim matmul is not supported yet."
+
     if bias is not None:
         assert len(bias.shape) == 1
     if out_dtype is None:
@@ -229,6 +230,7 @@ def dense(
     output : tvm.te.Tensor
         2-D with shape [batch, out_dim]
     """
+
     return matmul(
         data,
         weight,
diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py
@@ -283,6 +283,46 @@ def _callback(op):
     return s
 
 
+@autotvm.register_topi_compute("dense_simple.x86")
+def dense_simple(cfg, data, weight, bias=None, out_dtype=None):
+    """Compute dense with transformed weight."""
+    if out_dtype is None:
+        out_dtype = data.dtype
+    M, K = get_const_tuple(data.shape)  # batch, in_dim
+    N, _ = get_const_tuple(weight.shape)  # out_dim
+    k = te.reduce_axis((0, K), name="k")
+    C = te.compute(
+        (M, N),
+        lambda i, j: te.sum(data[i, k] * weight[k, j]),
+        tag="dense_simple",
+    )
+    if bias is not None:
+        C = te.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST)
+
+    target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
+    build_mod = tvm.build(C, target=target)
+    buffer_size = 128
+    np_ones = np.ones((buffer_size,)).astype("float32")
+    _test_accuracy(np_ones, np_ones, build_mod)
+    return C
+
+    # Linear transformation
+    linear_output = np.dot(data, weight.T) + bias
+
+
+@autotvm.register_topi_schedule("dense_simple.x86")
+def schedule_dense_pack(cfg, outs):
+    """Create the schedule for dense_simple"""
+    s = te.create_schedule([x.op for x in outs])
+
+    def _callback(op):
+        if "dense_simple" in op.tag:
+            _schedule_dense_simple_template(cfg, s, op.output(0), outs[0])
+
+    traverse_inline(s, outs[0].op, _callback)
+    return s
+
+
 @autotvm.register_topi_compute("dense_int8.x86")
 def dense_int8(cfg, data, weight, bias=None, out_dtype=None):
     """Compute for uint8 x int8 -> int32 dense"""
diff --git a/tests/python/relay/test_dense.py b/tests/python/relay/test_dense.py
@@ -0,0 +1,49 @@
+import tvm
+from tvm import relay
+from tvm.testing import assert_allclose
+import numpy as np
+from tvm.ir.instrument import pass_instrument
+
+
+def _test_accuracy(input_values, output_values, build_mod):
+
+    dev = tvm.cpu(0)
+
+    input_buf = tvm.nd.array(input_values, device=dev)
+    rt = tvm.contrib.graph_executor.GraphModule(build_mod["default"](dev))
+    rt.set_input("data", input_buf)
+    rt.run()
+    out = rt.get_output(0)
+
+    tvm.testing.assert_allclose(out.numpy(), output_values)
+
+
+# Define input shape and data type
+data_size = (64, 64)
+data_shape = data_size  # Input shape
+data_type = "float32"  # Data type
+weight_shape = data_size
+
+# Create Relay input variable
+d = relay.var("data", shape=data_shape, dtype=data_type)
+w1 = np.ones(weight_shape, dtype=data_type)
+w = relay.const(w1)
+
+# Create Relay dense layer
+y = relay.nn.dense(d, w)
+
+# Create Relay module
+mod = tvm.IRModule()
+
+# Define a Relay function with the dense layer
+mod["main"] = relay.Function([d], y)
+
+# Compile the Relay module
+target = "llvm -mtriple=aarch64-linux-gnu -device=arm_cpu -mattr=+v8.2a,+neon"  # Example target, you can change this to your desired target
+lib = relay.build(mod, target=target, params=None)
+
+in_np = np.random.uniform(size=(data_size)).astype(data_type)
+out_np = np.array(np.matmul(in_np, w1.T))
+
+target = "llvm -mtriple=aarch64-linux-gnu -mattr=+sve"
+_test_accuracy(in_np, out_np, lib)
diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py
diff --git a/tests/python/topi/test_topi_dense.py b/tests/python/topi/test_topi_dense.py