Commit 7d5ef84

[CUDA] Various int8 fix (cublas, cutlass, etc) (#10596)
* [CUTLASS] avoid tile size 256 for int8 + align1 case
* allow selecting int8 dense strategy for vulkan
* fixed cublas batch matmul for int8
* fixed int8 dense tensorcore strategy
* add cutlass conv align1 + int8 case
* support int8 mixed precision cublas bmm
* black
1 parent 4d88a45 commit 7d5ef84

File tree

7 files changed, +69 -35 lines changed


python/tvm/contrib/cutlass/gen_conv2d.py

Lines changed: 5 additions & 0 deletions
@@ -22,6 +22,7 @@
 from .conv2d_profiler import Conv2dProfilerEmitter
 from .gen_tensor_op import ProfilerEngine, GENERATOR_FUNC_TABLE, EPILOGUE_MAP
 from .library import (
+    DataType,
     EpilogueFunctor,
     SwizzlingFunctor,
     TensorDescription,
@@ -133,6 +134,10 @@ def enumerate_conv2d_operators(
             B = TensorDescription(element_b, LayoutType.TensorNHWC, alignment)
             C = TensorDescription(element_c, LayoutType.TensorNHWC, alignment)
 
+            if element_c == DataType.s32 and A.alignment == 1:
+                tile.threadblock_shape[0] = min(tile.threadblock_shape[0], 128)
+                tile.threadblock_shape[1] = min(tile.threadblock_shape[1], 128)
+
             op = Conv2dOperation(
                 conv_kind,
                 IteratorAlgorithm.Optimized,

python/tvm/contrib/cutlass/gen_gemm.py

Lines changed: 9 additions & 0 deletions
@@ -20,6 +20,7 @@
 from .gemm_profiler import GemmProfilerEmitter
 from .gen_tensor_op import ProfilerEngine, GENERATOR_FUNC_TABLE, EPILOGUE_MAP
 from .library import (
+    DataType,
     EpilogueFunctor,
     SwizzlingFunctor,
     TensorDescription,
@@ -87,6 +88,14 @@ def enumerate_gemm_operators(
             B = TensorDescription(element_b, LayoutType.ColumnMajor, alignment)
             C = TensorDescription(element_c, LayoutType.RowMajor, alignment)
 
+            if element_c == DataType.s32 and A.alignment == 1:
+                tile_description.threadblock_shape[0] = min(
+                    tile_description.threadblock_shape[0], 128
+                )
+                tile_description.threadblock_shape[1] = min(
+                    tile_description.threadblock_shape[1], 128
+                )
+
             op = GemmOperation(
                 tile_description.minimum_compute_capability,
                 tile_description,
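
Both CUTLASS generator changes above apply the same guard: when the accumulator type is int32 and the A operand only has alignment 1, the M and N threadblock tile sizes are capped at 128, so the 256-wide tiles mentioned in the commit title are never emitted. A standalone sketch of that guard follows; the function name is illustrative and not part of the generator's API:

def clamp_tile_for_int8_align1(threadblock_shape, out_is_int32, alignment):
    # Cap the M/N threadblock dimensions at 128 for the int32-accumulator,
    # alignment-1 case; the K dimension is left untouched.
    if out_is_int32 and alignment == 1:
        threadblock_shape[0] = min(threadblock_shape[0], 128)
        threadblock_shape[1] = min(threadblock_shape[1], 128)
    return threadblock_shape

print(clamp_tile_for_int8_align1([256, 128, 64], True, 1))  # -> [128, 128, 64]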

python/tvm/relay/op/strategy/cuda.py

Lines changed: 23 additions & 31 deletions
@@ -836,7 +836,7 @@ def dense_strategy_cuda(attrs, inputs, out_type, target):
     b, i = get_const_tuple(data.shape)
     o, _ = get_const_tuple(weights.shape)
     if (
-        target.kind.name == "cuda"
+        target.kind.name in ["cuda", "vulkan"]
         and data.dtype == "int8"
         and weights.dtype == "int8"
         and out_type.dtype == "int32"
@@ -860,36 +860,28 @@ def dense_strategy_cuda(attrs, inputs, out_type, target):
                 name="dense_large_batch.gpu",
                 plevel=5,
             )
-        if target.kind.name == "cuda":
-            if nvcc.have_tensorcore(target=target):
-                if (
-                    (
-                        data.dtype in ["float16", "int8", "uint8"]
-                        and (
-                            (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
-                            or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
-                            or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
-                        )
-                    )
-                    or (
-                        data.dtype in ["int4", "uint4"]
-                        and i % 32 == 0
-                        and b % 8 == 0
-                        and o % 8 == 0
-                    )
-                    or (
-                        data.dtype in ["int1", "uint1"]
-                        and i % 128 == 0
-                        and b % 8 == 0
-                        and o % 8 == 0
-                    )
-                ):
-                    strategy.add_implementation(
-                        wrap_compute_dense(topi.cuda.dense_tensorcore),
-                        wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
-                        name="dense_tensorcore.cuda",
-                        plevel=20,
+
+    if target.kind.name == "cuda":
+        if nvcc.have_tensorcore(target=target):
+            if (
+                (
+                    data.dtype in ["float16", "int8", "uint8"]
+                    and (
+                        (i % 16 == 0 and b % 16 == 0 and o % 16 == 0)
+                        or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0)
+                        or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0)
                     )
+                )
+                or (data.dtype in ["int4", "uint4"] and i % 32 == 0 and b % 8 == 0 and o % 8 == 0)
+                or (data.dtype in ["int1", "uint1"] and i % 128 == 0 and b % 8 == 0 and o % 8 == 0)
+            ):
+                strategy.add_implementation(
+                    wrap_compute_dense(topi.cuda.dense_tensorcore),
+                    wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore),
+                    name="dense_tensorcore.cuda",
+                    plevel=20,
+                )
+
     if target.kind.name == "cuda" and "cublas" in target.libs:
         strategy.add_implementation(
             wrap_compute_dense(topi.cuda.dense_cublas),
@@ -927,7 +919,7 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target):
         )
     if target.kind.name == "cuda" and "cublas" in target.libs:
         strategy.add_implementation(
-            wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas),
+            wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas, need_out_dtype=True),
             wrap_topi_schedule(topi.generic.schedule_extern),
             name="batch_matmul_cublas.cuda",
             plevel=30,
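
The first hunk is what the commit message calls "allow selecting int8 dense strategy for vulkan": the int8 dense path is now gated on target.kind.name in ["cuda", "vulkan"] instead of CUDA only. Below is a hypothetical end-to-end sketch, not part of the diff, of the kind of workload that can now take that path on a Vulkan target; shapes are arbitrary and building assumes a working Vulkan driver:

import tvm
from tvm import relay

# int8 x int8 -> int32 dense, which the updated dense_strategy_cuda can now
# route to the int8 schedule when the target is "vulkan" as well as "cuda".
data = relay.var("data", shape=(16, 64), dtype="int8")
weight = relay.var("weight", shape=(32, 64), dtype="int8")
dense = relay.nn.dense(data, weight, out_dtype="int32")
mod = tvm.IRModule.from_expr(relay.Function([data, weight], dense))
lib = relay.build(mod, target="vulkan")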

python/tvm/topi/cuda/batch_matmul.py

Lines changed: 1 addition & 1 deletion
@@ -229,7 +229,7 @@ def batch_matmul_cublas(
     b, k, n = get_const_tuple(y.shape)
     if all([isinstance(s, int) for s in [b, m, n, k]]):
         cfg.add_flop(b * m * k * n * 2)
-    return cublas.batch_matmul(x, y, transa=transpose_a, transb=transpose_b)
+    return cublas.batch_matmul(x, y, transa=transpose_a, transb=transpose_b, dtype=out_dtype)
 
 
 @autotvm.register_topi_schedule("batch_matmul_cublas.cuda")

src/runtime/contrib/cublas/cublas.cc

Lines changed: 1 addition & 1 deletion
@@ -290,7 +290,7 @@ inline void CallBatchGemmEx(TVMArgs args, TVMRetValue* ret, cublasHandle_t hdl)
   transa = IsInPlaceTransposed(A) ? !transa : transa;
   transb = IsInPlaceTransposed(B) ? !transb : transb;
 
-  ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, false)) << "Unsupported data type";
+  ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, true)) << "Unsupported data type";
   ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0)
       << "leading dimension must divide 4 for int8 gemm";
   ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0)

tests/python/contrib/test_cublas.py

Lines changed: 10 additions & 2 deletions
@@ -120,8 +120,14 @@ def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5):
 
     dev = tvm.cuda(0)
     f = tvm.build(s, [A, B, C], "cuda")
-    a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev)
-    b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev)
+
+    if "int" in in_dtype:
+        a = tvm.nd.array(np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev)
+        b = tvm.nd.array(np.random.uniform(1, 10, size=Bshape).astype(in_dtype), dev)
+    else:
+        a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev)
+        b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev)
+
     c = tvm.nd.array(np.zeros(Cshape, dtype=C.dtype), dev)
     f(a, b, c)
     tvm.testing.assert_allclose(
@@ -161,6 +167,8 @@ def test_batch_matmul():
         (16, 1024, 128), (1, 128, 236), (16, 1024, 236), "float16", "float16", rtol=1e-2
     )
 
+    verify_batch_matmul((16, 1024, 128), (16, 128, 236), (16, 1024, 236), "int8", "int32")
+
 
 if __name__ == "__main__":
     test_matmul_add()
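
The new test case drives the int8 -> int32 cuBLAS batch matmul end to end. A minimal sketch, not part of the commit, of that same path through tvm.contrib.cublas using the test's shapes; it assumes a CUDA GPU with cuBLAS available:

import tvm
from tvm import te
from tvm.contrib import cublas

batch, m, k, n = 16, 1024, 128, 236
A = te.placeholder((batch, m, k), name="A", dtype="int8")
B = te.placeholder((batch, k, n), name="B", dtype="int8")
# Passing dtype="int32" requests the mixed-precision kernel that
# CallBatchGemmEx accepts after the cublas.cc change above.
C = cublas.batch_matmul(A, B, transa=False, transb=False, dtype="int32")
s = te.create_schedule(C.op)
f = tvm.build(s, [A, B, C], "cuda")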

tests/python/contrib/test_cutlass.py

Lines changed: 20 additions & 0 deletions
@@ -725,6 +725,26 @@ def test_conv2d():
         ref_target="llvm",
     )
 
+    # align1 + int8 case
+    d_shape = (16, 3, 32, 32)
+    w_shape = (32, 3, 3, 3)
+    mod_nchw = get_conv2d_nchw(
+        d_shape, w_shape, padding, out_dtype="int32", data_dtype="uint8", weight_dtype="int8"
+    )
+
+    verify_conv2d(
+        mod_nchw,
+        mod_nchw,
+        d_shape,
+        w_shape,
+        sm=80,
+        atol=1e-5,
+        rtol=1e-5,
+        ref_target="llvm",
+        data_dtype="uint8",
+        weight_dtype="int8",
+    )
+
 
 def test_conv2d_fusion():
     d_shape = (16, 16, 32, 32)
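
For reference, the "align1 + int8 case" in this test is a conv2d whose three input channels prevent vectorized (alignment greater than 1) access in CUTLASS, combined with uint8 data, int8 weights, and int32 accumulation. A hypothetical sketch of the Relay module that get_conv2d_nchw builds for these arguments; the padding value is assumed from the surrounding test:

import tvm
from tvm import relay

data = relay.var("data", shape=(16, 3, 32, 32), dtype="uint8")
weight = relay.var("weight", shape=(32, 3, 3, 3), dtype="int8")
conv = relay.nn.conv2d(
    data, weight, padding=(1, 1), channels=32, kernel_size=(3, 3), out_dtype="int32"
)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], conv))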
