
Commit dd42ef2

masahi and comaniac authored
[CUTLASS] Add conv2d profiler (#9737)
* Add cutlass conv2d profiler

  Squashed commits (Author: Masahiro Masuda <[email protected]>):

  - 1c0bbb2 (Sun Dec 12 18:29:03 2021 +0900): fix lint
  - 463574c (Sun Dec 12 17:28:38 2021 +0900): fixed conv2d check
  - 588c5ab (Sun Dec 12 15:05:27 2021 +0900): update test
  - a447b57 (Sun Dec 12 14:54:52 2021 +0900): speed up profiling by removing initialization
  - 93cd039 (Sun Dec 12 08:26:29 2021 +0900): fixed nhwc cudnn depthwise conv
  - 6db7172 (Sat Dec 11 15:39:05 2021 +0900): add cache
  - f7d17a1 (Sat Dec 11 15:05:38 2021 +0900): removed im2col profiling for conv2d
  - b724f44 (Fri Dec 10 22:57:54 2021 +0900): black
  - fe4687b (Fri Dec 10 22:49:13 2021 +0900): fixed cmd argument
  - ab114f5 (Fri Dec 10 22:22:19 2021 +0900): conv2d profiler working
  - 49ee61f (Fri Dec 10 20:26:15 2021 +0900): add conv2d profiler
  - 49e2c89 (Sun Dec 12 08:03:36 2021 +0900): do not offload depthwise conv2d
  - cd83677 (Fri Dec 10 13:20:01 2021 +0900): lint fix
  - 870823c (Fri Dec 10 12:54:38 2021 +0900): add comment on IC == 3 case
  - 6b780db (Fri Dec 10 12:48:33 2021 +0900): check align on N dim
  - 308c4da (Fri Dec 10 12:34:42 2021 +0900): fixed check functions for fused cases, run infer type before MergeComposite
  - 8d6a1bf (Fri Dec 10 12:10:59 2021 +0900): test IC=3 convolution
  - ffce47d (Fri Dec 10 12:10:16 2021 +0900): use align1 kernel for unusual channel cases (IC = 3 etc.)
  - 6cdf205 (Fri Dec 10 12:06:56 2021 +0900): add dtype and layout check in pattern match
  - 7743cc6 (Fri Dec 10 10:40:53 2021 +0900): add sm75 kernels to sm80 profiling
  - efceccb (Fri Dec 10 10:40:42 2021 +0900): skip legalize when batch size is dynamic
  - 65fbc0a (Fri Dec 10 10:36:36 2021 +0900): bug fix in im2col encoding

* minor fix
* lint fix
* allow autotvm NCHW depthwise conv2d schedule even if -libs=cudnn
* Update python/tvm/contrib/cutlass/gen_conv2d.py (Co-authored-by: Cody Yu <[email protected]>)
* simplify processing of profiler outputs
* simplify further
* fix runtime check

Co-authored-by: Cody Yu <[email protected]>
1 parent 21abb6e commit dd42ef2
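For context, a minimal sketch of how the conv2d profiler added in this commit is driven end to end. The entry point tune_cutlass_kernels is the function modified in build.py below; the import path and the returned values follow TVM's CUTLASS tests and are assumptions, not part of this diff:

    # Hypothetical usage sketch: `mod` is assumed to be a Relay module already
    # partitioned for the "cutlass" target by the caller.
    from tvm.contrib.cutlass import tune_cutlass_kernels

    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,                       # partitioned Relay module (assumed to exist)
        sm=80,                     # compute capability; 75 and 80 have generator tables
        profile_all=True,          # benchmark every applicable kernel, not just the first hit
        use_multiprocessing=True,  # compile the generated profiler binaries in parallel
    )

With this change, conv2d offloads are profiled with a dedicated implicit-GEMM profiler binary instead of reusing the GEMM profiler on an im2col-mapped problem.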

7 files changed: +265 -50 lines changed

python/tvm/contrib/cutlass/build.py

Lines changed: 9 additions & 3 deletions
@@ -184,7 +184,9 @@ def handle_conv2d(
     op_type,
     d_shape,
     w_shape,
-    out_shape,
+    padding,
+    strides,
+    dilation,
     out_dtype,
     profile_all,
     use_multiprocessing,
@@ -197,7 +199,9 @@ def handle_conv2d(
     out = cutlass_profiler.profile(
         d_shape,
         w_shape,
-        out_shape,
+        padding,
+        strides,
+        dilation,
         out_dtype,
         profile_all=profile_all,
         use_multiprocessing=use_multiprocessing,
@@ -278,7 +282,9 @@ def tune_cutlass_kernels(mod, sm, profile_all=True, use_multiprocessing=False, t
                 op_type,
                 arg0_shape,
                 arg1_shape,
-                annotator.signature["ret_shape"],
+                annotator.op_attrs.padding,
+                annotator.op_attrs.strides,
+                annotator.op_attrs.dilation,
                 out_dtype,
                 profile_all,
                 use_multiprocessing,
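The profiler now receives the conv2d attributes (padding, strides, dilation) instead of a precomputed output shape, since the generated C++ profiler derives the output size itself (see Options::output_size() in the new file below). As a purely illustrative sketch, the same arithmetic in Python (helper name and variables are hypothetical, not part of this change):

    def conv2d_output_hw(h, w, r, s, padding, strides, dilation):
        # Effective filter extent after dilation, then the standard conv2d output size.
        dilated_r = (r - 1) * dilation[0] + 1
        dilated_s = (s - 1) * dilation[1] + 1
        out_h = (h + 2 * padding[0] - dilated_r) // strides[0] + 1
        out_w = (w + 2 * padding[1] - dilated_s) // strides[1] + 1
        return out_h, out_w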
python/tvm/contrib/cutlass/conv2d_profiler.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+# pylint: disable=import-outside-toplevel, invalid-name
+"""Instantiate a C++ source for profiling CUTLASS kernels."""
+
+
+class Conv2dProfilerEmitter(object):
+    """Emit a C++ source for profiling CUTLASS kernels."""
+
+    def __init__(self):
+        from jinja2 import Template
+
+        self.template = Template(
+            """
+#include <iostream>
+#include "cutlass/cutlass.h"
+#include "cutlass/conv/kernel/default_conv2d_fprop.h"
+#include "cutlass/conv/device/implicit_gemm_convolution.h"
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+
+#define CUTLASS_CHECK(status)                                                               \\
+  {                                                                                         \\
+    cutlass::Status error = status;                                                         \\
+    if (error != cutlass::Status::kSuccess) {                                               \\
+      std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) << " at: " << __LINE__ \\
+                << std::endl;                                                               \\
+      exit(EXIT_FAILURE);                                                                   \\
+    }                                                                                       \\
+  }
+
+{{OperatorDef}}
+using ImplicitGemm = cutlass::conv::device::ImplicitGemmConvolution<{{OperatorName}}>;
+
+struct Options {
+  cutlass::Tensor4DCoord input_size;
+  cutlass::Tensor4DCoord filter_size;
+  cutlass::Tensor4DCoord padding;
+  cutlass::MatrixCoord conv_stride;
+  cutlass::MatrixCoord dilation;
+
+  void parse(int argc, char const **args) {
+    cutlass::CommandLine cmd(argc, args);
+    cmd.get_cmd_line_argument("n", input_size.n());
+    cmd.get_cmd_line_argument("h", input_size.h());
+    cmd.get_cmd_line_argument("w", input_size.w());
+    cmd.get_cmd_line_argument("c", input_size.c());
+    cmd.get_cmd_line_argument("k", filter_size.n());
+    cmd.get_cmd_line_argument("r", filter_size.h());
+    cmd.get_cmd_line_argument("s", filter_size.w());
+    int pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w;
+    cmd.get_cmd_line_argument("pad_h", pad_h);
+    cmd.get_cmd_line_argument("pad_w", pad_w);
+    cmd.get_cmd_line_argument("stride_h", stride_h);
+    cmd.get_cmd_line_argument("stride_w", stride_w);
+    cmd.get_cmd_line_argument("dilation_h", dilation_h);
+    cmd.get_cmd_line_argument("dilation_w", dilation_w);
+    filter_size.c() = input_size.c();
+    padding = {pad_h, pad_h, pad_w, pad_w};
+    conv_stride = {stride_h, stride_w};
+    dilation = {dilation_h, dilation_w};
+  }
+
+  cutlass::Tensor4DCoord output_size() const {
+    auto dilated_h = (filter_size.h() - 1) * dilation.row() + 1;
+    auto dilated_w = (filter_size.w() - 1) * dilation.column() + 1;
+    auto h = (input_size.h() + padding.n() + padding.h() - dilated_h) / conv_stride.row() + 1;
+    auto w = (input_size.w() + padding.w() + padding.c() - dilated_w) / conv_stride.column() + 1;
+    return cutlass::Tensor4DCoord(input_size.n(), h, w, filter_size.n());
+  }
+};
+
+double profile_convolution(Options const &options) {
+  using ElementOutput = typename ImplicitGemm::ElementC;
+  using ElementInputA = typename ImplicitGemm::ElementA;
+  using ElementInputB = typename ImplicitGemm::ElementB;
+  auto oshape = options.output_size();
+  cutlass::HostTensor<ElementInputA, typename ImplicitGemm::LayoutA> tensor_a(options.input_size);
+  cutlass::HostTensor<ElementInputB, typename ImplicitGemm::LayoutB> tensor_b(options.filter_size);
+  cutlass::HostTensor<ElementOutput, typename ImplicitGemm::LayoutC> tensor_c(oshape);
+  cutlass::HostTensor<ElementOutput, typename ImplicitGemm::LayoutC> tensor_ref_c(oshape);
+
+  cutlass::conv::Conv2dProblemSize problem_size(
+      options.input_size,
+      options.filter_size,
+      options.padding,
+      options.conv_stride,
+      options.dilation,
+      options.output_size(),
+      cutlass::conv::Mode::kCrossCorrelation,
+      1
+  );
+
+  using ElementComputeEpilogue = typename ImplicitGemm::ElementCompute;
+  typename ImplicitGemm::Arguments arguments{
+      problem_size,
+      tensor_a.device_ref(),
+      tensor_b.device_ref(),
+      tensor_c.device_ref(),
+      tensor_c.device_ref(),
+      {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
+  };
+
+  ImplicitGemm implicit_gemm_op;
+  size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+  auto status = implicit_gemm_op.can_implement(arguments);
+  CUTLASS_CHECK(status);
+
+  status = implicit_gemm_op.initialize(arguments, workspace.get());
+  CUTLASS_CHECK(status);
+  status = implicit_gemm_op();
+  CUTLASS_CHECK(status);
+
+  cudaEvent_t events[2];
+  for (auto & event : events) {
+    cudaEventCreate(&event);
+  }
+  cudaEventRecord(events[0]);
+
+  for (int iteration = 0; iteration < 100; ++iteration) {
+    auto status = implicit_gemm_op();
+    CUTLASS_CHECK(status);
+  }
+
+  cudaEventRecord(events[1]);
+  cudaEventSynchronize(events[1]);
+  float runtime_ms = 0;
+  cudaEventElapsedTime(&runtime_ms, events[0], events[1]);
+
+  for (auto event : events) {
+    (void)cudaEventDestroy(event);
+  }
+  return double(runtime_ms) / 100.0;
+}
+
+int main(int argc, char const **args) {
+  Options options;
+  options.parse(argc, args);
+  std::cout << profile_convolution(options) << std::endl;
+  return 0;
+}
+"""
+        )
+
+    def emit(self, op_def, op_name):
+        src = self.template.render(OperatorDef=op_def, OperatorName=op_name)
+        return src
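A minimal sketch of how this emitter is used. The kernel definition string below is a placeholder; in the real flow it is produced by EmitConv2dInstance and passed in by gen_conv2d.py, and the module path is assumed from the import in that file:

    from tvm.contrib.cutlass.conv2d_profiler import Conv2dProfilerEmitter

    # Placeholder kernel definition; gen_conv2d.py passes the emitted CUTLASS instance here.
    op_def = "/* CUTLASS Conv2dFprop instantiation aliased to MyConv2dKernel */"
    src = Conv2dProfilerEmitter().emit(op_def, "MyConv2dKernel")
    # `src` is a standalone CUDA source; ProfilerEngine compiles it into an executable
    # that takes --n/--h/--w/--c/--k/--r/--s/--pad_*/--stride_*/--dilation_* arguments
    # and prints the average kernel runtime in milliseconds over 100 iterations.
    print(src[:120])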

python/tvm/contrib/cutlass/gen_conv2d.py

Lines changed: 64 additions & 14 deletions
@@ -16,8 +16,14 @@
 # under the License.
 # pylint: disable=invalid-name
 """Conv2d kernel generator and profiler for CUTLASS."""
+import re
 from .conv2d_operation import Conv2dOperation, EmitConv2dInstance
 from .gen_gemm import CutlassGemmProfiler
+from .conv2d_profiler import Conv2dProfilerEmitter
+from .gen_tensor_op import (
+    ProfilerEngine,
+    GENERATOR_FUNC_TABLE,
+)
 from .library import (
     EpilogueFunctor,
     SwizzlingFunctor,
@@ -39,6 +45,7 @@ def create_conv2d_operator(
     ret = []
 
     kernel_emitter = EmitConv2dInstance()
+    profiler_emitter = Conv2dProfilerEmitter()
 
     element_a, element_b, element_c, element_epilogue = data_type
     iterator_algorithms = [IteratorAlgorithm.Optimized]
@@ -72,9 +79,9 @@ def create_conv2d_operator(
                 swizzling_functor_,
             )
 
-            # TODO(masahi): Add profiler source here
             op_entry["opdef"] = kernel_emitter.emit(op)
             op_entry["op"] = op
+            op_entry["src"] = profiler_emitter.emit(op_entry["opdef"], op.procedural_name())
             op_entry["name"] = op.procedural_name()
             op_entry["runtime"] = 9999999
 
@@ -113,6 +120,9 @@ class CutlassConv2DProfiler:
     def __init__(self, sm, cutlass_path, binary_path):
         self.gemm_profiler = CutlassGemmProfiler(sm, cutlass_path, binary_path)
         self.sm = sm
+        assert sm in GENERATOR_FUNC_TABLE, "sm%d not supported yet." % sm
+        self.engine = ProfilerEngine(sm, cutlass_path, binary_path)
+        self.cache = {}
 
     def get_default(self, out_dtype):
         gemm_profile_result = self.gemm_profiler.get_default(out_dtype)
@@ -121,27 +131,67 @@ def get_default(self, out_dtype):
         data_type = gemm_profile_result["data_type"]
         return create_conv2d_operator([tile_description], data_type, [alignment])[0]
 
+    def check_align(self, op_name, C, K):
+        """Filter out kernels that cannot be supported."""
+        aligns = re.findall(r"align[1|2|4|8]", op_name)
+        assert len(aligns) == 1
+        align = int(aligns[0][-1])
+        return all([dim % align == 0 for dim in [C, K]])
+
     def profile(
-        self, d_shape, w_shape, out_shape, out_dtype, profile_all=True, use_multiprocessing=False
+        self,
+        d_shape,
+        w_shape,
+        padding,
+        stride,
+        dilation,
+        out_dtype,
+        profile_all=True,
+        use_multiprocessing=False,
     ):
         """Profile and select the best kernel from candidate kernels.
         If profile_all is False, return immediately after the first applicable kernel is found.
         If use_multiprocessing is True, compile all profiler executables in parallel.
         """
-        B, _, _, IC = d_shape
+        N, H, W, IC = d_shape
         OC, R, S, _ = w_shape
-        _, P, Q, _ = out_shape
+        workload = (
+            N,
+            H,
+            W,
+            IC,
+            OC,
+            R,
+            S,
+            padding[0],
+            padding[1],
+            stride[0],
+            stride[1],
+            dilation[0],
+            dilation[1],
+        )
 
-        M = B * P * Q
-        N = OC
-        K = R * S * IC
+        if workload in self.cache:
+            return self.cache[workload]
 
-        gemm_profile_result = self.gemm_profiler.profile(
-            M, N, K, out_dtype, profile_all=profile_all, use_multiprocessing=use_multiprocessing
-        )
+        ops = GENERATOR_FUNC_TABLE[self.sm](out_dtype, op_creator=create_conv2d_operator)
+        ops = list(filter(lambda op: self.check_align(op["name"], IC, OC), ops))
 
-        tile_description = gemm_profile_result["tile_description"]
-        alignment = gemm_profile_result["alignment"]
-        data_type = gemm_profile_result["data_type"]
+        if profile_all:
+            self.engine.compile_all(ops, use_multiprocessing)
 
-        return create_conv2d_operator([tile_description], data_type, [alignment])[0]
+        args = (
+            "--n=%d --h=%d --w=%d --c=%d --k=%d --r=%d --s=%d --pad_h=%d --pad_w=%d "
+            "--stride_h=%d --stride_w=%d --dilation_h=%d --dilation_w=%d"
+        ) % workload
+
+        for op in ops:
+            out = self.engine.evaluate(op, args.split(" "))
+            op["runtime"] = out
+            if out < float("inf") and not profile_all:
+                self.cache[workload] = op
+                return op
+
+        output = min(ops, key=lambda i: i["runtime"])
+        self.cache[workload] = output
+        return output
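The profile method above keys its cache on the full conv2d workload and passes the same values to the generated profiler binary on its command line. A small illustrative sketch of that argument construction (the concrete numbers are made up):

    # (N, H, W, IC, OC, R, S, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w)
    workload = (8, 32, 32, 64, 128, 3, 3, 1, 1, 1, 1, 1, 1)
    args = (
        "--n=%d --h=%d --w=%d --c=%d --k=%d --r=%d --s=%d --pad_h=%d --pad_w=%d "
        "--stride_h=%d --stride_w=%d --dilation_h=%d --dilation_w=%d"
    ) % workload
    # Split into argv-style tokens for ProfilerEngine.evaluate, one run per candidate kernel.
    print(args.split(" "))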

python/tvm/contrib/cutlass/gen_gemm.py

Lines changed: 7 additions & 17 deletions
@@ -22,8 +22,7 @@
 from .gemm_profiler import GemmProfilerEmitter
 from .gen_tensor_op import (
     ProfilerEngine,
-    generate_sm75_tensor_op_1688,
-    generate_sm80_tensor_op_16816,
+    GENERATOR_FUNC_TABLE,
 )
 from .library import (
     EpilogueFunctor,
@@ -132,12 +131,6 @@ def create_gemm_operator(
     return ret
 
 
-GENERATOR_FUNC_TABLE = {
-    75: generate_sm75_tensor_op_1688,
-    80: generate_sm80_tensor_op_16816,
-}
-
-
 # TODO(masahi): A sensible way to pick reasonable default kernels
 DEFAULT_KERNELS = {
     75: {
@@ -199,19 +192,16 @@ def profile(
         )
         ops = list(filter(lambda op: self.check_align(op["name"], M, N, K), ops))
 
-        for op in ops:
-            op["runtime"] = -1
-
         if profile_all:
             self.engine.compile_all(ops, use_multiprocessing)
 
         for op in ops:
             out = self.engine.evaluate(op, [M, N, K])
             op["runtime"] = out
-            if out > 0 and profile_all is False:
-                break
+            if out < float("inf") and not profile_all:
+                self.cache[(M, N, K)] = op
+                return op
 
-        valid_ops = filter(lambda op: op["runtime"] > 0, ops)
-        output = sorted(valid_ops, key=lambda i: i["runtime"])
-        self.cache[(M, N, K)] = output[0]
-        return output[0]
+        output = min(ops, key=lambda i: i["runtime"])
+        self.cache[(M, N, K)] = output
+        return output
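Both the GEMM and the new conv2d profiler now share one convention for kernel selection: a kernel that could not be compiled or run is expected to carry a runtime of float("inf") (implied by the out < float("inf") check), so min() over the candidates skips it and the old -1 sentinel plus post-filtering disappear. A tiny self-contained illustration with made-up entries:

    ops = [
        {"name": "align8_kernel", "runtime": float("inf")},  # e.g. not applicable / failed to run
        {"name": "align4_kernel", "runtime": 0.42},
        {"name": "align1_kernel", "runtime": 0.71},
    ]
    best = min(ops, key=lambda op: op["runtime"])
    assert best["name"] == "align4_kernel"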
