Speed up CUTLASS conv2d profiling by removing host-side tensor initialization and device sync

masahi · masahi · commit a447b57ade7c · 2021-12-12T14:54:52.000+09:00
diff --git a/python/tvm/contrib/cutlass/conv2d_profiler.py b/python/tvm/contrib/cutlass/conv2d_profiler.py
@@ -81,10 +81,7 @@ def __init__(self):
     auto dilated_w = (filter_size.w() - 1) * dilation.column() + 1;
     auto h = (input_size.h() + padding.n() + padding.h() - dilated_h) / conv_stride.row() + 1;
     auto w = (input_size.w() + padding.w() + padding.c() - dilated_w) / conv_stride.column() + 1;
-    return cutlass::Tensor4DCoord(
-				  input_size.n(),
-				  h, w,
-				  filter_size.n());
+    return cutlass::Tensor4DCoord(input_size.n(), h, w, filter_size.n());
   }
 };
 
@@ -98,31 +95,6 @@ def __init__(self):
   cutlass::HostTensor<ElementOutput, typename ImplicitGemm::LayoutC> tensor_c(oshape);
   cutlass::HostTensor<ElementOutput, typename ImplicitGemm::LayoutC> tensor_ref_c(oshape);
 
-  cutlass::reference::host::TensorFillRandomUniform(
-						    tensor_a.host_view(),
-						    1,
-						    ElementInputA(7),
-						    ElementInputA(-8),
-						    0);
-
-  cutlass::reference::host::TensorFillRandomUniform(
-						    tensor_b.host_view(),
-						    1,
-						    ElementInputB(7),
-						    ElementInputB(-8),
-						    0);
-
-  cutlass::reference::host::TensorFill(
-				       tensor_c.host_view());
-
-  cutlass::reference::host::TensorFill(
-				       tensor_ref_c.host_view());
-
-  tensor_a.sync_device();
-  tensor_b.sync_device();
-  tensor_c.sync_device();
-  tensor_ref_c.sync_device();
-
   cutlass::conv::Conv2dProblemSize problem_size(
 						options.input_size,
 						options.filter_size,
@@ -137,12 +109,12 @@ def __init__(self):
   using ElementComputeEpilogue = typename ImplicitGemm::ElementCompute;
   typename ImplicitGemm::Arguments arguments{
     problem_size,
-      tensor_a.device_ref(),
-      tensor_b.device_ref(),
-      tensor_c.device_ref(),
-      tensor_c.device_ref(),
-      {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
-      };
+    tensor_a.device_ref(),
+    tensor_b.device_ref(),
+    tensor_c.device_ref(),
+    tensor_c.device_ref(),
+    {ElementComputeEpilogue(1), ElementComputeEpilogue(0)},
+  };
 
   ImplicitGemm implicit_gemm_op;
   size_t workspace_size = implicit_gemm_op.get_workspace_size(arguments);