Commit a5e883e

[RUNTIME][CLML] Fix for Softmax op for 4D tensors (#16328)
Fixed the softmax layer for 4D tensors to support the NCHW and NHWC layout types, and enabled the relevant test cases for the softmax layer.
1 parent 7ef521f commit a5e883e
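
For reference, a 4D softmax such as the one below is now eligible for CLML offload. This is a minimal illustrative sketch (not part of the commit), assuming the usual Relay front-end API:

# Hypothetical sketch: a 4D NCHW-style softmax that the relaxed CLML check now accepts.
import tvm
from tvm import relay

dtype = "float32"
a = relay.var("a", shape=(1, 5, 3, 4), dtype=dtype)  # 4D input, matches one of the new test shapes
out = relay.nn.softmax(a, axis=1)                     # softmax over the channel axis
mod = tvm.IRModule.from_expr(relay.Function([a], out))
# tvm.relay.op.contrib.clml.partition_for_clml(mod) should now keep this softmax
# in a CLML region instead of rejecting it for being more than 2D.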

3 files changed: +98 -53 lines changed

python/tvm/relay/op/contrib/clml.py

Lines changed: 2 additions & 1 deletion
@@ -437,7 +437,8 @@ def check_pad_op(extract):
 
 def check_softmax_op(extract):
     call = extract
-    if len(call.args[0].checked_type.shape) > 2:
+    # supports 2D and 4D tensors
+    if len(call.args[0].checked_type.shape) not in [2, 4]:
         return False
     return True
 
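
In effect, the predicate above reduces to a rank check on the softmax input. A standalone sketch of the same rule (illustration only, not code from the repository):

# Standalone illustration of the relaxed rank check (assumption: shapes given as plain tuples).
def softmax_rank_supported(shape):
    # 2D and 4D inputs are offloaded to CLML; any other rank falls back to TVM.
    return len(shape) in (2, 4)

assert softmax_rank_supported((1, 1000))        # 2D: supported
assert softmax_rank_supported((1, 64, 5, 32))   # 4D: supported
assert not softmax_rank_supported((1, 3, 5))    # 3D: still rejected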

src/runtime/contrib/clml/clml_runtime.cc

Lines changed: 49 additions & 13 deletions
@@ -511,6 +511,7 @@ class CLMLRuntime : public JSONRuntimeBase {
 
   /*!
    * \brief Create an CLML tensor from JSON node entry. Lookup storage map before creation.
+   *        Update input placeholder for NHWC layout
    *
    * \param nid The node index of graph JSON.
    * \param shape shape information of tensor
@@ -528,15 +529,22 @@ class CLMLRuntime : public JSONRuntimeBase {
       uint32_t eid = EntryID(nid, 0);
       node_data = data_entry_[eid]->data;
     }
+
     auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, dtype, node_data, shape);
+
     this->layer_.storage_map.insert({nid, std::make_pair(clml_tensor, node)});
 
     if ("input" == node.GetOpType()) {
       this->layer_.inputs.insert({nid, this->layer_.storage_map[nid].first});
       // Input copy placeholder Tensor
-      this->layer_.in_placeholder.insert(
-          {nid, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, dtype, node_data,
-                                           shape)});
+      if (layout == CL_TENSOR_LAYOUT_OPTIMAL_QCOM) {
+        this->layer_.in_placeholder.insert(
+            {nid, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM, dtype, node_data,
+                                             shape)});
+      } else {
+        this->layer_.in_placeholder.insert(
+            {nid, MakeCLMLTensorFromJSONNode(node, layout, dtype, node_data, shape)});
+      }
     }
 
     return clml_tensor;
@@ -559,6 +567,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       const auto& node = nodes_[nid];
       if ("nn.dense" == node.GetOpName()) CreateDenseLayerTensor(&layer_, node, nid);
       if ("nn.batch_matmul" == node.GetOpName()) CreateBatchMatmulLayerTensor(&layer_, node, nid);
+      if ("nn.softmax" == node.GetOpName()) CreateSoftmaxLayerTensor(&layer_, node, nid);
     }
 
     for (nid = 0; nid < nodes_.size(); ++nid) {
@@ -1092,6 +1101,37 @@ class CLMLRuntime : public JSONRuntimeBase {
     return;
   }
 
+  /*!
+   * \brief Create a Softmax layer Tensors with supported layout.
+   * \param layer The CLML layer to build. Containing inputs, outputs and the CLML function.
+   * \param node The JSON representation of the operator.
+   * \param nid The node index of JSON graph node, which points to this operator.
+   */
+
+  void CreateSoftmaxLayerTensor(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
+    cl_ml_tensor_layout_qcom layout;
+    cl_int result = 0;
+    cl_ml_op_qcom op = nullptr;
+    DLDataType tvm_dtype = node.GetOpDataType()[0];
+    cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+    auto out_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
+    int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
+    // enabling NHWC layout && NCHW layout for 4D, basis the axis value
+    if (out_dims.h >= 1 && out_dims.w >= 1) {
+      if (axis == 3 || axis == -1) {
+        layout = CL_TENSOR_LAYOUT_NHWC_QCOM;
+      } else {
+        layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
+      }
+    } else {  // default layout for 2D
+      layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
+    }
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
+
+    return;
+  }
+
   /*!
    * \brief Create a SoftMax layer.
    *
@@ -1100,24 +1140,20 @@ class CLMLRuntime : public JSONRuntimeBase {
    * \param nid The node index of JSON graph node, which points to this operator.
    */
   void CreateSoftMaxLayer(CachedLayer* layer, const JSONGraphNode& node, size_t nid) {
+    cl_ml_tensor_layout_qcom layout;
+    cl_softmax_mode_qcom mode = CL_SOFTMAX_MODE_SPATIAL_QCOM;
     cl_int result = 0;
     cl_ml_op_qcom op = nullptr;
     DLDataType tvm_dtype = node.GetOpDataType()[0];
     cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
     cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype, cl_dtype);
-    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
-                                             CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
-    auto out_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
-    auto output = MakeCLMLTensorFromJSONEntry(nid, {out_dims.n, out_dims.c, 1, 1},
-                                              CL_TENSOR_LAYOUT_OPTIMAL_QCOM, cl_dtype);
-
-    cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM,
-                                               CL_SOFTMAX_MODE_INSTANCE_QCOM, cl_arithmetic_mode};
-
+    auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
+    auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {}, layout, cl_dtype);
+    cl_ml_op_softmax_desc_qcom softmax_desc = {CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM, mode,
+                                               cl_arithmetic_mode};
     result = CLML_INTF->clCreateMLOpSoftmaxQCOM(CLML_CTX, nullptr, &softmax_desc, input->tensor,
                                                 output->tensor, &op, layer_.tuning_cache);
     ICHECK(op && result == CL_SUCCESS) << "SoftMax Error:" << result;
-
     layer->function.push_back(op);
     return;
   }
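
The layout decision in CreateSoftmaxLayerTensor can be read as a small rule; here is a hedged Python paraphrase of the C++ branch above (illustration only, the function name is made up):

# Python paraphrase of the layout selection in CreateSoftmaxLayerTensor (illustrative, not runtime code).
def choose_softmax_layout(h, w, axis):
    # 4D tensors with H and W dimensions: NHWC when softmax runs over the last axis,
    # NCHW otherwise; everything else keeps the default optimal layout.
    if h >= 1 and w >= 1:
        return "NHWC" if axis in (3, -1) else "NCHW"
    return "OPTIMAL"

assert choose_softmax_layout(64, 100, 3) == "NHWC"   # matches the new NHWC test cases
assert choose_softmax_layout(64, 100, 1) == "NCHW"   # matches the new NCHW test cases

Because this tensor-creation pass runs before CreateSoftMaxLayer, the later MakeCLMLTensorFromJSONEntry calls find the softmax tensors already registered in the storage map with the chosen layout, which appears to be why CreateSoftMaxLayer no longer hard-codes CL_TENSOR_LAYOUT_OPTIMAL_QCOM.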

tests/python/contrib/test_clml/test_ops.py

Lines changed: 47 additions & 39 deletions
@@ -280,9 +280,9 @@ def test_conv2d(remote, dtype, target, trials, executor_type):
         has_activation=composite[2],
     )
     outputs = _build_and_run_network(remote, func, params, inputs, target, executor_type)
-    out_rtol = 1e-1 if dtype == "float16" else 1e-5
+    out_tol = 1e-1 if dtype == "float16" else 1e-5
     tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
     )
     args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels)
     exp_codegen = _get_conv_expected_codegen(
@@ -373,9 +373,9 @@ def test_conv2d_transpose(remote, dtype, target, trials, executor_type):
     func = relay.Function([x, w], y)
     mod = IRModule.from_expr(func)
     outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-    out_rtol = 1e-1 if dtype == "float16" else 1e-5
+    out_tol = 1e-1 if dtype == "float16" else 1e-5
     tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
     )
     args = (
         dshape,
@@ -425,9 +425,9 @@ def test_batchnorm(remote, dtype, target, trials, executor_type):
         "a": input_arr,
     }
     outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-    out_rtol = 1e-3 if dtype == "float16" else 1e-5
+    out_tol = 1e-3 if dtype == "float16" else 1e-5
     tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
     )
     exp_codegen = [
         {
@@ -485,9 +485,9 @@ def test_concat(remote, dtype, target, trials, executor_type):
     func = relay.concatenate((a, b), axis=1)
 
     outputs = _build_and_run_network(remote, func, params, inputs, target, executor_type)
-    out_rtol = 1e-2 if dtype == "float16" else 1e-5
+    out_tol = 1e-2 if dtype == "float16" else 1e-5
     tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
     )
 
     exp_codegen = [
@@ -601,9 +601,9 @@ def test_pool(remote, dtype, target, trials, executor_type):
         func = relay.nn.avg_pool2d(a, pool_size=pool_size, strides=stride, padding=padding)
 
     outputs = _build_and_run_network(remote, func, params, inputs, target, executor_type)
-    out_rtol = 1e-2 if dtype == "float16" else 1e-5
+    out_tol = 1e-2 if dtype == "float16" else 1e-5
     tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
     )
     args = (input_shape, pool_size, stride, padding, pooling_type, dtype)
     exp_codegen = _get_pool_expected_codegen(*args)
@@ -690,9 +690,9 @@ def _get_model(x_shape, k_shape, has_bias=False):
     def _verify(out, params, inputs, exp_codegen):
         mod = IRModule.from_expr(out)
         outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_rtol = 1e-1 if dtype == "float16" else 1e-5
+        out_tol = 1e-1 if dtype == "float16" else 1e-5
         tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
         )
         verify_codegen(remote, mod, params, exp_codegen, target)
 
@@ -718,9 +718,9 @@ def _get_model(a_shape, b_shape, op_func):
    def _verify(out, params, inputs):
        mod = IRModule.from_expr(out)
        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_rtol = 1e-2 if dtype == "float16" else 1e-5
+        out_tol = 1e-2 if dtype == "float16" else 1e-5
        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
        )
        exp_codegen = [
            {
@@ -776,9 +776,9 @@ def _get_model(a_shape, op):
    def _verify(out, params, inputs):
        mod = IRModule.from_expr(out)
        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_rtol = 1e-2 if dtype == "float16" else 1e-5
+        out_tol = 1e-2 if dtype == "float16" else 1e-5
        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
        )
 
        exp_codegen = [
@@ -823,12 +823,11 @@ def _get_model(a_shape, block_size):
    def _verify(out, params, inputs):
        mod = IRModule.from_expr(out)
        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_rtol = 1e-2 if dtype == "float16" else 1e-5
+        out_tol = 1e-2 if dtype == "float16" else 1e-5
        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
        )
 
-        # Check to make sure these ops are offloaded to CLML instead of TVM.
        exp_codegen = [
            {
                "attrs": {
@@ -877,12 +876,11 @@ def _get_model(a_shape, scale, align_corners):
    def _verify(out, params, inputs):
        mod = IRModule.from_expr(out)
        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_rtol = 1e-2 if dtype == "float16" else 1e-5
+        out_tol = 1e-2 if dtype == "float16" else 1e-5
        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
        )
 
-        # Check to make sure these ops are offloaded to CLML instead of TVM.
        exp_codegen = [
            {
                "attrs": {
@@ -944,12 +942,11 @@ def _get_model(a_shape, b_shape, a_transpose, b_transpose):
    def _verify(out, params, inputs):
        mod = IRModule.from_expr(out)
        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_rtol = 1e-1 if dtype == "float16" else 1e-5
+        out_tol = 1e-1 if dtype == "float16" else 1e-5
        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
        )
 
-        # Check to make sure these ops are offloaded to CLML instead of TVM.
        exp_codegen = [
            {
                "attrs": {
@@ -1026,20 +1023,30 @@ def _get_model(a_shape, axis):
        params = {}
        return out, params, inputs, axis
 
-    def _verify(out, params, inputs, axis):
+    def _verify(out, params, inputs, axis, out_tol):
        mod = IRModule.from_expr(out)
        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_rtol = 1e-1 if dtype == "float16" else 1e-5
        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+            outputs[0].asnumpy(), outputs[1].numpy(), rtol=out_tol, atol=out_tol
        )
        args = (inputs, dtype, outputs[0].shape, axis)
        exp_codegen = _get_softmax_exp_codegen(*args)
        verify_codegen(remote, mod, params, exp_codegen, target)
 
-    _verify(*(_get_model((1, 5), 1)))
-    _verify(*(_get_model((1, 1000), 1)))
-    _verify(*(_get_model((1, 3), 1)))
+    # 2D Tensor TEST CASES
+    _verify(*(_get_model((1, 5), 1)), 1e-3)
+    _verify(*(_get_model((1, 16), 1)), 1e-3)
+    _verify(*(_get_model((1, 1000), -1)), 1e-3)
+
+    # 4D Tensor TEST CASES layout = NCHW
+    _verify(*(_get_model((1, 100, 64, 100), 1)), 1e-3)
+    _verify(*(_get_model((1, 64, 64, 64), 1)), 1e-3)
+    _verify(*(_get_model((1, 5, 3, 4), 1)), 1e-3)
+
+    # 4D Tensor TEST CASES layout = NHWC
+    _verify(*(_get_model((1, 64, 100, 100), 3)), 1e-1)
+    _verify(*(_get_model((1, 100, 100, 100), 3)), 1e-1)
+    _verify(*(_get_model((1, 64, 5, 32), -1)), 1e-1)
 
 
 @pytest.mark.parametrize("dtype", ["float32", "float16"])
@@ -1066,9 +1073,9 @@ def _verify(in_shape, scale_h, scale_w):
        )
        mod = IRModule.from_expr(func)
        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_rtol = 1e-2 if dtype == "float16" else 1e-5
+        out_tol = 1e-2 if dtype == "float16" else 1e-5
        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
        )
        exp_codegen = [
            {
@@ -1124,9 +1131,9 @@ def _verify(shape, newshape):
        params = {}
        mod = IRModule.from_expr(out)
        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_rtol = 1e-3 if dtype == "float16" else 1e-5
+        out_tol = 1e-3 if dtype == "float16" else 1e-5
        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
        )
        exp_codegen = [
            {
@@ -1223,9 +1230,9 @@ def test_pool_global(remote, dtype, target, executor_type, trials):
    func = relay.nn.global_avg_pool2d(a)
    mod = IRModule.from_expr(func)
    outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-    out_rtol = 1e-3 if dtype == "float16" else 1e-5
+    out_tol = 1e-3 if dtype == "float16" else 1e-5
    tvm.testing.assert_allclose(
-        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+        outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
    )
    args = (input_shape, pooling_type, dtype, outputs[0].shape)
    exp_codegen = _get_pool_global_expected_codegen(*args)
@@ -1241,6 +1248,7 @@ def _get_model(a_shape):
        # Defined the test case with unary operator
        # Single batch_flatten op is failing in native OpenCL
        # Empty TVM mod in VM doesn't pick appropriate cross compiler
+        np.random.seed(0)
        out = relay.nn.relu(a)
        out = relay.nn.batch_flatten(out)
        inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1, a_shape).astype(dtype))}
@@ -1250,9 +1258,9 @@ def _get_model(a_shape):
    def _verify(out, params, inputs):
        mod = IRModule.from_expr(out)
        outputs = _build_and_run_network(remote, mod, params, inputs, target, executor_type)
-        out_rtol = 1e-3 if dtype == "float16" else 1e-5
+        out_tol = 1e-3 if dtype == "float16" else 1e-5
        tvm.testing.assert_allclose(
-            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol, atol=out_rtol
+            outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
        )
        exp_codegen = [
            {
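
For intuition on the comparison the updated _verify performs, here is a small self-contained NumPy check in the spirit of the new 4D cases; the shape and tolerance values come from the test list above, while the reference softmax itself is an illustrative assumption:

# Illustrative NumPy reference check (not part of the test suite).
import numpy as np

def ref_softmax(x, axis):
    # Numerically stable softmax reference.
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

x = np.random.uniform(-1, 1, (1, 64, 5, 32)).astype("float32")
fp32 = ref_softmax(x, axis=-1)
fp16 = ref_softmax(x.astype("float16"), axis=-1).astype("float32")
# _verify asserts rtol == atol == out_tol; the NHWC cases above use the looser 1e-1 bound.
np.testing.assert_allclose(fp16, fp32, rtol=1e-1, atol=1e-1)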
