 #include <executorch/runtime/kernel/kernel_runtime_context.h>
 #include <executorch/runtime/kernel/thread_parallel_interface.h>

+#ifdef ET_USE_PYTORCH_HEADERS
+#include <ATen/cpu/vec/vec.h>
+#endif // ET_USE_PYTORCH_HEADERS
+
 #include <array>
 #include <utility>

@@ -58,6 +62,38 @@ template <typename CTYPE_COMMON, typename Op, typename... Args>
 using op_call_result =
     std::invoke_result_t<Op, ignore_first_yield_second<Args, CTYPE_COMMON>...>;

+#ifdef ET_USE_PYTORCH_HEADERS
+template <typename T>
+struct is_vectorized : public std::false_type {};
+
+template <typename T>
+struct is_vectorized<at::vec::Vectorized<T>> : public std::true_type {};
+
+// TODO: can_use_vectorized and can_use_vectorized_impl are a failed
+// attempt to use SFINAE to detect whether our generic lambda argument
+// with deduced return type would compile if it was passed
+// Vectorized<CTYPE_COMMON> instead of CTYPE_COMMON. SFINAE does not
+// work that way (see
+// e.g. https://stackoverflow.com/questions/53344484/hard-error-when-using-stdinvoke-result-t-with-a-generic-lambda,
+// https://stackoverflow.com/questions/31368601/how-to-detect-if-a-generic-lambda-is-uncompilable-in-c-14);
+// if we really want to do it then we need to at least require that
+// our lambdas actively participate in being SFINAE-friendly, as in
+// https://stackoverflow.com/questions/76525790/detecting-if-a-generic-lambda-with-certain-arguments-is-invocable.
+template <typename CTYPE_COMMON, typename Op, typename Enable = void, typename... Args>
+struct can_use_vectorized_impl : std::false_type {};
+template <typename CTYPE_COMMON, typename Op, typename... Args>
+struct can_use_vectorized_impl<
+    CTYPE_COMMON,
+    Op,
+    typename std::void_t<decltype(
+        std::declval<std::invoke_result_t<
+            Op,
+            ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>>()
+            .store(std::declval<CTYPE_COMMON*>()))>,
+    Args...> : public std::true_type {};
+// std::bool_constant<is_vectorized<std::invoke_result_t<Op,
+//     ignore_first_yield_second<Args, at::vec::Vectorized<CTYPE_COMMON>>...>>::value> {};
+
+// Can I call a function of type Op with sizeof...(Args) arguments of type
+// at::vec::Vectorized<CTYPE_COMMON>?
+// This is not possible in C++17 as the code is currently set up; see TODO above.
+template <typename CTYPE_COMMON, typename Op, typename... Args>
+struct can_use_vectorized
+    : public can_use_vectorized_impl<CTYPE_COMMON, Op, void, Args...> {};
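+// Hypothetical illustration (example names are not from this change): a
+// generic lambda with a deduced return type such as
+//   auto add_fn = [](const auto& a, const auto& b) { return a + b; };
+// is invocable with at::vec::Vectorized<float> arguments and yields a
+// Vectorized<float>, which provides store(float*), so the partial
+// specialization above should be selected and
+// can_use_vectorized<float, decltype(add_fn), Args...>::value should be
+// true. A lambda that only compiles for scalar arguments fails the
+// void_t detection and falls back to the primary (false_type) template.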
+
+#endif // ET_USE_PYTORCH_HEADERS
+
 template <
     typename CTYPE_COMMON,
     typename CTYPE_OUT,
@@ -68,14 +104,72 @@ inline void dtype_specialized_elementwise_fn_impl(
     KernelRuntimeContext& ctx,
     const Tensor& out,
     Args... inputs) {
+  static_assert(
+      (std::is_same_v<Args, std::pair<const Tensor*, SupportedTensorDtypes>> &&
+       ...));
   constexpr auto kNumInputs = sizeof...(inputs);
-  ET_DCHECK(((inputs.first->element_size() == sizeof(CTYPE_COMMON)) && ...));
+  // All inputs must be of type CTYPE_COMMON.
+  ET_DCHECK(
+      ((inputs.first->scalar_type() ==
+        CppTypeToScalarType<CTYPE_COMMON>::value) &&
+       ...));

   std::array<const CTYPE_COMMON*, kNumInputs> inputs_data_ptrs = {
       inputs.first->template const_data_ptr<CTYPE_COMMON>()...};

   CTYPE_OUT* const data_out = out.mutable_data_ptr<CTYPE_OUT>();

+#ifdef ET_USE_PYTORCH_HEADERS
+  if constexpr (can_use_vectorized<CTYPE_COMMON, Op, Args...>::value) {
+    const bool any_is_broadcasted =
+        !(torch::executor::internal::sizes_match_ignoring_leading_1s(
+              inputs.first->sizes(), out.sizes()) &&
+          ...);
+    if (!any_is_broadcasted) {
+      using Vec = at::vec::Vectorized<CTYPE_COMMON>;
+      ::executorch::extension::parallel_for(
+          0,
+          out.numel(),
+          ::executorch::extension::internal::GRAIN_SIZE,
+          [&](const auto begin, const auto end) {
+            const auto vectorized_begin =
+                begin + (Vec::size() - begin % Vec::size()) % Vec::size();
+            const auto vectorized_end = end - (end % Vec::size());
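+            // Worked example (hypothetical numbers): with begin = 3,
+            // end = 21 and Vec::size() == 8, vectorized_begin =
+            // 3 + (8 - 3 % 8) % 8 = 8 and vectorized_end = 21 - 21 % 8 = 16,
+            // so indices [3, 8) and [16, 21) take the scalar loops below and
+            // [8, 16) takes the vectorized loop.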
+            // Scalar prologue.
+            for (const auto idx : c10::irange(begin, vectorized_begin)) {
+              std::array<CTYPE_COMMON, kNumInputs> loaded_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
+              }
+              data_out[idx] = std::apply(compute_fun, loaded_inputs);
+            }
+
+            // Main vectorized loop.
+            for (auto idx = vectorized_begin; idx < vectorized_end;
+                 idx += Vec::size()) {
+              std::array<Vec, kNumInputs> loaded_vec_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_vec_inputs[input_idx] =
+                    Vec::loadu(&inputs_data_ptrs[input_idx][idx]);
+              }
+              auto result_vec = std::apply(compute_fun, loaded_vec_inputs);
+              result_vec.store(&data_out[idx]);
+            }
+
+            // Scalar epilogue.
+            for (const auto idx : c10::irange(vectorized_end, end)) {
+              std::array<CTYPE_COMMON, kNumInputs> loaded_inputs;
+              for (const auto input_idx : c10::irange(kNumInputs)) {
+                loaded_inputs[input_idx] = inputs_data_ptrs[input_idx][idx];
+              }
+              data_out[idx] = std::apply(compute_fun, loaded_inputs);
+            }
+          });
+      return;
+    }
+  }
+#endif
+
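+  // Scalar fallback below: reached when the vectorized fast path above is
+  // not taken (op not vectorizable, ATen headers unavailable, or an input
+  // is broadcasted against the output shape).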
   ::executorch::extension::parallel_for(
       0,
       out.numel(),