2 changes: 2 additions & 0 deletions backends/cortex_m/CMakeLists.txt
@@ -58,6 +58,8 @@ set(_cortex_m_kernels__srcs
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_mul.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_minimum.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_maximum.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_transpose.cpp
)

25 changes: 18 additions & 7 deletions backends/cortex_m/ops/cortex_m_ops_common.h
@@ -18,11 +18,19 @@
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

#include <limits>
#include <optional>

extern "C" {
#include "arm_nn_types.h"
}

using Tensor = torch::executor::Tensor;
using ScalarType = executorch::aten::ScalarType;
using Scalar = torch::executor::Scalar;
using Error = executorch::runtime::Error;
using IntArrayRef = executorch::aten::ArrayRef<int64_t>;
using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

// From arm_nn_math_types.h
#define ARM_NN_Q31_MAX ((int32_t)(0x7FFFFFFFL))
@@ -34,7 +42,8 @@ inline void validate_cmsis_nn_tensor_requirements(
const Tensor& input2,
Tensor& output,
ScalarType expected_dtype = ScalarType::Char,
bool require_channels_last = false) {
bool require_channels_last = false,
bool require_same_sizes = true) {
// Basic dtype validation
ET_CHECK_MSG(
input1.scalar_type() == expected_dtype,
@@ -51,12 +60,14 @@
"Output dtype must be %hhd, got %hhd",
expected_dtype,
output.scalar_type());
ET_CHECK_MSG(
input1.sizes() == input2.sizes(),
"Input1 and Input2 must have the same sizes");
ET_CHECK_MSG(
output.sizes() == input1.sizes(),
"Output must have the same sizes as inputs");
if (require_same_sizes) {
ET_CHECK_MSG(
input1.sizes() == input2.sizes(),
"Input1 and Input2 must have the same sizes");
ET_CHECK_MSG(
output.sizes() == input1.sizes(),
"Output must have the same sizes as inputs");
}

// Dim order consistency
ET_CHECK_MSG(
102 changes: 102 additions & 0 deletions backends/cortex_m/ops/op_maximum.cpp
@@ -0,0 +1,102 @@
/*
* Copyright 2025 Arm Limited and/or its affiliates.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "cortex_m_ops_common.h"

// Include CMSIS-NN headers with C linkage
extern "C" {
#include "arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

Tensor& maximum_out(
KernelRuntimeContext& context,
const Tensor& input1,
const Tensor& input2,
Tensor& out) {
validate_cmsis_nn_tensor_requirements(
input1,
input2,
out,
ScalarType::Char,
/*require_channels_last=*/false,
/*require_same_sizes=*/false);

auto resize_error = resize_to_broadcast_target_size(input1, input2, out);
if (resize_error != Error::Ok) {
ET_LOG(Error, "maximum_out: broadcast shape mismatch between inputs");
context.fail(resize_error);
return out;
}

const int8_t* input1_data = input1.const_data_ptr<int8_t>();
const int8_t* input2_data = input2.const_data_ptr<int8_t>();
int8_t* output_data = out.mutable_data_ptr<int8_t>();

// Create CMSIS-NN dims directly from tensor sizes
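// CMSIS-NN uses a fixed 4-D (N, H, W, C) dims struct; tensors with
// rank < 4 are padded with leading 1s, which preserves broadcast semantics.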
const auto input1_rank = input1.dim();
const auto input1_sizes = input1.sizes();
const cmsis_nn_dims input1_dims{
static_cast<int32_t>(
input1_rank >= 4 ? input1_sizes[input1_rank - 4] : 1),
static_cast<int32_t>(
input1_rank >= 3 ? input1_sizes[input1_rank - 3] : 1),
static_cast<int32_t>(
input1_rank >= 2 ? input1_sizes[input1_rank - 2] : 1),
static_cast<int32_t>(
input1_rank >= 1 ? input1_sizes[input1_rank - 1] : 1)};

const auto input2_rank = input2.dim();
const auto input2_sizes = input2.sizes();
const cmsis_nn_dims input2_dims{
static_cast<int32_t>(
input2_rank >= 4 ? input2_sizes[input2_rank - 4] : 1),
static_cast<int32_t>(
input2_rank >= 3 ? input2_sizes[input2_rank - 3] : 1),
static_cast<int32_t>(
input2_rank >= 2 ? input2_sizes[input2_rank - 2] : 1),
static_cast<int32_t>(
input2_rank >= 1 ? input2_sizes[input2_rank - 1] : 1)};

const auto output_rank = out.dim();
const auto output_sizes = out.sizes();
const cmsis_nn_dims output_dims{
static_cast<int32_t>(
output_rank >= 4 ? output_sizes[output_rank - 4] : 1),
static_cast<int32_t>(
output_rank >= 3 ? output_sizes[output_rank - 3] : 1),
static_cast<int32_t>(
output_rank >= 2 ? output_sizes[output_rank - 2] : 1),
static_cast<int32_t>(
output_rank >= 1 ? output_sizes[output_rank - 1] : 1)};

const arm_cmsis_nn_status status = arm_maximum_s8(
/* ctx */ nullptr,
input1_data,
&input1_dims,
input2_data,
&input2_dims,
output_data,
&output_dims);

if (status != ARM_CMSIS_NN_SUCCESS) {
ET_LOG(
Error,
"maximum_out: arm_maximum_s8 failed with status [%d]",
static_cast<int>(status));
context.fail(Error::Internal);
}

return out;
}

} // namespace native
} // namespace cortex_m
104 changes: 104 additions & 0 deletions backends/cortex_m/ops/op_minimum.cpp
@@ -0,0 +1,104 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
* Copyright 2025 Arm Limited and/or its affiliates.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "cortex_m_ops_common.h"

// Include CMSIS-NN headers with C linkage
extern "C" {
#include "arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

Tensor& minimum_out(
KernelRuntimeContext& context,
const Tensor& input1,
const Tensor& input2,
Tensor& out) {
validate_cmsis_nn_tensor_requirements(
input1,
input2,
out,
ScalarType::Char,
/*require_channels_last=*/false,
/*require_same_sizes=*/false);

auto resize_error = resize_to_broadcast_target_size(input1, input2, out);
if (resize_error != Error::Ok) {
ET_LOG(Error, "minimum_out: broadcast shape mismatch between inputs");
context.fail(resize_error);
return out;
}

const int8_t* input1_data = input1.const_data_ptr<int8_t>();
const int8_t* input2_data = input2.const_data_ptr<int8_t>();
int8_t* output_data = out.mutable_data_ptr<int8_t>();

// Create CMSIS-NN dims directly from tensor sizes
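// As in op_maximum.cpp: pad ranks < 4 with leading 1s to fill the
// fixed 4-D CMSIS-NN dims struct.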
const auto input1_rank = input1.dim();
const auto input1_sizes = input1.sizes();
const cmsis_nn_dims input1_dims{
static_cast<int32_t>(
input1_rank >= 4 ? input1_sizes[input1_rank - 4] : 1),
static_cast<int32_t>(
input1_rank >= 3 ? input1_sizes[input1_rank - 3] : 1),
static_cast<int32_t>(
input1_rank >= 2 ? input1_sizes[input1_rank - 2] : 1),
static_cast<int32_t>(
input1_rank >= 1 ? input1_sizes[input1_rank - 1] : 1)};

const auto input2_rank = input2.dim();
const auto input2_sizes = input2.sizes();
const cmsis_nn_dims input2_dims{
static_cast<int32_t>(
input2_rank >= 4 ? input2_sizes[input2_rank - 4] : 1),
static_cast<int32_t>(
input2_rank >= 3 ? input2_sizes[input2_rank - 3] : 1),
static_cast<int32_t>(
input2_rank >= 2 ? input2_sizes[input2_rank - 2] : 1),
static_cast<int32_t>(
input2_rank >= 1 ? input2_sizes[input2_rank - 1] : 1)};

const auto output_rank = out.dim();
const auto output_sizes = out.sizes();
const cmsis_nn_dims output_dims{
static_cast<int32_t>(
output_rank >= 4 ? output_sizes[output_rank - 4] : 1),
static_cast<int32_t>(
output_rank >= 3 ? output_sizes[output_rank - 3] : 1),
static_cast<int32_t>(
output_rank >= 2 ? output_sizes[output_rank - 2] : 1),
static_cast<int32_t>(
output_rank >= 1 ? output_sizes[output_rank - 1] : 1)};

const arm_cmsis_nn_status status = arm_minimum_s8(
/* ctx */ nullptr,
input1_data,
&input1_dims,
input2_data,
&input2_dims,
output_data,
&output_dims);

if (status != ARM_CMSIS_NN_SUCCESS) {
ET_LOG(
Error,
"minimum_out: arm_minimum_s8 failed with status [%d]",
static_cast<int>(status));
context.fail(Error::Internal);
}

return out;
}

} // namespace native
} // namespace cortex_m
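The rank-padding logic above is duplicated verbatim across op_maximum.cpp and op_minimum.cpp, three times per kernel (once per tensor). A minimal sketch of a shared helper both kernels could call; the name make_cmsis_dims and its placement in cortex_m_ops_common.h are assumptions for illustration, not part of this PR:

// Hypothetical shared helper (sketch only): maps a tensor's sizes onto
// CMSIS-NN's fixed 4-D dims struct, padding missing leading dims with 1.
inline cmsis_nn_dims make_cmsis_dims(const Tensor& tensor) {
  const auto rank = tensor.dim();
  const auto sizes = tensor.sizes();
  const auto dim_or_1 = [&](int offset) -> int32_t {
    return rank >= offset ? static_cast<int32_t>(sizes[rank - offset]) : 1;
  };
  return cmsis_nn_dims{dim_or_1(4), dim_or_1(3), dim_or_1(2), dim_or_1(1)};
}

Each kernel's three dims blocks would then collapse to one line apiece, e.g. const cmsis_nn_dims input1_dims = make_cmsis_dims(input1);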
41 changes: 41 additions & 0 deletions backends/cortex_m/ops/operators.py
@@ -238,6 +238,47 @@ def quantized_mul_impl(
return result


# ===================================================================
# MINIMUM/MAXIMUM OPERATION DEFINITIONS
# ===================================================================
lib.define("minimum(Tensor self, Tensor other) -> Tensor")
lib.define("minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)")


@register_fake("cortex_m::minimum")
def minimum_meta(self: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
assert self.dtype == other.dtype, (
"Cortex-M minimum: dtype mismatch — "
f"got self.dtype={self.dtype}, other.dtype={other.dtype}"
)
broadcasted_shape = torch.broadcast_shapes(self.shape, other.shape)
return torch.empty(broadcasted_shape, dtype=self.dtype, device=self.device)


@impl(lib, "minimum", "CompositeExplicitAutograd")
def minimum_impl(self: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
return torch.minimum(self, other)


lib.define("maximum(Tensor self, Tensor other) -> Tensor")
lib.define("maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)")


@register_fake("cortex_m::maximum")
def maximum_meta(self: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
assert self.dtype == other.dtype, (
"Cortex-M maximum: dtype mismatch — "
f"got self.dtype={self.dtype}, other.dtype={other.dtype}"
)
broadcasted_shape = torch.broadcast_shapes(self.shape, other.shape)
return torch.empty(broadcasted_shape, dtype=self.dtype, device=self.device)


@impl(lib, "maximum", "CompositeExplicitAutograd")
def maximum_impl(self: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
return torch.maximum(self, other)


# ===================================================================
# QUANTIZED LINEAR OPERATION DEFINITION
# ===================================================================
12 changes: 12 additions & 0 deletions backends/cortex_m/ops/operators.yaml
@@ -29,6 +29,18 @@
- arg_meta: null
kernel_name: cortex_m::quantized_mul_out

- func: cortex_m::minimum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
kernel_name: cortex_m::minimum_out

- func: cortex_m::maximum.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
kernel_name: cortex_m::maximum_out

- func: cortex_m::quantized_linear.out(Tensor input, Tensor weights, Tensor? bias, Tensor? kernel_sum, Scalar input_offset, Scalar filter_offset, Scalar output_offset, int[] requantize_multipliers, int[] requantize_shifts, Scalar activation_max, Scalar activation_min, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
16 changes: 16 additions & 0 deletions backends/cortex_m/passes/quantized_op_fusion_pass.py
@@ -101,6 +101,18 @@ def _get_mul_replacement(self, args, meta):

return exir_ops.edge.cortex_m.quantized_mul.default, args

def _get_minimum_replacement(self, args, meta):
if args[0].data.dtype != torch.int8:
return exir_ops.edge.aten.minimum.default, args

return exir_ops.edge.cortex_m.minimum.default, args

def _get_maximum_replacement(self, args, meta):
if args[0].data.dtype != torch.int8:
return exir_ops.edge.aten.maximum.default, args

return exir_ops.edge.cortex_m.maximum.default, args

def _get_permute_replacement(self, args, meta):
if args[0].data.dtype != torch.int8:
return exir_ops.edge.aten.permute_copy.default, args
@@ -123,6 +135,10 @@ def call_operator(
op, args = self._get_add_replacement(args, meta)
case exir_ops.edge.aten.mul.Tensor:
op, args = self._get_mul_replacement(args, meta)
case exir_ops.edge.aten.minimum.default:
op, args = self._get_minimum_replacement(args, meta)
case exir_ops.edge.aten.maximum.default:
op, args = self._get_maximum_replacement(args, meta)
case exir_ops.edge.aten.permute_copy.default:
op, args = self._get_permute_replacement(args, meta)
case _: