
Commit 4399aa8

Merge branch 'main' into node_visitors_dtype_validation
2 parents: 243bb6c + 9952aef


44 files changed (+2237 / −464 lines)

.github/workflows/cuda.yml

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ jobs:
   strategy:
     fail-fast: false
     matrix:
-      model: [linear, add, add_mul, resnet18, conv1d]
+      model: [linear, add, add_mul, resnet18, conv1d, sdpa]
 with:
   timeout: 90
   runner: linux.g5.4xlarge.nvidia.gpu

backends/aoti/aoti_partitioner.py

Lines changed: 17 additions & 3 deletions
@@ -52,10 +52,24 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         partition_tags: Dict[str, DelegationSpec] = {}
         tag = "tag0"

+        # Tag torch.cond and other control flow operations
+        def is_control_flow(node: torch.fx.Node) -> bool:
+            return node.op == "call_function" and node.target in [
+                torch.ops.higher_order.cond,
+                torch.ops.higher_order.map_impl,
+                torch.ops.higher_order.while_loop,
+            ]
+
         for node in exported_program.graph.nodes:
-            if node.op != "call_function":
-                continue
-            node.meta["delegation_tag"] = tag
+            if node.op == "call_function":
+                node.meta["delegation_tag"] = tag
+            # Tag get_attr nodes that are used by control flow operations
+            elif node.op == "get_attr":
+                # Check if any user is a control flow operation
+                for user in node.users:
+                    if is_control_flow(user):
+                        node.meta["delegation_tag"] = tag
+                        break

         partition_tags[tag] = self.delegation_spec
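
For context, a minimal sketch (not part of the commit, assuming a recent PyTorch) of the kind of graph this change targets: exporting a module that uses torch.cond produces get_attr nodes holding the branch submodules, and those nodes previously went untagged. The toy module below is hypothetical.

import torch

class CondModel(torch.nn.Module):
    def forward(self, x):
        def true_fn(x):
            return x + 1

        def false_fn(x):
            return x - 1

        # Lowers to torch.ops.higher_order.cond in the exported graph
        return torch.cond(x.sum() > 0, true_fn, false_fn, (x,))

ep = torch.export.export(CondModel(), (torch.randn(4),))
for node in ep.graph.nodes:
    # The branch submodules show up as get_attr nodes whose users are the
    # higher-order cond node; both now receive the delegation tag.
    print(node.op, node.target)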

backends/arm/operator_support/clone_dim_order_support.py

Lines changed: 14 additions & 0 deletions
@@ -2,6 +2,12 @@
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+"""Declare operator support for dim-order clone in TOSA.
+
+This module registers a support check for ``dim_order_ops._clone_dim_order``
+ensuring input/output dtypes match and the value types are FakeTensors.
+
+"""

 import logging

@@ -19,6 +25,8 @@

 @register_tosa_support_check
 class CloneSupported(SupportedTOSAOperatorCheck):
+    """Provide TOSA support check for ``_clone_dim_order``."""
+
     targets = [exir_ops.edge.dim_order_ops._clone_dim_order.default]

     tosa_specs = [
@@ -29,6 +37,12 @@ class CloneSupported(SupportedTOSAOperatorCheck):
     def is_node_tosa_supported(
         self, node: fx.Node, tosa_spec: TosaSpecification
     ) -> bool:
+        """Return True if the node is supported by TOSA.
+
+        Verify the operator target, the number and types of inputs/outputs, and
+        check that input and output dtypes match.
+
+        """
         if node.target not in self.targets:
             self.reporter.report_reject(node, f"Target {node.target} is not supported.")
             return False
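
As an illustration only, the dtype comparison described by the new docstring could look like the sketch below inside is_node_tosa_supported. The actual body is outside this hunk; reading FakeTensors from node.meta["val"] is the standard exir convention assumed here.

# Hypothetical sketch; not the code from this commit.
input_val = node.all_input_nodes[0].meta["val"]
output_val = node.meta["val"]
if input_val.dtype != output_val.dtype:
    self.reporter.report_reject(
        node, f"Input dtype {input_val.dtype} does not match output dtype {output_val.dtype}."
    )
    return False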

backends/arm/tosa/dialect/lib.py

Lines changed: 23 additions & 0 deletions
@@ -15,6 +15,17 @@


 def register_tosa_dialect_op(op_schema, func) -> Callable:
+    """Register a TOSA dialect operator with the backend op library.
+
+    Args:
+        op_schema (str): Operator schema without namespace or overload name.
+        func (Callable): Fake implementation used for registration.
+
+    Returns:
+        Callable: Backend dialect operator handle exposed via ``exir_ops`` and
+            marked ``not_callable`` for runtime use.
+
+    """
     if tosa_lib.ns not in _BACKEND_OP_LIB:
         _BACKEND_OP_LIB.append(tosa_lib.ns)

@@ -43,6 +54,7 @@ def register_tosa_dialect_op(op_schema, func) -> Callable:
     # the op doesn't need to be callable. This can be changed in the future if needed to support
     # execution of TOSA ops directly.
     def not_callable():
+        """Raise when the dialect op handle is invoked at runtime."""
         raise RuntimeError("TOSA dialect op is not callable")

     op.__equvalent_callable__ = not_callable
@@ -51,11 +63,22 @@ def not_callable():


 class TosaValueError(ValueError):
+    """Error type that annotates failures with the originating TOSA op."""
+
     def __init__(self, message="A TOSA value error occurred", *args, op=None):
+        """Initialise the error with optional operator metadata.
+
+        Args:
+            message (str): Human-readable error message.
+            *args: Additional arguments forwarded to ``ValueError``.
+            op: Optional operator identifier included in the string output.
+
+        """
         super().__init__(message, *args)
         self.op = op

     def __str__(self):
+        """Return the base message, appending the operator when provided."""
         base_message = super().__str__()
         if self.op is not None:
             return f"{base_message} (TOSA op: {self.op})"
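
A quick usage sketch of the annotated error; the call site is hypothetical, and the import path is inferred from the file location.

from executorch.backends.arm.tosa.dialect.lib import TosaValueError

try:
    raise TosaValueError("unsupported rescale configuration", op="RESCALE")
except TosaValueError as err:
    print(err)  # unsupported rescale configuration (TOSA op: RESCALE)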

backends/cadence/aot/compiler.py

Lines changed: 2 additions & 2 deletions
@@ -144,7 +144,7 @@ def convert_pt2(
 # It is however useful for unit tests to separate the converted model from the
 # fused model, to be able to get reference numerics.
 # If this does not apply, please use quantize_pt2 instead.
-def fuse_pt2(
+def apply_pre_edge_transform_passes(
     converted_program: ExportedProgram,
     quantizer: CadenceQuantizer,
 ) -> ExportedProgram:
@@ -229,7 +229,7 @@ def quantize_pt2(

     # Apply quant fusion to the exported program
     program = torch.export.export(converted_gm, inputs, strict=True)
-    fused_program = fuse_pt2(program, quantizer)
+    fused_program = apply_pre_edge_transform_passes(program, quantizer)

     if dump_graphs:
         logging.info("Graph after quantization and fusion:")
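
To make the rename concrete, a hedged sketch of the unit-test flow the comment above describes; converted_model, example_inputs, and quantizer are assumed to come from the prepare/convert steps, and only apply_pre_edge_transform_passes is from this commit.

# Keep the converted and fused programs separate to compare reference numerics.
ep = torch.export.export(converted_model, example_inputs, strict=True)
fused_ep = apply_pre_edge_transform_passes(ep, quantizer)
# When separate programs are not needed, quantize_pt2 covers the whole flow.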

backends/cadence/aot/export_example.py

Lines changed: 2 additions & 2 deletions
@@ -18,8 +18,8 @@

 from executorch.backends.cadence.aot.compiler import (
     _lower_ep_to_cadence_gen_etrecord,
+    apply_pre_edge_transform_passes,
     convert_pt2,
-    fuse_pt2,
     prepare_pt2,
 )

@@ -66,7 +66,7 @@ def export_model(
     ep = torch.export.export(converted_model, example_inputs, strict=True)

     # Fuse the quantized patterns on the exported program (note: quantizer needs to be the same as the one used in prepare_and_convert_pt2)
-    ep = fuse_pt2(ep, quantizer)
+    ep = apply_pre_edge_transform_passes(ep, quantizer)

     # Get edge program after Cadence specific passes
     exec_prog: ExecutorchProgramManager = _lower_ep_to_cadence_gen_etrecord(

backends/cortex_m/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -58,6 +58,7 @@ set(_cortex_m_kernels__srcs
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_add.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_linear.cpp
     ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_quantized_mul.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_transpose.cpp
 )

 # Generate C++ bindings to register kernels into Executorch
backends/cortex_m/ops/op_transpose.cpp

Lines changed: 124 additions & 0 deletions

@@ -0,0 +1,124 @@
+/*
+ * Copyright 2025 Arm Limited and/or its affiliates.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "cortex_m_ops_common.h"
+
+#include <array>
+#include <limits>
+#include <vector>
+
+// Include CMSIS-NN headers with C linkage
+extern "C" {
+#include "arm_nnfunctions.h"
+}
+
+namespace cortex_m {
+namespace native {
+
+using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+
+namespace {
+
+constexpr size_t kMaxSupportedDims = 4;
+
+} // namespace
+
+Tensor& transpose_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const IntArrayRef perm,
+    Tensor& out) {
+  if (input.scalar_type() != ScalarType::Char ||
+      out.scalar_type() != ScalarType::Char) {
+    ET_LOG(
+        Error,
+        "transpose_out: only int8 tensors are supported (input=%d, out=%d)",
+        static_cast<int>(input.scalar_type()),
+        static_cast<int>(out.scalar_type()));
+    context.fail(Error::InvalidArgument);
+    return out;
+  }
+
+  const size_t rank = input.dim();
+  if (rank == 0 || rank > kMaxSupportedDims) {
+    ET_LOG(
+        Error,
+        "transpose_out: expected tensor rank in [1, %zu], got %zu",
+        kMaxSupportedDims,
+        rank);
+    context.fail(Error::InvalidArgument);
+    return out;
+  }
+
+  if (perm.size() != static_cast<int64_t>(rank)) {
+    ET_LOG(
+        Error,
+        "transpose_out: permutation length %zd does not match tensor rank %zu",
+        perm.size(),
+        rank);
+    context.fail(Error::InvalidArgument);
+    return out;
+  }
+
+  // Pad unused dimensions with 1 so lower-rank tensors reuse the 4-D path.
+  std::array<int32_t, kMaxSupportedDims> input_dims_arr{1, 1, 1, 1};
+  std::array<int32_t, kMaxSupportedDims> output_dims_arr{1, 1, 1, 1};
+  for (size_t i = 0; i < rank; ++i) {
+    const auto in_size = input.size(i);
+    const auto out_size = out.size(i);
+    if (in_size > std::numeric_limits<int32_t>::max() ||
+        out_size > std::numeric_limits<int32_t>::max()) {
+      ET_LOG(
+          Error,
+          "transpose_out: dimension size exceeds int32_t range (input=%lld, output=%lld)",
+          static_cast<long long>(in_size),
+          static_cast<long long>(out_size));
+      context.fail(Error::InvalidArgument);
+      return out;
+    }
+    input_dims_arr[i] = static_cast<int32_t>(in_size);
+    output_dims_arr[i] = static_cast<int32_t>(out_size);
+  }
+
+  cmsis_nn_dims input_dims = {
+      input_dims_arr[0],
+      input_dims_arr[1],
+      input_dims_arr[2],
+      input_dims_arr[3]};
+  cmsis_nn_dims output_dims = {
+      output_dims_arr[0],
+      output_dims_arr[1],
+      output_dims_arr[2],
+      output_dims_arr[3]};
+
+  std::array<uint32_t, kMaxSupportedDims> perm_buffer{0, 1, 2, 3};
+  for (size_t i = 0; i < rank; ++i) {
+    perm_buffer[i] = static_cast<uint32_t>(perm[i]);
+  }
+
+  const cmsis_nn_transpose_params transpose_params{
+      static_cast<int32_t>(rank), perm_buffer.data()};
+
+  const int8_t* input_data = input.const_data_ptr<int8_t>();
+  int8_t* output_data = out.mutable_data_ptr<int8_t>();
+
+  const arm_cmsis_nn_status status = arm_transpose_s8(
+      input_data, output_data, &input_dims, &output_dims, &transpose_params);
+
+  if (status != ARM_CMSIS_NN_SUCCESS) {
+    ET_LOG(
+        Error,
+        "transpose_out: arm_transpose_s8 failed with status [%d]",
+        static_cast<int>(status));
+    context.fail(Error::Internal);
+    return out;
+  }
+
+  return out;
+}
+
+} // namespace native
+} // namespace cortex_m

backends/cortex_m/ops/operators.py

Lines changed: 18 additions & 0 deletions
@@ -349,3 +349,21 @@ def quantized_linear_impl(
     output += output_offset
     output = torch.clamp(output, activation_min, activation_max).to(torch.int8)
     return output
+
+
+# ===================================================================
+# TRANSPOSE OPERATION DEFINITION
+# ===================================================================
+lib.define("transpose(Tensor input, int[] perm) -> Tensor")
+lib.define("transpose.out(Tensor input, int[] perm, *, Tensor(a!) out) -> Tensor(a!)")
+
+
+@register_fake("cortex_m::transpose")
+def transpose_meta(input: torch.Tensor, perm) -> torch.Tensor:
+    output_shape = [input.shape[idx] for idx in perm]
+    return torch.empty(output_shape, dtype=input.dtype, device=input.device)
+
+
+@impl(lib, "transpose", "CompositeExplicitAutograd")
+def transpose_impl(input: torch.Tensor, perm) -> torch.Tensor:
+    return input.permute(tuple(perm)).contiguous()
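
A small sanity check of the new op (assumes this operators module has been imported so the cortex_m library definitions are registered):

import torch

x = torch.arange(6, dtype=torch.int8).reshape(2, 3)
y = torch.ops.cortex_m.transpose(x, [1, 0])
assert y.shape == (3, 2)
assert torch.equal(y, x.permute(1, 0).contiguous())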

backends/cortex_m/ops/operators.yaml

Lines changed: 6 additions & 0 deletions
@@ -34,3 +34,9 @@
   kernels:
     - arg_meta: null
       kernel_name: cortex_m::quantized_linear_out
+
+- func: cortex_m::transpose.out(Tensor input, int[] perm, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+    - arg_meta: null
+      kernel_name: cortex_m::transpose_out
