Merge branch 'main' into export-D87400561

Gasoonjia · web-flow · commit b991c34bc15f · 2025-11-20T14:56:35.000-08:00
diff --git a/backends/arm/_passes/decompose_int16_activation_conv2d_pass.py b/backends/arm/_passes/decompose_int16_activation_conv2d_pass.py
@@ -10,7 +10,7 @@
 from executorch.backends.arm._passes.arm_pass import ArmPass
 from executorch.backends.arm._passes.quant_args import QuantArgs
 
-from executorch.backends.arm.tosa.specification import get_context_spec, Tosa_1_00
+from executorch.backends.arm.tosa.specification import get_context_spec
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
 
@@ -40,9 +40,7 @@ def call_operator(self, op, args, kwargs, meta):
         if args[0].data.dtype == torch.int8:
             return super().call_operator(op, args, kwargs, meta)
         elif args[0].data.dtype == torch.int16:
-            if isinstance(tosa_spec, Tosa_1_00) and not tosa_spec.support_extension(
-                "int16"
-            ):
+            if not tosa_spec.support_extension("int16"):
                 raise ValueError(
                     "int16 activation for convolution requires TOSA int16 extension"
                 )
diff --git a/backends/arm/common/arm_compile_spec.py b/backends/arm/common/arm_compile_spec.py
@@ -35,19 +35,22 @@ class DebugMode(Enum):
     _OUTPUT_FORMAT_KEY = "output_format"
     _DEBUG_ARTIFACT_KEY = "debug_artifact_path"
     _DEBUG_MODE_KEY = "dump_debug_info"
+    _OUTPUT_REORDER_KEY = "ouput_reorder_workaround"
 
     def _set_compile_specs(
         self,
         tosa_spec: TosaSpecification,
         compiler_flags: list[str],
         path_for_intermediates: str | None = None,
         tosa_debug_mode: DebugMode | None = None,
+        output_order_workaround: bool = True,
     ):
         """Set all values of dataclass directly."""
         self.tosa_spec = tosa_spec
         self.compiler_flags = compiler_flags
         self.path_for_intermediates = path_for_intermediates
         self.tosa_debug_mode = tosa_debug_mode
+        self.output_order_workaround = output_order_workaround
 
     @classmethod
     def from_list(cls, compile_specs: list[CompileSpec]):  # noqa: C901
@@ -56,10 +59,15 @@ def from_list(cls, compile_specs: list[CompileSpec]):  # noqa: C901
         compiler_flags: list[str] | None = None
         path_for_intermediates: str | None = None
         tosa_debug_mode: ArmCompileSpec.DebugMode | None = None
+        output_order_workaround: bool = True
         unknown_specs: dict[str, str] = {}
         for spec in compile_specs:
             key = spec.key
-            val = spec.value.decode()
+            val = (
+                spec.value.decode()
+                if isinstance(spec.value, (bytes, bytearray))
+                else spec.value
+            )
             if key == ArmCompileSpec._TOSA_SPEC_KEY:
                 if tosa_spec is not None:
                     raise ValueError("More than one tosa_spec entry in compile spec.")
@@ -88,6 +96,8 @@ def from_list(cls, compile_specs: list[CompileSpec]):  # noqa: C901
                         "More than one tosa_debug_mode entry in compile spec."
                     )
                 tosa_debug_mode = ArmCompileSpec.DebugMode[val]
+            elif key == ArmCompileSpec._OUTPUT_REORDER_KEY:
+                output_order_workaround = val  # type: ignore[assignment]
             else:
                 unknown_specs[key] = val
 
@@ -109,6 +119,7 @@ def from_list(cls, compile_specs: list[CompileSpec]):  # noqa: C901
             compiler_flags=compiler_flags,
             path_for_intermediates=path_for_intermediates,
             tosa_debug_mode=tosa_debug_mode,
+            output_order_workaround=output_order_workaround,
         )
         cls.from_list_hook(compile_spec, unknown_specs)
         compile_spec.validate()
@@ -170,6 +181,14 @@ def to_list(self):
                 )
             )
 
+        if not self.output_order_workaround:
+            compile_spec.append(
+                CompileSpec(
+                    ArmCompileSpec._OUTPUT_REORDER_KEY,
+                    self.output_order_workaround,
+                )
+            )
+
         return compile_spec
 
     def get_intermediate_path(self) -> str | None:
@@ -201,6 +220,13 @@ def dump_debug_info(self, debug_mode: DebugMode | None):
         self.tosa_debug_mode = debug_mode
         return self
 
+    def set_output_order_workaround(self, output_order_workaround: bool):
+        self.output_order_workaround = output_order_workaround
+        return self
+
+    def get_output_order_workaround(self) -> bool:
+        return self.output_order_workaround
+
     @classmethod
     @abstractmethod
     def get_output_format(cls) -> str:
diff --git a/backends/arm/ethosu/backend.py b/backends/arm/ethosu/backend.py
@@ -9,6 +9,7 @@
 # backends. Converts via TOSA as an intermediate form supported by AoT and
 # JIT compiler flows.
 #
+"""Ahead-of-time Arm Ethos-U backend built on the shared TOSA pipeline."""
 
 import logging
 from typing import final, List
@@ -27,19 +28,28 @@
 
 @final
 class EthosUBackend(BackendDetails):
-    """
-    BackendDetails subclass for delegation to Ethos-U. Deduce the TOSA lowering from
-    the compile spec list by filtering out the compile spec values that are of interest
-    for the TOSABackend.
+    """BackendDetails subclass for delegation to Ethos-U.
+
+    Deduce the TOSA lowering from the compile spec list by filtering out the
+    compile spec values that are of interest for the TOSABackend.
+
     """
 
     @staticmethod
     def _compile_tosa_flatbuffer(
         tosa_flatbuffer: bytes, compile_spec: EthosUCompileSpec
     ) -> bytes:
-        """
-        Static helper method to do the compilation of the TOSA flatbuffer
-        representation to a target specific binary stream.
+        """Compile a TOSA flatbuffer into a target-specific binary stream.
+
+        Args:
+            tosa_flatbuffer (bytes): Serialized TOSA graph produced by
+                ``TOSABackend``.
+            compile_spec (EthosUCompileSpec): Compile specification providing
+                Vela flags and intermediate paths.
+
+        Returns:
+            bytes: Target-specific binary stream produced by Vela.
+
         """
         compile_flags = compile_spec.compiler_flags
 
@@ -73,6 +83,17 @@ def preprocess(
         edge_program: ExportedProgram,
         compile_specs: List[CompileSpec],
     ) -> PreprocessResult:
+        """Lower the exported program and compile it for an Ethos-U target.
+
+        Args:
+            edge_program (ExportedProgram): Program to lower to Ethos-U.
+            compile_specs (List[CompileSpec]): Serialized Ethos-U compile specs
+                supplied by the frontend.
+
+        Returns:
+            PreprocessResult: Result containing the compiled Ethos-U binary.
+
+        """
         logger.info(f"{EthosUBackend.__name__} preprocess")
 
         compile_spec = EthosUCompileSpec.from_list(compile_specs)
diff --git a/backends/arm/test/misc/test_outputs_order.py b/backends/arm/test/misc/test_outputs_order.py
@@ -78,14 +78,18 @@ def _read_tosa_outputs(tosa_path: Path):
     return shapes
 
 
+# TODO: MLETORCH-1266 Investigate output order issue
 @pytest.mark.parametrize("batch_size", [1, 4])
-def test_network_output_order_and_restore(batch_size):
+@pytest.mark.parametrize("output_order_workaround", [True, False])
+def test_network_output_order_and_restore(batch_size, output_order_workaround):
     model = Network(batch_norm=True).eval()
     # Prepare spec
     spec = TosaSpecification.create_from_string("TOSA-1.0+INT")
-    compile_spec = TosaCompileSpec(tosa_spec=spec)
+    tosa_compile_spec = TosaCompileSpec(spec).set_output_order_workaround(
+        output_order_workaround
+    )
     # Setup quantizer
-    quantizer = TOSAQuantizer(compile_spec)
+    quantizer = TOSAQuantizer(tosa_compile_spec)
     quantizer.set_global(
         get_symmetric_quantization_config(is_qat=True, is_per_channel=False)
     )
@@ -100,7 +104,7 @@ def test_network_output_order_and_restore(batch_size):
     with tempfile.TemporaryDirectory(dir="") as tmpdir:
         art_dir = Path(tmpdir)
         part = TOSAPartitioner(
-            TosaCompileSpec(spec).dump_intermediate_artifacts_to(str(art_dir))
+            tosa_compile_spec.dump_intermediate_artifacts_to(str(art_dir))
         )
         _ = to_edge_transform_and_lower(aten_gm, partitioner=[part])
         # Expect exactly one .tosa file in the artefact dir
diff --git a/backends/arm/tosa/backend.py b/backends/arm/tosa/backend.py
@@ -283,6 +283,7 @@ def _preprocess_module(  # noqa: C901
             output_node.update_arg(0, [output_node.args[0]])
         node_to_id_map = _annotate_external_ids(graph_module.graph)
         artifact_path = compile_spec.get_intermediate_path()
+        output_order_workaround = compile_spec.get_output_order_workaround()
 
         # TODO: Fix the need to lazily import this.
         from executorch.backends.arm._passes import ArmPassManager
@@ -295,7 +296,12 @@ def _preprocess_module(  # noqa: C901
         from executorch.backends.arm.operators.node_visitor import get_node_visitors
 
         node_visitors = get_node_visitors(edge_program, tosa_spec, debug_hook)
-        graph_module = _sort_outputs(graph_module, node_to_id_map)
+
+        if output_order_workaround:
+            logger.debug("Re-sorting outputs during TOSA lowering.")
+            graph_module = _sort_outputs(graph_module, node_to_id_map)
+        else:
+            logger.debug("No re-sorting outputs (workaround) during TOSA lowering.")
 
         if submodule_name is not None:
             tosa_graph.startRegion(submodule_name)
@@ -375,4 +381,5 @@ def filter_tosa_compile_specs(
             TosaCompileSpec(compile_spec.tosa_spec)
             .dump_intermediate_artifacts_to(compile_spec.get_intermediate_path())
             .dump_debug_info(compile_spec.tosa_debug_mode)
+            .set_output_order_workaround(compile_spec.output_order_workaround)
         )
diff --git a/backends/arm/tosa/specification.py b/backends/arm/tosa/specification.py
@@ -105,6 +105,18 @@ def support_float(self) -> bool:
         """Return True if floating-point operations are supported."""
         raise NotImplementedError
 
+    def support_extension(self, extension: str) -> bool:
+        """Return True if an extension is supported and enabled.
+
+        Args:
+            extension (str): Extension name (for example ``int4``, ``bf16``).
+
+        Returns:
+            bool: True if the extension is valid for the active profiles and selected.
+
+        """
+        raise NotImplementedError
+
     def __init__(self, version: Version, extras: List[str]):
         """Initialize the base specification.
 
diff --git a/backends/arm/vgf/backend.py b/backends/arm/vgf/backend.py
@@ -10,6 +10,7 @@
 # this form is used where the final JIT compile is performed on target (in the
 # runtime delegate executorch::runtime::BackendInterface::init
 #
+"""Ahead-of-time Arm VGF backend built on the shared TOSA pipeline."""
 
 import logging
 import os
@@ -43,9 +44,11 @@
 
 @final
 class VgfBackend(BackendDetails):
-    """
-    BackendDetails subclass for delegation to VGF compatible devices. This enables
-    encapsulated TOSA on target device and JIT compilation on suitable platforms.
+    """BackendDetails subclass for delegation to VGF compatible devices.
+
+    This enables encapsulated TOSA on target device and JIT compilation on
+    suitable platforms.
+
     """
 
     @staticmethod
@@ -54,9 +57,18 @@ def _compile_tosa_flatbuffer(
         compile_spec: VgfCompileSpec,
         tag_name: str = "",
     ) -> bytes:
-        """
-        Static helper method to do the compilation of the TOSA flatbuffer
-        representation to a target specific binary stream.
+        """Compile a TOSA flatbuffer into a target-specific binary stream.
+
+        Args:
+            tosa_flatbuffer (bytes): Serialized TOSA graph produced by
+                ``TOSABackend``.
+            compile_spec (VgfCompileSpec): Compile specification providing
+                converter flags and artifact paths.
+            tag_name (str): Optional suffix used when producing debug outputs.
+
+        Returns:
+            bytes: Target-specific VGF binary stream.
+
         """
         compile_flags = compile_spec.compiler_flags
         artifact_path = compile_spec.get_intermediate_path()
@@ -69,6 +81,17 @@ def preprocess(
         edge_program: ExportedProgram,
         compile_specs: List[CompileSpec],
     ) -> PreprocessResult:
+        """Lower the exported program and compile it for a VGF target.
+
+        Args:
+            edge_program (ExportedProgram): Program to lower to VGF.
+            compile_specs (List[CompileSpec]): Serialized VGF compile specs
+                supplied by the frontend.
+
+        Returns:
+            PreprocessResult: Result containing the compiled VGF binary.
+
+        """
         logger.info(f"{VgfBackend.__name__} preprocess")
 
         compile_spec = VgfCompileSpec.from_list(compile_specs)
@@ -98,6 +121,20 @@ def vgf_compile(
     artifact_path: str | None = None,
     tag_name: str = "",
 ):
+    """Invoke the VGF compiler to convert a TOSA flatbuffer.
+
+    Args:
+        tosa_flatbuffer (bytes): Serialized TOSA graph produced by
+            ``TOSABackend``.
+        compile_flags (List[str]): Command-line flags forwarded to
+            ``model-converter``.
+        artifact_path (str | None): Directory where debug artifacts are saved.
+        tag_name (str): Optional suffix used when producing debug outputs.
+
+    Returns:
+        bytes: Compiled VGF binary emitted by ``model-converter``.
+
+    """
     with tempfile.TemporaryDirectory() as tmpdir:
 
         # We currently write out a flatbuffer as input to the converter
diff --git a/backends/nxp/backend/neutron_converter_manager.py b/backends/nxp/backend/neutron_converter_manager.py
@@ -78,23 +78,35 @@ def convert(self, tflite_model: bytes, target: str) -> bytes:
         cctx.compilationOpts.minNumOpsPerGraph = 1
         cctx.compilationOpts.excludeGraphPasses = "MergeTranspose"
 
-        logger = multiprocessing.log_to_stderr()
-        logger.setLevel(logging.WARNING)
-        queue = multiprocessing.Manager().Queue()
+        # Try to run the converter in a subprocess for isolation, but fall back to direct
+        # execution if the environment doesn't support it (e.g., Sandcastle/build environments).
+        try:
+            logger = multiprocessing.log_to_stderr()
+            logger.setLevel(logging.WARNING)
+            queue = multiprocessing.Manager().Queue()
+
+            process = multiprocessing.Process(
+                target=convert_unsafe,
+                args=(self.neutron_converter, tflite_model, cctx, queue),
+            )
+            process.start()
+            process.join()  # waits until the subprocess is complete
 
-        process = multiprocessing.Process(
-            target=convert_unsafe,
-            args=(self.neutron_converter, tflite_model, cctx, queue),
-        )
-        process.start()
-        process.join()  # waits until the subprocess is complete
+            if queue.empty():  # signals the unsafe task did not run till the end
+                raise RuntimeError(
+                    f"Neutron converter module terminated unexpectedly with exit code {process.exitcode}"
+                )
 
-        if queue.empty():  # signals the unsafe task did not run till the end
-            raise RuntimeError(
-                f"Neutron converter module terminated unexpectedly with exit code {process.exitcode}"
+            model_converted = queue.get()
+            process.close()
+        except (EOFError, OSError) as e:
+            # Multiprocessing is unavailable (likely due to environment restrictions),
+            # so fall back to direct, in-process execution without subprocess isolation.
+            logging.warning(
+                f"Multiprocessing not available ({e}), running neutron converter directly"
+            )
+            model_converted = self.neutron_converter.convertModel(
+                list(tflite_model), cctx
             )
 
-        model_converted = queue.get()
-
-        process.close()
         return bytes(model_converted)