
Commit 5b5ecce

Merge remote-tracking branch 'origin/main' into toupstream/model_update

2 parents 90d9b10 + 7d9dd46
File tree: 78 files changed (+3693, -766 lines)


.ci/scripts/build-qnn-sdk.sh (file mode changed: 100644 → 100755)

Lines changed: 4 additions & 9 deletions

@@ -11,17 +11,12 @@ set -o xtrace
 
 build_qnn_backend() {
   echo "Start building qnn backend."
-  export ANDROID_NDK_ROOT=/opt/ndk
-  export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
+  export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/opt/ndk}
+  export QNN_SDK_ROOT=${QNN_SDK_ROOT:-/tmp/qnn/2.28.0.241029}
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
 
-  # Workaround to avoid issues around missing flatccrt library (depending on the
-  # number of jobs used), see issue #7300:
-  # Build twice (second time with `--no_clean`) to make sure libflatccrt.a is
-  # available.
-  # TODO: Remove this workaround once the underlying issue is fixed.
-  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release || \
-    bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release --no_clean
+  parallelism=$(( $(nproc) - 1 ))
+  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number ${parallelism} --release
 }
 
 set_up_aot() {
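The rewritten exports fall back to the previous hard-coded paths only when the caller has not already set ANDROID_NDK_ROOT or QNN_SDK_ROOT, and the build parallelism now tracks the host's core count instead of being pinned to 2. A minimal Python sketch of the same two idioms, using nothing beyond the standard library:

    import os

    # bash's ${VAR:-default} substitutes the default when the variable is
    # unset *or* empty; `or` reproduces both cases here.
    qnn_sdk_root = os.environ.get("QNN_SDK_ROOT") or "/tmp/qnn/2.28.0.241029"

    # Like $(( $(nproc) - 1 )): leave one core free, clamped to at least 1.
    parallelism = max((os.cpu_count() or 2) - 1, 1)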

.github/workflows/build-presets.yml

Lines changed: 39 additions & 1 deletion

@@ -6,6 +6,8 @@ on:
     branches:
       - main
       - release/*
+    paths:
+      - .github/workflows/build-presets.yml
   workflow_dispatch:
 
 concurrency:
@@ -16,15 +18,51 @@ jobs:
   apple:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     strategy:
+      fail-fast: false
       matrix:
-        preset: [macos-arm64]
+        preset: [macos-arm64, pybind]
     with:
       job-name: build
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       runner: macos-latest-xlarge
       python-version: 3.12
       submodules: recursive
+      timeout: 90
       script: |
         set -eux
         ${CONDA_RUN} ./install_requirements.sh > /dev/null
         ${CONDA_RUN} cmake --preset ${{ matrix.preset }}
         ${CONDA_RUN} cmake --build cmake-out --parallel
+
+  linux:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      fail-fast: false
+      matrix:
+        preset: [pybind]
+        runner: [linux.2xlarge, linux.arm64.2xlarge]
+        docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64]
+        # Excluding specific runner + docker image combinations that don't make sense:
+        #   - Excluding the ARM64 gcc image on the x86 runner (linux.2xlarge)
+        #   - Excluding the x86 clang image on the ARM64 runner (linux.arm64.2xlarge)
+        exclude:
+          - runner: linux.2xlarge
+            docker-image: executorch-ubuntu-22.04-gcc11-aarch64
+          - runner: linux.arm64.2xlarge
+            docker-image: executorch-ubuntu-22.04-clang12
+    with:
+      job-name: build
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      runner: ${{ matrix.runner }}
+      docker-image: ${{ matrix.docker-image }}
+      submodules: recursive
+      timeout: 90
+      script: |
+        set -eux
+        # The generic Linux job chooses to use the base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        ./install_requirements.sh > /dev/null
+        cmake --preset ${{ matrix.preset }}
+        cmake --build cmake-out --parallel
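The Linux job crosses one preset with two runners and two Docker images, then excludes the two runner/image pairs whose CPU architectures disagree, leaving exactly one image per runner. A small sketch of how that matrix expands, assuming GitHub Actions removes exact-match exclude entries from the cross product:

    from itertools import product

    presets = ["pybind"]
    runners = ["linux.2xlarge", "linux.arm64.2xlarge"]
    images = [
        "executorch-ubuntu-22.04-clang12",
        "executorch-ubuntu-22.04-gcc11-aarch64",
    ]

    # Runner/image pairs ruled out by the `exclude:` block above.
    excluded = {
        ("linux.2xlarge", "executorch-ubuntu-22.04-gcc11-aarch64"),
        ("linux.arm64.2xlarge", "executorch-ubuntu-22.04-clang12"),
    }

    jobs = [
        (p, r, i)
        for p, r, i in product(presets, runners, images)
        if (r, i) not in excluded
    ]
    # Two jobs remain: the x86 runner with the clang12 image and the
    # ARM64 runner with the gcc11-aarch64 image.
    assert len(jobs) == 2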

CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -582,6 +582,7 @@ if(EXECUTORCH_BUILD_PYBIND)
       ${TORCH_PYTHON_LIBRARY}
       bundled_program
       etdump
+      flatccrt
       executorch
       extension_data_loader
       util

CMakePresets.json

Lines changed: 15 additions & 0 deletions

@@ -15,6 +15,7 @@
     },
     {
       "name": "macos-arm64",
+      "displayName": "Build everything buildable on macOS arm64",
       "inherits": ["common"],
       "generator": "Xcode",
       "cacheVariables": {
@@ -28,6 +29,20 @@
         "type": "equals",
         "rhs": "Darwin"
       }
+    },
+    {
+      "name": "pybind",
+      "displayName": "Build pybindings exported in the wheel",
+      "inherits": ["common"],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/pybind.cmake",
+        "CMAKE_OSX_DEPLOYMENT_TARGET": "10.15"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Darwin", "Linux", "Windows"]
+      }
     }
   ]
 }

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@
 from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder  # noqa
 from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
 from .arm_pass import ArmPass  # noqa
+from .broadcast_args_pass import BroadcastArgsPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
 from .cast_to_int32_pass import CastToInt32Pass  # noqa
 from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions

@@ -10,6 +10,7 @@
 from executorch.backends.arm._passes import (
     AnnotateChannelsLastDimOrder,
     AnnotateDecomposedMatmulPass,
+    BroadcastArgsPass,
     CastInt64BuffersToInt32Pass,
     CastToInt32Pass,
     ComputeConstantOpsAOT,
@@ -104,6 +105,8 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
+        if self.tosa_spec.is_U55_subset:
+            self.add_pass(BroadcastArgsPass())
         self.add_pass(ComputeConstantOpsAOT(exported_program))
 
         self.add_pass(RemoveClonePass())
backends/arm/_passes/broadcast_args_pass.py (new file)

Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.arm._passes import ArmPass
+
+from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
+    get_first_fake_tensor,
+)
+
+from executorch.exir.dialects._ops import ops as exir_ops
+
+from executorch.exir.pass_base import PassResult
+from torch.fx import GraphModule, Node
+
+
+class BroadcastArgsPass(ArmPass):
+    """
+    Pass to manually broadcast arguments by inserting repeats.
+    This is done when more than one arg needs broadcasting.
+    """
+
+    targeted_ops = {
+        exir_ops.edge.aten.add.Tensor,
+        exir_ops.edge.aten.sub.Tensor,
+        # mul indirectly targets div as well, since div is decomposed to reciprocal + mul
+        exir_ops.edge.aten.mul.Tensor,
+    }
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function" or node.target not in self.targeted_ops:
+                continue
+
+            output_shape = get_first_fake_tensor(node).shape
+            nbr_of_broadcasts = 0
+            for arg in node.args:
+                if not isinstance(arg, Node):
+                    continue
+
+                shape = get_first_fake_tensor(arg).shape
+                if shape != output_shape:
+                    nbr_of_broadcasts += 1
+                    if nbr_of_broadcasts > 1:
+                        multiples = [
+                            int(output_shape[d] / shape[d])
+                            for d in range(len(output_shape))
+                        ]
+                        with graph_module.graph.inserting_before(node):
+                            repeat = create_node(
+                                graph_module.graph,
+                                exir_ops.edge.aten.repeat.default,
+                                args=(arg, multiples),
+                                kwargs={},
+                                from_node=node,
+                            )
+                            node.replace_input_with(arg, repeat)
+
+        graph_module.recompile()
+        graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, True)
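The multiples handed to repeat are the per-dimension ratios between the node's output shape and the argument's shape; since MatchArgRanksPass runs earlier in the pipeline, both shapes can be assumed to have equal rank. A small worked example with hypothetical shapes:

    # Hypothetical shapes for one binary op after rank matching.
    output_shape = (2, 3, 4)
    arg_shape = (1, 3, 1)

    # The same per-dimension computation the pass performs.
    multiples = [out // cur for out, cur in zip(output_shape, arg_shape)]
    assert multiples == [2, 1, 4]  # repeat dim 0 twice, dim 2 four times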

backends/arm/arm_vela.py

Lines changed: 2 additions & 2 deletions

@@ -73,8 +73,8 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
         np_path = os.path.join(tmpdir, "output", "out_vela.npz")
     else:
         np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
-    blocks = b""
 
+    blocks = b""
     with np.load(np_path, allow_pickle=False) as data:
         # Construct our modified output_blocks with data in a form easily
         # digested on the device side
@@ -92,7 +92,7 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
         if not isinstance(data["scratch_shape"][0], np.int64):
             raise RuntimeError("Expected scratch to be int64")
         block_length = int(data["scratch_shape"][0])
-        bin_blocks["scratch_data"] = b"\x00" * block_length
+        bin_blocks["scratch_size"] = struct.pack("<I", block_length)
 
         # Capture inputs and outputs
         bin_blocks["inputs"] = vela_bin_pack_io("input", data)
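The second hunk stops embedding block_length zero bytes of placeholder scratch data and instead records only the required size as a 4-byte little-endian unsigned integer, presumably so the device side can allocate the scratch region itself. A quick sketch of the size difference, using a hypothetical block length:

    import struct

    block_length = 1_048_576  # hypothetical 1 MiB scratch requirement

    old_payload = b"\x00" * block_length           # 1 MiB of zeros in the blob
    new_payload = struct.pack("<I", block_length)  # 4-byte little-endian size

    assert len(new_payload) == 4
    assert struct.unpack("<I", new_payload)[0] == block_length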

backends/arm/operator_support/tosa_supported_operators.py

Lines changed: 1 addition & 0 deletions

@@ -306,6 +306,7 @@ class CheckProperQuantization(OperatorSupportBase):
         exir_ops.edge.aten.sub.Tensor,
         exir_ops.edge.aten.upsample_bilinear2d.vec,
         exir_ops.edge.aten.upsample_nearest2d.vec,
+        torch.ops.aten.scalar_tensor.default,
         *TableOps.included_ops(),
     )
backends/arm/operators/op_neg.py

Lines changed: 10 additions & 11 deletions

@@ -16,7 +16,10 @@
     NodeVisitor,
     register_node_visitor,
 )
-
+from executorch.backends.arm.operators.operator_validation_utils import (
+    validate_num_inputs,
+    validate_same_dtype,
+)
 from executorch.backends.arm.tosa_mapping import TosaArg
 
 
@@ -60,14 +63,12 @@ def define_node(
             ts.DType.FP32,
         }
 
+        validate_num_inputs(self.target, inputs, 1)
+        validate_same_dtype(self.target, [*inputs, output])
+
         if inputs[0].dtype not in supported_dtypes:
             raise ValueError(f"Unsupported dtype for NEGATE: {inputs[0].dtype}")
 
-        if inputs[0].dtype != output.dtype:
-            raise ValueError(
-                "All inputs and output need same dtype."
-                f"Got {inputs[0].dtype=}, {output.dtype=}"
-            )
         input_zp, output_zp = get_negate_zero_points(
             node, inputs[0].dtype == ts.DType.INT8
         )
@@ -109,14 +110,12 @@ def define_node(
             ts.DType.FP32,
         }
 
+        validate_num_inputs(self.target, inputs, 1)
+        validate_same_dtype(self.target, [*inputs, output])
+
         if inputs[0].dtype not in supported_dtypes:
             raise ValueError(f"Unsupported dtype for NEGATE: {inputs[0].dtype}")
 
-        if inputs[0].dtype != output.dtype:
-            raise ValueError(
-                "All inputs and output need same dtype."
-                f"Got {inputs[0].dtype=}, {output.dtype=}"
-            )
         input_zp, output_zp = get_negate_zero_points(
             node, inputs[0].dtype == ts.DType.INT8
         )
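Both NEGATE visitors now delegate their input checks to shared helpers from operator_validation_utils. The helpers' bodies are not part of this commit; a minimal sketch of what they plausibly look like, with hypothetical implementations:

    # Hypothetical implementations; the real helpers live in
    # backends/arm/operators/operator_validation_utils.py and may differ
    # in signature details and error wording.
    def validate_num_inputs(op_target, inputs, expected: int) -> None:
        if len(inputs) != expected:
            raise ValueError(
                f"{op_target}: expected {expected} input(s), got {len(inputs)}"
            )

    def validate_same_dtype(op_target, tensors) -> None:
        dtypes = {t.dtype for t in tensors}
        if len(dtypes) > 1:
            raise ValueError(
                f"{op_target}: all inputs and output need the same dtype, got {dtypes}"
            )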
