
Commit 5b5ecce

Merge remote-tracking branch 'origin/main' into toupstream/model_update

2 parents 90d9b10 + 7d9dd46
File tree: 78 files changed (+3693, -766 lines)


.ci/scripts/build-qnn-sdk.sh (file mode changed: 100644 → 100755)

Lines changed: 4 additions & 9 deletions

@@ -11,17 +11,12 @@ set -o xtrace
 
 build_qnn_backend() {
   echo "Start building qnn backend."
-  export ANDROID_NDK_ROOT=/opt/ndk
-  export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
+  export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/opt/ndk}
+  export QNN_SDK_ROOT=${QNN_SDK_ROOT:-/tmp/qnn/2.28.0.241029}
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
 
-  # Workaround to avoid issues around missing flatccrt library (depending on the
-  # number of jobs used), see issue #7300:
-  # Build twice (second time with `--no_clean`) to make sure libflatccrt.a is
-  # available.
-  # TODO: Remove this workaround once the underlying issue is fixed.
-  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release || \
-    bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release --no_clean
+  parallelism=$(( $(nproc) - 1 ))
+  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number ${parallelism} --release
 }
 
 set_up_aot() {
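The rewritten exports fall back to the previous hard-coded paths only when the caller has not already set ANDROID_NDK_ROOT or QNN_SDK_ROOT, and the build parallelism now tracks the host's core count instead of being pinned to 2. A minimal Python sketch of the same two idioms, using nothing beyond the standard library:

    import os

    # bash's ${VAR:-default} substitutes the default when the variable is
    # unset *or* empty; `or` reproduces both cases here.
    qnn_sdk_root = os.environ.get("QNN_SDK_ROOT") or "/tmp/qnn/2.28.0.241029"

    # Like $(( $(nproc) - 1 )): leave one core free, clamped to at least 1.
    parallelism = max((os.cpu_count() or 2) - 1, 1)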

.github/workflows/build-presets.yml

Lines changed: 39 additions & 1 deletion

@@ -6,6 +6,8 @@ on:
     branches:
       - main
       - release/*
+    paths:
+      - .github/workflows/build-presets.yml
   workflow_dispatch:
 
 concurrency:
@@ -16,15 +18,51 @@ jobs:
   apple:
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     strategy:
+      fail-fast: false
       matrix:
-        preset: [macos-arm64]
+        preset: [macos-arm64, pybind]
     with:
       job-name: build
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       runner: macos-latest-xlarge
       python-version: 3.12
       submodules: recursive
+      timeout: 90
       script: |
         set -eux
         ${CONDA_RUN} ./install_requirements.sh > /dev/null
         ${CONDA_RUN} cmake --preset ${{ matrix.preset }}
         ${CONDA_RUN} cmake --build cmake-out --parallel
+
+  linux:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    strategy:
+      fail-fast: false
+      matrix:
+        preset: [pybind]
+        runner: [linux.2xlarge, linux.arm64.2xlarge]
+        docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64]
+        # Excluding specific runner + docker image combinations that don't make sense:
+        #   - Excluding the ARM64 gcc image on the x86 runner (linux.2xlarge)
+        #   - Excluding the x86 clang image on the ARM64 runner (linux.arm64.2xlarge)
+        exclude:
+          - runner: linux.2xlarge
+            docker-image: executorch-ubuntu-22.04-gcc11-aarch64
+          - runner: linux.arm64.2xlarge
+            docker-image: executorch-ubuntu-22.04-clang12
+    with:
+      job-name: build
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      runner: ${{ matrix.runner }}
+      docker-image: ${{ matrix.docker-image }}
+      submodules: recursive
+      timeout: 90
+      script: |
+        set -eux
+        # The generic Linux job chooses to use the base env, not the one set up by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        ./install_requirements.sh > /dev/null
+        cmake --preset ${{ matrix.preset }}
+        cmake --build cmake-out --parallel
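The Linux job crosses one preset with two runners and two Docker images, then excludes the two runner/image pairs whose CPU architectures disagree, leaving exactly one image per runner. A small sketch of how that matrix expands, assuming GitHub Actions removes exact-match exclude entries from the cross product:

    from itertools import product

    presets = ["pybind"]
    runners = ["linux.2xlarge", "linux.arm64.2xlarge"]
    images = [
        "executorch-ubuntu-22.04-clang12",
        "executorch-ubuntu-22.04-gcc11-aarch64",
    ]

    # Runner/image pairs ruled out by the `exclude:` block above.
    excluded = {
        ("linux.2xlarge", "executorch-ubuntu-22.04-gcc11-aarch64"),
        ("linux.arm64.2xlarge", "executorch-ubuntu-22.04-clang12"),
    }

    jobs = [
        (p, r, i)
        for p, r, i in product(presets, runners, images)
        if (r, i) not in excluded
    ]
    # Two jobs remain: the x86 runner with the clang12 image and the
    # ARM64 runner with the gcc11-aarch64 image.
    assert len(jobs) == 2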

CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -582,6 +582,7 @@ if(EXECUTORCH_BUILD_PYBIND)
       ${TORCH_PYTHON_LIBRARY}
       bundled_program
       etdump
+      flatccrt
       executorch
       extension_data_loader
       util

CMakePresets.json

Lines changed: 15 additions & 0 deletions

@@ -15,6 +15,7 @@
     },
     {
       "name": "macos-arm64",
+      "displayName": "Build everything buildable on macOS arm64",
       "inherits": ["common"],
       "generator": "Xcode",
       "cacheVariables": {
@@ -28,6 +29,20 @@
         "type": "equals",
         "rhs": "Darwin"
       }
+    },
+    {
+      "name": "pybind",
+      "displayName": "Build pybindings exported in the wheel",
+      "inherits": ["common"],
+      "cacheVariables": {
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/pybind.cmake",
+        "CMAKE_OSX_DEPLOYMENT_TARGET": "10.15"
+      },
+      "condition": {
+        "type": "inList",
+        "string": "${hostSystemName}",
+        "list": ["Darwin", "Linux", "Windows"]
+      }
     }
   ]
 }

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -8,6 +8,7 @@
 from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder  # noqa
 from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
 from .arm_pass import ArmPass  # noqa
+from .broadcast_args_pass import BroadcastArgsPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
 from .cast_to_int32_pass import CastToInt32Pass  # noqa
 from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions

@@ -10,6 +10,7 @@
 from executorch.backends.arm._passes import (
     AnnotateChannelsLastDimOrder,
     AnnotateDecomposedMatmulPass,
+    BroadcastArgsPass,
     CastInt64BuffersToInt32Pass,
     CastToInt32Pass,
     ComputeConstantOpsAOT,
@@ -104,6 +105,8 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
+        if self.tosa_spec.is_U55_subset:
+            self.add_pass(BroadcastArgsPass())
         self.add_pass(ComputeConstantOpsAOT(exported_program))
 
         self.add_pass(RemoveClonePass())
backends/arm/_passes/broadcast_args_pass.py (new file)

Lines changed: 63 additions & 0 deletions

@@ -0,0 +1,63 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.arm._passes import ArmPass
+
+from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
+    get_first_fake_tensor,
+)
+
+from executorch.exir.dialects._ops import ops as exir_ops
+
+from executorch.exir.pass_base import PassResult
+from torch.fx import GraphModule, Node
+
+
+class BroadcastArgsPass(ArmPass):
+    """
+    Pass to manually broadcast arguments by inserting repeats.
+    This is done when more than one arg needs broadcasting.
+    """
+
+    targeted_ops = {
+        exir_ops.edge.aten.add.Tensor,
+        exir_ops.edge.aten.sub.Tensor,
+        # mul indirectly targets div as well, since div is decomposed to reciprocal + mul
+        exir_ops.edge.aten.mul.Tensor,
+    }
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function" or node.target not in self.targeted_ops:
+                continue
+
+            output_shape = get_first_fake_tensor(node).shape
+            nbr_of_broadcasts = 0
+            for arg in node.args:
+                if not isinstance(arg, Node):
+                    continue
+
+                shape = get_first_fake_tensor(arg).shape
+                if shape != output_shape:
+                    nbr_of_broadcasts += 1
+                    if nbr_of_broadcasts > 1:
+                        multiples = [
+                            int(output_shape[d] / shape[d])
+                            for d in range(len(output_shape))
+                        ]
+                        with graph_module.graph.inserting_before(node):
+                            repeat = create_node(
+                                graph_module.graph,
+                                exir_ops.edge.aten.repeat.default,
+                                args=(arg, multiples),
+                                kwargs={},
+                                from_node=node,
+                            )
+                            node.replace_input_with(arg, repeat)
+
+        graph_module.recompile()
+        graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, True)
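The multiples handed to repeat are the per-dimension ratios between the node's output shape and the argument's shape; since MatchArgRanksPass runs earlier in the pipeline, both shapes can be assumed to have equal rank. A small worked example with hypothetical shapes:

    # Hypothetical shapes for one binary op after rank matching.
    output_shape = (2, 3, 4)
    arg_shape = (1, 3, 1)

    # The same per-dimension computation the pass performs.
    multiples = [out // cur for out, cur in zip(output_shape, arg_shape)]
    assert multiples == [2, 1, 4]  # repeat dim 0 twice, dim 2 four times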

backends/arm/arm_vela.py

Lines changed: 2 additions & 2 deletions

@@ -73,8 +73,8 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
         np_path = os.path.join(tmpdir, "output", "out_vela.npz")
     else:
         np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
-    blocks = b""
 
+    blocks = b""
     with np.load(np_path, allow_pickle=False) as data:
         # Construct our modified output_blocks with data in a form easily
         # digested on the device side
@@ -92,7 +92,7 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
         if not isinstance(data["scratch_shape"][0], np.int64):
             raise RuntimeError("Expected scratch to be int64")
         block_length = int(data["scratch_shape"][0])
-        bin_blocks["scratch_data"] = b"\x00" * block_length
+        bin_blocks["scratch_size"] = struct.pack("<I", block_length)
 
         # Capture inputs and outputs
         bin_blocks["inputs"] = vela_bin_pack_io("input", data)
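The second hunk stops embedding block_length zero bytes of placeholder scratch data and instead records only the required size as a 4-byte little-endian unsigned integer, presumably so the device side can allocate the scratch region itself. A quick sketch of the size difference, using a hypothetical block length:

    import struct

    block_length = 1_048_576  # hypothetical 1 MiB scratch requirement

    old_payload = b"\x00" * block_length           # 1 MiB of zeros in the blob
    new_payload = struct.pack("<I", block_length)  # 4-byte little-endian size

    assert len(new_payload) == 4
    assert struct.unpack("<I", new_payload)[0] == block_length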

backends/arm/operator_support/tosa_supported_operators.py

Lines changed: 1 addition & 0 deletions

@@ -306,6 +306,7 @@ class CheckProperQuantization(OperatorSupportBase):
         exir_ops.edge.aten.sub.Tensor,
         exir_ops.edge.aten.upsample_bilinear2d.vec,
         exir_ops.edge.aten.upsample_nearest2d.vec,
+        torch.ops.aten.scalar_tensor.default,
         *TableOps.included_ops(),
     )
backends/arm/operators/op_neg.py

Lines changed: 10 additions & 11 deletions

@@ -16,7 +16,10 @@
     NodeVisitor,
     register_node_visitor,
 )
-
+from executorch.backends.arm.operators.operator_validation_utils import (
+    validate_num_inputs,
+    validate_same_dtype,
+)
 from executorch.backends.arm.tosa_mapping import TosaArg
 
 
@@ -60,14 +63,12 @@ def define_node(
             ts.DType.FP32,
         }
 
+        validate_num_inputs(self.target, inputs, 1)
+        validate_same_dtype(self.target, [*inputs, output])
+
         if inputs[0].dtype not in supported_dtypes:
             raise ValueError(f"Unsupported dtype for NEGATE: {inputs[0].dtype}")
 
-        if inputs[0].dtype != output.dtype:
-            raise ValueError(
-                "All inputs and output need same dtype."
-                f"Got {inputs[0].dtype=}, {output.dtype=}"
-            )
         input_zp, output_zp = get_negate_zero_points(
             node, inputs[0].dtype == ts.DType.INT8
         )
@@ -109,14 +110,12 @@ def define_node(
             ts.DType.FP32,
         }
 
+        validate_num_inputs(self.target, inputs, 1)
+        validate_same_dtype(self.target, [*inputs, output])
+
         if inputs[0].dtype not in supported_dtypes:
             raise ValueError(f"Unsupported dtype for NEGATE: {inputs[0].dtype}")
 
-        if inputs[0].dtype != output.dtype:
-            raise ValueError(
-                "All inputs and output need same dtype."
-                f"Got {inputs[0].dtype=}, {output.dtype=}"
-            )
         input_zp, output_zp = get_negate_zero_points(
             node, inputs[0].dtype == ts.DType.INT8
         )
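Both NEGATE visitors now delegate their input checks to shared helpers from operator_validation_utils. The helpers' bodies are not part of this commit; a minimal sketch of what they plausibly look like, with hypothetical implementations:

    # Hypothetical implementations; the real helpers live in
    # backends/arm/operators/operator_validation_utils.py and may differ
    # in signature details and error wording.
    def validate_num_inputs(op_target, inputs, expected: int) -> None:
        if len(inputs) != expected:
            raise ValueError(
                f"{op_target}: expected {expected} input(s), got {len(inputs)}"
            )

    def validate_same_dtype(op_target, tensors) -> None:
        dtypes = {t.dtype for t in tensors}
        if len(dtypes) > 1:
            raise ValueError(
                f"{op_target}: all inputs and output need the same dtype, got {dtypes}"
            )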
