Skip to content

Commit 01db40e

Browse files
committed
Update base for Update on "Arm backend: Add 16A8W support for view and transpose operations"
Add 16A8W quantization support for view and transpose operations in ExecuTorch ARM backend. This follows the pattern established for linear, mul, sigmoid, tanh, and slice operations, extending int16 support to view and transpose operations. Changes: - Add INT16 dtype validation support in op_transpose.py - Add test_view_tensor_16a8w_tosa_INT test function - Enable test_view.py in test targets configuration The 16A8W configuration uses 16-bit activations with 8-bit weights, enabling higher precision for activations while maintaining weight efficiency. Differential Revision: [D80511313](https://our.internmc.facebook.com/intern/diff/D80511313/) cc digantdesai freddan80 per zingo oscarandersson8218 [ghstack-poisoned]
2 parents 7a08624 + 705150c commit 01db40e

File tree

117 files changed

+3749
-773
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

117 files changed

+3749
-773
lines changed

.ci/scripts/test_model.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ test_model() {
9797
bash examples/models/llava/install_requirements.sh
9898
STRICT="--no-strict"
9999
fi
100-
if [[ "${MODEL_NAME}" == "qwen2_5" ]]; then
100+
if [[ "${MODEL_NAME}" == "qwen2_5_1_5b" ]]; then
101101
# Install requirements for export_llama
102102
bash examples/models/llama/install_requirements.sh
103103
# Test export_llm script: python3 -m extension.llm.export.export_llm.

.github/workflows/trunk.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ jobs:
176176
- model: phi_4_mini
177177
backend: portable
178178
runner: linux.arm64.m7g.4xlarge
179-
- model: qwen2_5
179+
- model: qwen2_5_1_5b
180180
backend: portable
181181
runner: linux.arm64.2xlarge
182182
- model: llama3_2_vision_encoder
@@ -823,10 +823,10 @@ jobs:
823823
--tsv_path ${TSV_PATH}
824824
echo "::endgroup::"
825825
826-
test-huggingface-transformers-coreml:
826+
test-huggingface-transformers-macos:
827827
# NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
828828
if: ${{ !github.event.pull_request.head.repo.fork }}
829-
name: test-huggingface-transformers-coreml
829+
name: test-huggingface-transformers-macos
830830
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
831831
permissions:
832832
id-token: write
@@ -844,10 +844,10 @@ jobs:
844844
# phi4-mini|xnnpack|--quantize,
845845
# smollm2-135m|xnnpack|--quantize,
846846
# smollm3-3b|xnnpack|--quantize,
847+
# qwen3-1.7b|xnnpack|--quantize,
847848
# CoreML.
848849
llama3.2-1b|coreml_fp32_gpu|--quantize,
849850
qwen3-0.6b|coreml_fp32_gpu|--quantize,
850-
qwen3-1.7b|xnnpack|--quantize,
851851
smollm2-135m|coreml_fp32_gpu|--quantize,
852852
olmo-1b|coreml_fp32_gpu|--quantize,
853853
bert|coreml_fp32_gpu|--quantize,

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ To get started you can:
5252

5353
- Visit the [Step by Step Tutorial](https://pytorch.org/executorch/stable/getting-started.html) to get things running locally and deploy a model to a device
5454
- Use this [Colab Notebook](https://colab.research.google.com/drive/1qpxrXC3YdJQzly3mRg-4ayYiOjC6rue3?usp=sharing) to start playing around right away
55-
- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), and [Llava](examples/models/llava/README.md)
55+
- Jump straight into LLM use cases by following specific instructions for popular open-source models such as [Llama](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), [Llava](examples/models/llava/README.md), [Voxtral](examples/models/voxtral/README.md), and [LFM2](examples/models/lfm2/README.md).
5656

5757
## Feedback and Engagement
5858

backends/apple/coreml/TARGETS

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,16 +61,21 @@ runtime.python_library(
6161
)
6262

6363
runtime.python_library(
64-
name = "recipes",
65-
srcs = glob([
66-
"recipes/*.py",
67-
]),
64+
name = "coreml_recipes",
65+
srcs = [
66+
"recipes/__init__.py",
67+
"recipes/coreml_recipe_provider.py"
68+
],
6869
visibility = [
6970
"@EXECUTORCH_CLIENTS",
71+
"//executorch/export/...",
7072
],
7173
deps = [
7274
"fbsource//third-party/pypi/coremltools:coremltools",
75+
":coreml_recipe_types",
7376
":backend",
77+
":partitioner",
78+
":quantizer",
7479
"//caffe2:torch",
7580
"//executorch/exir:lib",
7681
"//executorch/exir/backend:compile_spec_schema",
@@ -80,6 +85,20 @@ runtime.python_library(
8085
],
8186
)
8287

88+
runtime.python_library(
89+
name = "coreml_recipe_types",
90+
srcs = [
91+
"recipes/coreml_recipe_types.py",
92+
],
93+
visibility = [
94+
"@EXECUTORCH_CLIENTS",
95+
"//executorch/export/...",
96+
],
97+
deps = [
98+
"//executorch/export:recipe",
99+
],
100+
)
101+
83102
runtime.cxx_python_extension(
84103
name = "executorchcoreml",
85104
srcs = [
@@ -124,7 +143,7 @@ runtime.python_test(
124143
"fbsource//third-party/pypi/pytest:pytest",
125144
":partitioner",
126145
":quantizer",
127-
":recipes",
146+
":coreml_recipes",
128147
"//caffe2:torch",
129148
"//pytorch/vision:torchvision",
130149
"fbsource//third-party/pypi/scikit-learn:scikit-learn",

backends/apple/coreml/compiler/torch_ops.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,6 @@ def dequantize_affine(context, node):
175175
int_data.astype(quantized_np_dtype),
176176
zero_point,
177177
scale,
178-
axis=-1,
179178
name=node.name,
180179
)
181180
context.add(output, node.name)

backends/apple/coreml/recipes/coreml_recipe_provider.py

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# Please refer to the license found in the LICENSE file in the root directory of the source tree.
44

55

6+
import logging
67
from typing import Any, Optional, Sequence
78

89
import coremltools as ct
@@ -111,8 +112,9 @@ def _validate_recipe_kwargs(self, recipe_type: RecipeType, **kwargs: Any) -> Non
111112

112113
unexpected = set(kwargs.keys()) - expected_keys
113114
if unexpected:
114-
raise ValueError(
115-
f"Recipe '{recipe_type.value}' received unexpected parameters: {list(unexpected)}"
115+
logging.warning(
116+
f"CoreML recipe '{recipe_type.value}' ignoring unexpected parameters: {list(unexpected)}. "
117+
f"Expected parameters: {list(expected_keys)}"
116118
)
117119

118120
self._validate_base_parameters(kwargs)
@@ -121,7 +123,13 @@ def _validate_recipe_kwargs(self, recipe_type: RecipeType, **kwargs: Any) -> Non
121123

122124
def _get_expected_keys(self, recipe_type: RecipeType) -> set:
123125
"""Get expected parameter keys for a recipe type"""
124-
common_keys = {"minimum_deployment_target", "compute_unit"}
126+
common_keys = {
127+
"minimum_deployment_target",
128+
"compute_unit",
129+
"skip_ops_for_coreml_delegation",
130+
"lower_full_graph",
131+
"take_over_constant_data",
132+
}
125133

126134
if recipe_type in [
127135
CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_GROUP,
@@ -377,9 +385,19 @@ def _get_coreml_lowering_recipe(
377385
if minimum_deployment_target and minimum_deployment_target < ct.target.iOS18:
378386
take_over_mutable_buffer = False
379387

388+
# Extract additional partitioner parameters
389+
skip_ops_for_coreml_delegation = kwargs.get(
390+
"skip_ops_for_coreml_delegation", None
391+
)
392+
lower_full_graph = kwargs.get("lower_full_graph", False)
393+
take_over_constant_data = kwargs.get("take_over_constant_data", True)
394+
380395
partitioner = CoreMLPartitioner(
381396
compile_specs=compile_specs,
382397
take_over_mutable_buffer=take_over_mutable_buffer,
398+
skip_ops_for_coreml_delegation=skip_ops_for_coreml_delegation,
399+
lower_full_graph=lower_full_graph,
400+
take_over_constant_data=take_over_constant_data,
383401
)
384402

385403
edge_compile_config = EdgeCompileConfig(

backends/apple/coreml/test/test_coreml_recipes.py

Lines changed: 0 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -185,14 +185,6 @@ def test_int4_weight_only_per_group_validation(self):
185185
)
186186
self.assertIn("must be positive", str(cm.exception))
187187

188-
# Test unexpected parameter
189-
with self.assertRaises(ValueError) as cm:
190-
self.provider.create_recipe(
191-
CoreMLRecipeType.TORCHAO_INT4_WEIGHT_ONLY_PER_CHANNEL,
192-
group_size=32, # group_size not valid for per-channel
193-
)
194-
self.assertIn("unexpected parameters", str(cm.exception))
195-
196188
def test_int8_weight_only_per_channel(self):
197189
"""Test INT8 weight-only per-channel quantization"""
198190
model = TestHelperModules.TwoLinearModule().eval()
@@ -385,23 +377,6 @@ def forward(self, x):
385377
self._compare_eager_quantized_model_outputs(session, example_inputs, atol=1e-2)
386378
self._compare_eager_unquantized_model_outputs(session, model, example_inputs)
387379

388-
def test_pt2e_recipes_parameter_rejection(self):
389-
"""Test that PT2E recipes reject TorchAO-specific parameters"""
390-
# PT2E recipes should reject TorchAO-specific parameters
391-
pt2e_recipes = [
392-
CoreMLRecipeType.PT2E_INT8_STATIC,
393-
CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY,
394-
]
395-
torchao_params = ["filter_fn", "group_size", "bits", "block_size"]
396-
397-
for recipe_type in pt2e_recipes:
398-
for param in torchao_params:
399-
with self.subTest(recipe=recipe_type.value, param=param):
400-
kwargs = {param: "dummy_value"}
401-
with self.assertRaises(ValueError) as cm:
402-
self.provider.create_recipe(recipe_type, **kwargs)
403-
self.assertIn("unexpected parameters", str(cm.exception).lower())
404-
405380
def test_filter_fn_comprehensive(self):
406381
"""Comprehensive test for filter_fn parameter functionality"""
407382

backends/apple/coreml/test/test_torch_ops.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@
2727
class TestTorchOps(unittest.TestCase):
2828
edge_compile_config = executorch.exir.EdgeCompileConfig()
2929

30-
def _coreml_partitioner(self):
30+
def _coreml_partitioner(self, *, minimum_deployment_target=ct.target.iOS18):
3131
compile_specs = CoreMLBackend.generate_compile_specs(
32-
minimum_deployment_target=ct.target.iOS18
32+
minimum_deployment_target=minimum_deployment_target
3333
)
3434
return CoreMLPartitioner(compile_specs=compile_specs)
3535

@@ -158,6 +158,33 @@ def test_dequantize_affine_c8w_embedding_b4w_linear(self):
158158
et_prog = delegated_program.to_executorch()
159159
self._compare_outputs(et_prog, model, example_inputs)
160160

161+
def test_dequantize_affine_c8w_embedding_c8w_linear_ios16(self):
162+
model, example_inputs = self._get_test_model()
163+
quantize_(
164+
model,
165+
IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)),
166+
lambda m, fqn: isinstance(m, torch.nn.Embedding),
167+
)
168+
quantize_(
169+
model,
170+
IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)),
171+
)
172+
ep = torch.export.export(model, example_inputs)
173+
delegated_program = executorch.exir.to_edge_transform_and_lower(
174+
ep,
175+
partitioner=[
176+
self._coreml_partitioner(minimum_deployment_target=ct.target.iOS16)
177+
],
178+
)
179+
for node in delegated_program.exported_program().graph.nodes:
180+
if node.op == "call_function":
181+
assert node.target.__name__ in [
182+
"executorch_call_delegate",
183+
"getitem",
184+
], f"Got unexpected node target after delegation: {node.target.__name__}"
185+
et_prog = delegated_program.to_executorch()
186+
self._compare_outputs(et_prog, model, example_inputs)
187+
161188
def test_dequantize_codebook_linear_per_grouped_col(self):
162189
model, example_inputs = self._get_test_model()
163190
quantize_(
@@ -298,6 +325,7 @@ def forward(self, x):
298325
test_runner.test_dequantize_affine_c4w_embedding()
299326
test_runner.test_dequantize_affine_c4w_linear()
300327
test_runner.test_dequantize_affine_c8w_embedding_b4w_linear()
328+
test_runner.test_dequantize_affine_c8w_embedding_c8w_linear_ios16()
301329
test_runner.test_dequantize_codebook_linear_per_grouped_col()
302330
test_runner.test_dequantize_codebook_linear_per_grouped_row()
303331
test_runner.test_dequantize_codebook_embedding_per_grouped_col()

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass # noqa
3838
from .decompose_cumsum_pass import DecomposeCumsumPass # noqa
3939
from .decompose_div_pass import DecomposeDivPass # noqa
40+
from .decompose_div_tensor_mode import DecomposeDivTensorModePass # noqa
4041
from .decompose_elu_pass import DecomposeEluPass # noqa
4142
from .decompose_embedding_pass import DecomposeEmbeddingPass # noqa # noqa
4243
from .decompose_expm1_pass import DecomposeExpm1Pass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
DecomposeCosineSimilarityPass,
4343
DecomposeCumsumPass,
4444
DecomposeDivPass,
45+
DecomposeDivTensorModePass,
4546
DecomposeEluPass,
4647
DecomposeEmbeddingPass,
4748
DecomposeExpm1Pass,
@@ -211,6 +212,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
211212
DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec)
212213
)
213214
self.add_pass(DecomposeNotEqualPass())
215+
self.add_pass(DecomposeDivTensorModePass())
214216
self.add_pass(DecomposeDivPass())
215217
self.add_pass(DecomposeSoftmaxPass())
216218
self.add_pass(DecomposeGeluPass())
@@ -289,6 +291,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
289291
self.add_pass(DecomposeNotEqualPass())
290292
self.add_pass(DecomposeCosineSimilarityPass())
291293
self.add_pass(DecomposeGluPass())
294+
self.add_pass(DecomposeDivTensorModePass())
292295
self.add_pass(DecomposeDivPass())
293296
self.add_pass(DecomposeLeakyReLUPass())
294297
self.add_pass(DecomposeLinearVectorNormPass())

0 commit comments

Comments
 (0)