
Commit a6181ef

Update
[ghstack-poisoned]
2 parents f4e54ae + 6d8583d commit a6181ef

406 files changed: +12661 −7654 lines
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-e7152ff8a6a929a0db7f3f4a72a5b6d471769cd3
+4d4abec80f03cd8fdefe1d9cb3a60d3690cd777e

.ci/scripts/setup-samsung-linux-deps.sh

Lines changed: 0 additions & 9 deletions

@@ -54,15 +54,6 @@ install_enn_backend() {
   rm -rf "${NDK_INSTALLATION_DIR}" && sudo mkdir -p "${NDK_INSTALLATION_DIR}"
   ANDROID_NDK_VERSION=r27b

-  pushd .
-  cd /tmp
-  curl -Os --retry 3 "https://ossci-android.s3.amazonaws.com/android-ndk-${ANDROID_NDK_VERSION}-linux.zip"
-  unzip -qo "android-ndk-${ANDROID_NDK_VERSION}-linux.zip"
-
-  # Print the content for manual verification
-  ls -lah "android-ndk-${ANDROID_NDK_VERSION}"
-  sudo mv "android-ndk-${ANDROID_NDK_VERSION}"/* "${NDK_INSTALLATION_DIR}"
-  popd
   # build Exynos backend
   export ANDROID_NDK_ROOT=${ANDROID_NDK_ROOT:-/opt/ndk}
   bash backends/samsung/build.sh --build all
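With the manual NDK download removed, the surviving export leans on bash's default-value expansion: ${ANDROID_NDK_ROOT:-/opt/ndk} resolves to the preinstalled NDK unless the caller overrides it. A minimal sketch of the pattern (illustrative only):

# ${VAR:-default} expands to $VAR when set and non-empty, otherwise to the default.
unset ANDROID_NDK_ROOT
echo "${ANDROID_NDK_ROOT:-/opt/ndk}"    # prints /opt/ndk

ANDROID_NDK_ROOT=/custom/ndk
echo "${ANDROID_NDK_ROOT:-/opt/ndk}"    # prints /custom/ndk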

.ci/scripts/test_model.ps1

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ function ExportModel-Xnnpack {
     [bool]$quantize
   )

-  if $(quantize) {
+  if ($quantize) {
     python -m examples.xnnpack.aot_compiler --model_name="${MODEL_NAME}" --delegate --quantize | Write-Host
     $modelFile = "$($modelName)_xnnpack_q8.pte"
   } else {
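Note on the fix: PowerShell requires the condition of an `if` statement to be parenthesized, so `if $(quantize) {` does not parse, and `$(quantize)` is a subexpression that would invoke a command named quantize rather than read the $quantize parameter. `if ($quantize) {` tests the boolean parameter as intended.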

Lines changed: 139 additions & 0 deletions

@@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# -------------------------
+# Args / flags
+# -------------------------
+TEST_WITH_RUNNER=0
+MODEL_NAME=""
+
+# Parse args
+if [[ $# -lt 1 ]]; then
+  echo "Usage: $0 <model_name> [--test_with_runner]"
+  echo "Supported model_name values: qwen3_4b, phi_4_mini"
+  exit 1
+fi
+
+MODEL_NAME="$1"
+shift
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --test_with_runner)
+      TEST_WITH_RUNNER=1
+      ;;
+    -h|--help)
+      echo "Usage: $0 <model_name> [--test_with_runner]"
+      echo "  model_name: qwen3_4b | phi_4_mini"
+      echo "  --test_with_runner: build ET + run llama_main to sanity-check the export"
+      exit 0
+      ;;
+    *)
+      echo "Unknown option: $1"
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+  PYTHON_EXECUTABLE=python3
+fi
+
+MODEL_OUT=model.pte
+
+case "$MODEL_NAME" in
+  qwen3_4b)
+    echo "Running Qwen3-4B export..."
+    HF_MODEL_DIR=$(hf download pytorch/Qwen3-4B-INT8-INT4)
+    EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB
+    $PYTHON_EXECUTABLE -m executorch.examples.models.qwen3.convert_weights \
+      $HF_MODEL_DIR \
+      pytorch_model_converted.bin
+
+    $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \
+      --model "qwen3_4b" \
+      --checkpoint pytorch_model_converted.bin \
+      --params examples/models/qwen3/config/4b_config.json \
+      --output_name $MODEL_OUT \
+      -kv \
+      --use_sdpa_with_kv_cache \
+      -X \
+      --xnnpack-extended-ops \
+      --max_context_length 1024 \
+      --max_seq_length 1024 \
+      --dtype fp32 \
+      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
+    ;;
+
+  phi_4_mini)
+    echo "Running Phi-4-mini export..."
+    HF_MODEL_DIR=$(hf download pytorch/Phi-4-mini-instruct-INT8-INT4)
+    EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB
+    $PYTHON_EXECUTABLE -m executorch.examples.models.phi_4_mini.convert_weights \
+      $HF_MODEL_DIR \
+      pytorch_model_converted.bin
+
+    $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \
+      --model "phi_4_mini" \
+      --checkpoint pytorch_model_converted.bin \
+      --params examples/models/phi_4_mini/config/config.json \
+      --output_name $MODEL_OUT \
+      -kv \
+      --use_sdpa_with_kv_cache \
+      -X \
+      --xnnpack-extended-ops \
+      --max_context_length 1024 \
+      --max_seq_length 1024 \
+      --dtype fp32 \
+      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
+    ;;
+
+  *)
+    echo "Error: unsupported model_name '$MODEL_NAME'"
+    echo "Supported values: qwen3_4b, phi_4_mini"
+    exit 1
+    ;;
+esac
+
+# Check file size
+MODEL_SIZE=$(stat --printf="%s" $MODEL_OUT 2>/dev/null || stat -f%z $MODEL_OUT)
+if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then
+  echo "Error: model size $MODEL_SIZE is greater than expected upper bound $EXPECTED_MODEL_SIZE_UPPER_BOUND"
+  exit 1
+fi
+
+# Install ET with CMake
+if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
+  echo "[runner] Building and testing llama_main ..."
+  cmake -DPYTHON_EXECUTABLE=python \
+    -DCMAKE_INSTALL_PREFIX=cmake-out \
+    -DEXECUTORCH_ENABLE_LOGGING=1 \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+    -DEXECUTORCH_BUILD_XNNPACK=ON \
+    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
+    -Bcmake-out .
+  cmake --build cmake-out -j16 --config Release --target install
+
+
+  # Install llama runner
+  cmake -DPYTHON_EXECUTABLE=python \
+    -DCMAKE_BUILD_TYPE=Release \
+    -Bcmake-out/examples/models/llama \
+    examples/models/llama
+  cmake --build cmake-out/examples/models/llama -j16 --config Release
+
+  # Run the model
+  ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path="${HF_MODEL_DIR}/tokenizer.json" --prompt="Once upon a time,"
+fi
+
+# Clean up
+rm -f pytorch_model_converted.bin "$MODEL_OUT"
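For reference, a minimal sketch of how this new script would be invoked (the file's path is not shown in this capture, so the name below is hypothetical):

# Hypothetical file name; substitute the actual path from the commit.
bash export_llm_test.sh qwen3_4b
bash export_llm_test.sh phi_4_mini --test_with_runner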

.ci/scripts/unittest-buck2.sh

Lines changed: 5 additions & 1 deletion

@@ -9,7 +9,11 @@ set -eux
 # TODO: expand this to //...
 # TODO: can't query cadence & vulkan backends
 # TODO: can't query //kernels/prim_ops because of non-buckified stuff in OSS.
-buck2 query "//backends/apple/... + //backends/example/... + \
+# TODO: Make //backends/arm tests use runtime wrapper so we can just query //backends/arm/...
+buck2 query "//backends/apple/... + //backends/arm: + //backends/arm/debug/... + \
+//backends/arm/operator_support/... + //backends/arm/operators/... + \
+//backends/arm/_passes/... + //backends/arm/runtime/... + //backends/arm/tosa/... \
++ //backends/example/... + \
 //backends/mediatek/... + //backends/transforms/... + \
 //backends/xnnpack/... + //configurations/... + //extension/flat_tensor: + \
 //extension/llm/runner: + //kernels/aten/... + //kernels/optimized/... + \
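The workaround reflects buck2 target-pattern semantics: //pkg: matches only targets declared directly in that package, while //pkg/... matches the package and every subpackage recursively. An illustration:

buck2 query "//backends/arm:"       # targets declared directly in backends/arm
buck2 query "//backends/arm/..."    # backends/arm plus all subpackages

Until the arm tests use the runtime wrapper (per the new TODO), the script enumerates the queryable arm subdirectories explicitly instead of using the recursive form.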

.ci/scripts/wheel/pre_build_script.sh

Lines changed: 18 additions & 1 deletion

@@ -9,9 +9,26 @@ set -euxo pipefail

 # This script is run before building ExecuTorch binaries

+# Clone nested submodules for tokenizers - this is a workaround for recursive
+# submodule clone failing due to path length limitations on Windows. Eventually,
+# we should update the core job in test-infra to enable long paths before
+# checkout to avoid needing to do this.
+pushd extension/llm/tokenizers
+git submodule update --init
+popd
+
+# On Windows, enable symlinks and re-checkout the current revision to create
+# the symlinked src/ directory. This is needed to build the wheel.
+UNAME_S=$(uname -s)
+if [[ $UNAME_S == *"MINGW"* || $UNAME_S == *"MSYS"* ]]; then
+  echo "Enabling symlinks on Windows"
+  git config core.symlinks true
+  git checkout -f HEAD
+fi
+
 # Manually install build requirements because `python setup.py bdist_wheel` does
 # not install them. TODO(dbort): Switch to using `python -m build --wheel`,
 # which does install them. Though we'd need to disable build isolation to be
 # able to see the installed torch package.

-"${GITHUB_WORKSPACE}/${REPOSITORY}/install_requirements.sh" --example
+"${GITHUB_WORKSPACE}/${REPOSITORY}/install_requirements.sh" --example

.ci/scripts/wheel/test_windows.py

Lines changed: 75 additions & 0 deletions

@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import List
+
+import torch
+from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+from executorch.examples.models import Backend, Model, MODEL_NAME_TO_MODEL
+from executorch.examples.models.model_factory import EagerModelFactory
+from executorch.examples.xnnpack import MODEL_NAME_TO_OPTIONS
+from executorch.examples.xnnpack.quantization.utils import quantize as quantize_xnn
+from executorch.exir import EdgeCompileConfig, to_edge_transform_and_lower
+from executorch.extension.pybindings.portable_lib import (
+    _load_for_executorch_from_buffer,
+)
+from test_base import ModelTest
+
+
+def test_model_xnnpack(model: Model, quantize: bool) -> None:
+    model_instance, example_inputs, _, _ = EagerModelFactory.create_model(
+        *MODEL_NAME_TO_MODEL[str(model)]
+    )
+
+    model_instance.eval()
+    ref_outputs = model_instance(*example_inputs)
+
+    if quantize:
+        quant_type = MODEL_NAME_TO_OPTIONS[str(model)].quantization
+        model_instance = torch.export.export_for_training(
+            model_instance, example_inputs
+        )
+        model_instance = quantize_xnn(
+            model_instance.module(), example_inputs, quant_type
+        )
+
+    lowered = to_edge_transform_and_lower(
+        torch.export.export(model_instance, example_inputs),
+        partitioner=[XnnpackPartitioner()],
+        compile_config=EdgeCompileConfig(
+            _check_ir_validity=False,
+        ),
+    ).to_executorch()
+
+    loaded_model = _load_for_executorch_from_buffer(lowered.buffer)
+    et_outputs = loaded_model([*example_inputs])
+
+    if isinstance(ref_outputs, torch.Tensor):
+        ref_outputs = (ref_outputs,)
+
+    assert len(ref_outputs) == len(et_outputs)
+    for i in range(len(ref_outputs)):
+        torch.testing.assert_close(ref_outputs[i], et_outputs[i], atol=1e-4, rtol=1e-5)
+
+
+def run_tests(model_tests: List[ModelTest]) -> None:
+    for model_test in model_tests:
+        if model_test.backend == Backend.Xnnpack:
+            test_model_xnnpack(model_test.model, quantize=False)
+        else:
+            raise RuntimeError(f"Unsupported backend {model_test.backend}.")
+
+
+if __name__ == "__main__":
+    run_tests(
+        model_tests=[
+            ModelTest(
+                model=Model.Mv3,
+                backend=Backend.Xnnpack,
+            ),
+        ]
+    )
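In short, the new smoke test runs each model eagerly to get reference outputs, exports and lowers it with the XNNPACK partitioner, executes the resulting program from an in-memory buffer via the portable pybindings, and compares outputs at atol=1e-4 / rtol=1e-5. Running it locally is a one-liner (assuming a dev install of ExecuTorch with the pybindings built):

python .ci/scripts/wheel/test_windows.py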

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@
+REM This is lightly modified from the torchvision Windows build logic.
+REM See https://github.com/pytorch/vision/blob/main/packaging/windows/internal/vc_env_helper.bat
+
+@echo on
+
+set VC_VERSION_LOWER=17
+set VC_VERSION_UPPER=18
+
+for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do (
+    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
+        set "VS15INSTALLDIR=%%i"
+        set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat"
+        goto vswhere
+    )
+)
+
+:vswhere
+if "%VSDEVCMD_ARGS%" == "" (
+    call "%VS15VCVARSALL%" x64 || exit /b 1
+) else (
+    call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% || exit /b 1
+)
+
+@echo on
+
+if "%CU_VERSION%" == "xpu" call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
+
+set DISTUTILS_USE_SDK=1
+
+set args=%1
+shift
+:start
+if [%1] == [] goto done
+set args=%args% %1
+shift
+goto start
+
+:done
+if "%args%" == "" (
+    echo Usage: vc_env_helper.bat [command] [args]
+    echo e.g. vc_env_helper.bat cl /c test.cpp
+)
+
+set work_dir=%CD%
+if exist setup.py (
+    echo "Creating symlink..."
+    REM Setup a symlink to shorten the path length.
+    REM Note that the ET directory has to be named "executorch".
+    cd %GITHUB_WORKSPACE%
+    if not exist et\ (
+        mkdir et
+    )
+    cd et
+    echo Work dir: %work_dir%
+    if not exist executorch\ (
+        mklink /d executorch %work_dir%
+    )
+    cd executorch
+)
+
+%args% || exit /b 1
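This helper locates Visual Studio 2022 (version 17.x) via vswhere, loads the x64 vcvarsall environment, accumulates all of its arguments into %args%, shortens the working path through an et\executorch symlink when run from the source tree, and finally executes %args% inside that environment. Per its own usage text, a call looks like: vc_env_helper.bat cl /c test.cpp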
