Merged
26 commits
d45304b
NXP backend: Update user guide and docs Readme (#14852)
roman-janik-nxp Oct 8, 2025
7505b16
Update top-level README.md file (#15049)
mergennachin Oct 13, 2025
15ccdf5
Fix documentation link for Core ATen operators (#15050)
mergennachin Oct 13, 2025
3c8f647
Fix various minor links in top-level README.md (#15052)
mergennachin Oct 13, 2025
68e587a
NXP Backend: Update Readme files (#14896)
robert-kalmar Oct 14, 2025
15be82f
[Samsung][docs] Update to the new template (#15087)
SS-JIA Oct 16, 2025
6108d6b
Add Pico2 Tutorials on Raspberry Pi (#15188)
psiddh Oct 17, 2025
c00cd39
Update mps docs and fix coreml/mps doc references (#15179)
metascroy Oct 17, 2025
a566c09
[ET-VK][docs] Update to the new template (#14996)
SS-JIA Oct 18, 2025
56ee96b
Success Stories page initial stage (#15236)
mergennachin Oct 18, 2025
923761c
Android Documentation Improvements and other fixes (#15260)
psiddh Oct 20, 2025
3d6b5d1
Updated Android doc with proper 1.0.0 backend links to executorch (#1…
psiddh Oct 20, 2025
4a73a87
Android Docs: Fix stale backend link (android-samsung-exynos) (#15287)
psiddh Oct 21, 2025
11ff6e9
Export LLMs with Optimum docs (#15062)
jackzhxng Oct 21, 2025
8d6d4d2
[ET-VK] Add redirect for backends-vulkan (#15305)
SS-JIA Oct 21, 2025
7aa15fe
Revise ExecuTorch documentation for Apple runtime (#15293)
shoumikhin Oct 21, 2025
146c8cb
Update docs on LMM runner Apple API (#15307)
shoumikhin Oct 21, 2025
8711ebd
Add Metal backend documentation to Voxtral README (#15273)
manuelcandales Oct 21, 2025
2e63fba
Update success-stories.md (#15309)
metascroy Oct 21, 2025
5f6167f
Add gemma to supported models (#15328)
lucylq Oct 21, 2025
cb63da3
Update build from source and getting started docs (#15311)
GregoryComer Oct 21, 2025
261eb2d
Fix more typos and broken links (#15331)
abhinaykukkadapu Oct 21, 2025
14e19e8
Update XNNPACK doc structure and add template (#14873)
GregoryComer Oct 20, 2025
571f925
Remove extra demo line from sucess-stories page (#15337)
psiddh Oct 22, 2025
5301a32
update backend cadence md for branch cut (#15277)
zonglinpeng Oct 21, 2025
89b5071
Minor doc fixes (#15336)
GregoryComer Oct 22, 2025
1 change: 0 additions & 1 deletion .gitignore
@@ -62,7 +62,6 @@ xcuserdata/
/include/
/share/
/version.py
*.csv
*_etdump

# Android
8 changes: 4 additions & 4 deletions CONTRIBUTING.md
@@ -24,17 +24,17 @@ For Apple, please refer to the [iOS documentation](docs/source/using-executorch-
executorch
├── <a href="backends">backends</a> - Backend delegate implementations for various hardware targets. Each backend uses partitioner to split the graph into subgraphs that can be executed on specific hardware, quantizer to optimize model precision, and runtime components to execute the graph on target hardware. For details refer to the <a href="docs/source/backend-delegates-integration.md">backend documentation</a> and the <a href="docs/source/using-executorch-export.md">Export and Lowering tutorial</a> for more information.
│ ├── <a href="backends/apple">apple</a> - Apple-specific backends.
│ │ ├── <a href="backends/apple/coreml">coreml</a> - CoreML backend for Apple devices. See <a href="docs/source/backends-coreml.md">doc</a>.
│ │ └── <a href="backends/apple/mps">mps</a> - Metal Performance Shaders backend for Apple devices. See <a href="docs/source/backends-mps.md">doc</a>.
│ │ ├── <a href="backends/apple/coreml">coreml</a> - CoreML backend for Apple devices. See <a href="docs/source/backends/coreml/coreml-overview.md">doc</a>.
│ │ └── <a href="backends/apple/mps">mps</a> - Metal Performance Shaders backend for Apple devices. See <a href="docs/source/backends/mps/mps-overview.md">doc</a>.
│ ├── <a href="backends/arm">arm</a> - ARM architecture backends. See <a href="docs/source/backends-arm-ethos-u.md">doc</a>.
│ ├── <a href="backends/cadence">cadence</a> - Cadence-specific backends. See <a href="docs/source/backends-cadence.md">doc</a>.
│ ├── <a href="backends/example">example</a> - Example backend implementations.
│ ├── <a href="backends/mediatek">mediatek</a> - MediaTek-specific backends. See <a href="docs/source/backends-mediatek.md">doc</a>.
│ ├── <a href="backends/openvino">openvino</a> - OpenVINO backend for Intel hardware.
│ ├── <a href="backends/qualcomm">qualcomm</a> - Qualcomm-specific backends. See <a href="docs/source/backends-qualcomm.md">doc</a>.
│ ├── <a href="backends/transforms">transforms</a> - Transformations for backend optimization.
│ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends-vulkan.md">doc</a>.
│ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends-xnnpack.md">doc</a>.
│ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends/vulkan/vulkan-overview.md">doc</a>.
│ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends/xnnpack/xnnpack-overview.md">doc</a>.
├── <a href="codegen">codegen</a> - Tooling to autogenerate bindings between kernels and the runtime.
├── <a href="configurations">configurations</a> - Configuration files.
├── <a href="devtools">devtools</a> - Model profiling, debugging, and inspection. Please refer to the <a href="docs/source/devtools-overview.md">tools documentation</a> for more information.
4 changes: 2 additions & 2 deletions README-wheel.md
@@ -11,8 +11,8 @@ The `executorch` pip package is in beta.
The prebuilt `executorch.runtime` module included in this package provides a way
to run ExecuTorch `.pte` files, with some restrictions:
* Only [core ATen operators](docs/source/ir-ops-set-definition.md) are linked into the prebuilt module
* Only the [XNNPACK backend delegate](docs/source/backends-xnnpack.md) is linked into the prebuilt module.
* \[macOS only] [Core ML](docs/source/backends-coreml.md) and [MPS](docs/source/backends-mps.md) backend
* Only the [XNNPACK backend delegate](docs/source/backends/xnnpack/xnnpack-overview.md) is linked into the prebuilt module.
* \[macOS only] [Core ML](docs/source/backends/coreml/coreml-overview.md) and [MPS](docs/source/backends/mps/mps-overview.md) backend
are also linked into the prebuilt module.
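For illustration, a minimal sketch of running a `.pte` file with the prebuilt module (the model path and input shape are placeholders, and the model must only use the operators and backends listed above):

```python
import torch

from executorch.runtime import Runtime

# Load the prebuilt runtime shipped with the pip package.
runtime = Runtime.get()
program = runtime.load_program("model.pte")  # placeholder path
method = program.load_method("forward")

# Execute with example inputs matching the exported model's signature.
outputs = method.execute([torch.randn(1, 3, 224, 224)])
print(outputs)
```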

Please visit the [ExecuTorch website](https://pytorch.org/executorch) for
12 changes: 4 additions & 8 deletions README.md
@@ -104,16 +104,14 @@ outputs = method.execute([torch.randn(1, 3, 224, 224)])

Module module("model.pte");
auto tensor = make_tensor_ptr({2, 2}, {1.0f, 2.0f, 3.0f, 4.0f});
auto outputs = module.forward(tensor);
auto outputs = module.forward({tensor});
```

**[Swift (iOS)](https://docs.pytorch.org/executorch/main/ios-section.html)**
```swift
import ExecuTorch

let module = Module(filePath: "model.pte")
let input = Tensor<Float>([1.0, 2.0, 3.0, 4.0], shape: [2, 2])
let outputs = try module.forward(input)
let input = Tensor<Float>([1.0, 2.0, 3.0, 4.0])
let outputs: [Value] = try module.forward([input])
```

**[Kotlin (Android)](https://docs.pytorch.org/executorch/main/android-section.html)**
@@ -153,8 +151,6 @@ runner->generate("Hello, how are you?", config);

**[Swift (iOS)](https://docs.pytorch.org/executorch/main/llm/run-on-ios.html)**
```swift
import ExecuTorchLLM

let runner = TextRunner(modelPath: "llama.pte", tokenizerPath: "tiktoken.bin")
try runner.generate("Hello, how are you?", Config {
$0.sequenceLength = 128
@@ -202,7 +198,7 @@ ExecuTorch powers on-device AI at scale across Meta's family of apps, VR/AR devi

**LLMs:** [Llama 3.2/3.1/3](examples/models/llama/README.md), [Qwen 3](examples/models/qwen3/README.md), [Phi-4-mini](examples/models/phi_4_mini/README.md), [LiquidAI LFM2](examples/models/lfm2/README.md)

**Multimodal:** [Llava](examples/models/llava/README.md) (vision-language), [Voxtral](examples/models/voxtral/README.md) (audio-language)
**Multimodal:** [Llava](examples/models/llava/README.md) (vision-language), [Voxtral](examples/models/voxtral/README.md) (audio-language), [Gemma](examples/models/gemma3) (vision-language)

**Vision/Speech:** [MobileNetV2](https://github.com/meta-pytorch/executorch-examples/tree/main/mv2), [DeepLabV3](https://github.com/meta-pytorch/executorch-examples/tree/main/dl3), [Whisper](https://github.com/meta-pytorch/executorch-examples/tree/main/whisper/android/WhisperApp)

2 changes: 1 addition & 1 deletion backends/apple/coreml/README.md
@@ -1,7 +1,7 @@
# ExecuTorch Core ML Delegate

This subtree contains the Core ML Delegate implementation for ExecuTorch.
Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. To learn how to use the CoreML delegate, see the [documentation](https://github.com/pytorch/executorch/blob/main/docs/source/backends-coreml.md).
Core ML is an optimized framework for running machine learning models on Apple devices. The delegate is the mechanism for leveraging the Core ML framework to accelerate operators when running on Apple devices. To learn how to use the CoreML delegate, see the [documentation](https://github.com/pytorch/executorch/blob/main/docs/source/backends/coreml/coreml-overview.md).

## Layout
- `compiler/` : Lowers a module to Core ML backend.
18 changes: 9 additions & 9 deletions backends/nxp/README.md
@@ -5,14 +5,14 @@ This subtree contains the ExecuTorch Backend implementation for the

The eIQ® Neutron NPU is a highly scalable accelerator core architecture providing machine learning (ML) acceleration,
able to support common and critical tasks for edge AI such as anomaly detection, speech recognition,
image classification, object detection, facial recognition, image segmentation, and generative AI use cases like
image classification, object detection, facial recognition, image segmentation, and generative AI use cases like
large and small language models (LLMs & SLMs) and text-to-speech (TTS).
The architecture provides power and performance optimized NPUs integrated with NXP's broad portfolio of
The architecture provides power and performance optimized NPUs integrated with NXP's broad portfolio of
microcontrollers and applications processors.

The eIQ Neutron NPUs offer support for a wide variety of neural network types such as CNN, RNN, TCN and Transformer
The eIQ Neutron NPUs offer support for a wide variety of neural network types such as CNN, RNN, TCN and Transformer
networks, as well as the ability to adapt and scale to new model architectures, topologies and layer types introduced
to AI workloads. ML application development with the eIQ Neutron NPU is fully supported by the
to AI workloads. ML application development with the eIQ Neutron NPU is fully supported by the
[eIQ machine learning software development environment](https://www.nxp.com/design/design-center/software/eiq-ml-development-environment/eiq-toolkit-for-end-to-end-model-development-and-deployment:EIQ-TOOLKIT).
The eIQ AI SW Stack provides a streamlined development experience for developers and end-users of NXP products.

@@ -22,7 +22,7 @@ At this moment following eIQ® Neutron NPU variants and NXP platforms are suppor

* **eIQ Neutron N3-64**, available on [i.MX RT700](https://www.nxp.com/products/i.MX-RT700)

In the future the NXP eIQ Neutron Backend will be extended to support [i.MX 9 Application Processors](https://www.nxp.com/products/processors-and-microcontrollers/arm-processors/i-mx-applications-processors/i-mx-9-processors:IMX9-PROCESSORS)
In the future the NXP eIQ Neutron Backend will be extended to support [i.MX 9 Application Processors](https://www.nxp.com/products/processors-and-microcontrollers/arm-processors/i-mx-applications-processors/i-mx-9-processors:IMX9-PROCESSORS)
with eIQ Neutron NPU, like the [i.MX 95](https://www.nxp.com/products/iMX95).


@@ -33,7 +33,7 @@ The eIQ Neutron NPU Backend should be considered as prototype quality at this mo
improvements. NXP and the ExecuTorch community are actively developing this codebase.

## Neutron Backend implementation and SW architecture
Neutron Backend uses the eIQ Neutron Converter as ML compiler to compile the delegated subgraph to Neutron microcode.
Neutron Backend uses the eIQ Neutron Converter as ML compiler to compile the delegated subgraph to Neutron microcode.
The Neutron Converter accepts the ML model in LiteRT format; for the **eIQ Neutron N3** class, the Neutron Backend
therefore uses the LiteRT flatbuffers format as the IR between ExecuTorch and the Neutron Converter ML compiler.

@@ -44,10 +44,10 @@ uses the LiteRT flatbuffers format as IR between the ExecuTorch and Neutron Conv
`node_conveters` is structured as a single module for each Edge operator.
* `backend/ir/lib` - automatically generated handlers from LiteRT flatbuffers schema.
* `backend/ir/tflite_generator` and `backend/ir/tflite_optimizer` handle the serialization
of the in-memory built subgraph for delegation into LiteRT/TFLite flatbuffers
of the in-memory built subgraph for delegation into LiteRT/TFLite flatbuffers
representation. Code taken from the onnx2tflite tool.
* `edge_passes` - Various passes operating on Edge dialect level.
* `quantizer` - Neutron Backend quantizer implementation.
* `edge_passes` - Various passes operating on Edge dialect level.
* `quantizer` - Neutron Backend quantizer implementation.
* `runtime` - Neutron Backend runtime implementation, for running the compiled model on device.
* `tests/` - Unit tests for Neutron backend.
* `tests/converter/node_converter` - Operator level unit tests.
207 changes: 3 additions & 204 deletions backends/vulkan/README.md
@@ -1,205 +1,4 @@
# Vulkan Backend
# The ExecuTorch Vulkan Backend

The ExecuTorch Vulkan delegate is a native GPU delegate for ExecuTorch that is
built on top of the cross-platform Vulkan GPU API standard. It is primarily
designed to leverage the GPU to accelerate model inference on Android devices,
but can be used on any platform that supports an implementation of Vulkan:
laptops, servers, and edge devices.

::::{note}
The Vulkan delegate is currently under active development, and its components
are subject to change.
::::

## What is Vulkan?

Vulkan is a low-level GPU API specification developed as a successor to OpenGL.
It is designed to offer developers more explicit control over GPUs compared to
previous specifications in order to reduce overhead and maximize the
capabilities of the modern graphics hardware.

Vulkan has been widely adopted among GPU vendors, and most modern GPUs (both
desktop and mobile) in the market support Vulkan. Vulkan is also included in
Android from Android 7.0 onwards.

**Note that Vulkan is a GPU API, not a GPU Math Library**. That is to say it
provides a way to execute compute and graphics operations on a GPU, but does not
come with a built-in library of performant compute kernels.

## The Vulkan Compute Library

The ExecuTorch Vulkan Delegate is a wrapper around a standalone runtime known as
the **Vulkan Compute Library**. The aim of the Vulkan Compute Library is to
provide GPU implementations for PyTorch operators via GLSL compute shaders.

The Vulkan Compute Library is a fork/iteration of the [PyTorch Vulkan Backend](https://pytorch.org/tutorials/prototype/vulkan_workflow.html).
The core components of the PyTorch Vulkan backend were forked into ExecuTorch
and adapted for an AOT graph-mode style of model inference (as opposed to
PyTorch which adopted an eager execution style of model inference).

The components of the Vulkan Compute Library are contained in the
`executorch/backends/vulkan/runtime/` directory. The core components are listed
and described below:

```
runtime/
├── api/ .................... Wrapper API around Vulkan to manage Vulkan objects
└── graph/ .................. ComputeGraph class which implements graph mode inference
└── ops/ ................ Base directory for operator implementations
├── glsl/ ........... GLSL compute shaders
│ ├── *.glsl
│ └── conv2d.glsl
└── impl/ ........... C++ code to dispatch GPU compute shaders
├── *.cpp
└── Conv2d.cpp
```

## Features

The Vulkan delegate currently supports the following features:

* **Memory Planning**
* Intermediate tensors whose lifetimes do not overlap will share memory allocations. This reduces the peak memory usage of model inference.
* **Capability Based Partitioning**:
* A graph can be partially lowered to the Vulkan delegate via a partitioner, which will identify nodes (i.e. operators) that are supported by the Vulkan delegate and lower only supported subgraphs
* **Support for upper-bound dynamic shapes**:
* Tensors can change shape between inferences as long as their current shape is smaller than the bounds specified during lowering (see the sketch just below)
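To illustrate the upper-bound dynamic shape support mentioned above, here is a minimal export-side sketch using `torch.export.Dim`; the model and the bound are placeholders, and lowering then proceeds as in the end-to-end example later on this page.

```python
import torch
from torch.export import Dim, export


class Scale(torch.nn.Module):
    def forward(self, x):
        return x * 2.0


# The batch dimension may vary between inferences, up to the bound given here.
batch = Dim("batch", max=8)
aten_dialect = export(
    Scale(),
    (torch.ones(2, 4),),
    dynamic_shapes={"x": {0: batch}},
)
# The bound is baked in during lowering; at runtime, inputs with a batch size
# larger than 8 would be rejected.
```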

In addition to increasing operator coverage, the following features are
currently in development:

* **Quantization Support**
* We are currently working on support for 8-bit dynamic quantization, with plans to extend to other quantization schemes in the future.
* **Memory Layout Management**
* Memory layout is an important factor in optimizing performance. We plan to introduce graph passes that insert memory layout transitions throughout a graph to optimize memory-layout-sensitive operators such as Convolution and Matrix Multiplication.
* **Selective Build**
* We plan to make it possible to control build size by selecting which operators/shaders you want to build with

## End to End Example

To further understand the features of the Vulkan Delegate and how to use it,
consider the following end-to-end example with a simple single-operator model.

### Compile and lower a model to the Vulkan Delegate

Once ExecuTorch has been set up and installed, the following script can be used
to generate a simple model and lower it to the Vulkan delegate.

```python
# Note: this script is the same as the script from the "Setting up ExecuTorch"
# page, with one minor addition to lower to the Vulkan backend.
import torch
from torch.export import export
from executorch.exir import to_edge

from executorch.backends.vulkan.partitioner.vulkan_partitioner import VulkanPartitioner

# Start with a PyTorch model that adds two input tensors (matrices)
class Add(torch.nn.Module):
    def __init__(self):
        super(Add, self).__init__()

    def forward(self, x: torch.Tensor, y: torch.Tensor):
        return x + y

# 1. torch.export: Defines the program with the ATen operator set.
aten_dialect = export(Add(), (torch.ones(1), torch.ones(1)))

# 2. to_edge: Make optimizations for Edge devices
edge_program = to_edge(aten_dialect)
# 2.1 Lower to the Vulkan backend
edge_program = edge_program.to_backend(VulkanPartitioner())

# 3. to_executorch: Convert the graph to an ExecuTorch program
executorch_program = edge_program.to_executorch()

# 4. Save the compiled .pte program
with open("vk_add.pte", "wb") as file:
    file.write(executorch_program.buffer)
```

Like other ExecuTorch delegates, a model can be lowered to the Vulkan Delegate
using the `to_backend()` API. The Vulkan Delegate implements the
`VulkanPartitioner` class which identifies nodes (i.e. operators) in the graph
that are supported by the Vulkan delegate, and separates compatible sections of
the model to be executed on the GPU.

This means that a model can be lowered to the Vulkan delegate even if it contains
some unsupported operators. This will just mean that only parts of the graph
will be executed on the GPU.
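One way to check how much of the graph was actually delegated is to inspect the Edge program after calling `to_backend()`. This is a minimal sketch that reuses `edge_program` from the script above; matching on the node target string is a heuristic for illustration, not an official API.

```python
# Delegated subgraphs appear as executorch_call_delegate calls in the graph.
exported = edge_program.exported_program()
delegated = [
    n for n in exported.graph.nodes
    if n.op == "call_function" and "executorch_call_delegate" in str(n.target)
]
print(f"{len(delegated)} delegated subgraph(s)")
print(exported.graph)  # verbose, but shows what stayed on portable CPU kernels
```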


::::{note}
The [supported ops list](https://github.com/pytorch/executorch/blob/main/backends/vulkan/op_registry.py#L194)
in the Vulkan partitioner code can be inspected to see which ops are currently
implemented in the Vulkan delegate.
::::

### Build Vulkan Delegate libraries

The easiest way to build and test the Vulkan Delegate is to build for Android
and test on a local Android device. Android devices have built in support for
Vulkan, and the Android NDK ships with a GLSL compiler which is needed to
compile the Vulkan Compute Library's GLSL compute shaders.

The Vulkan Delegate libraries can be built by setting `-DEXECUTORCH_BUILD_VULKAN=ON`
when building with CMake.

First, make sure that you have the Android NDK installed; any NDK version past
NDK r19c should work. Note that the examples in this doc have been validated with
NDK r27b. The Android SDK should also be installed so that you have access to `adb`.

The instructions on this page assume that the following environment variables
are set.

```shell
export ANDROID_NDK=<path_to_ndk>
# Select the appropriate Android ABI for your device
export ANDROID_ABI=arm64-v8a
# All subsequent commands should be performed from ExecuTorch repo root
cd <path_to_executorch_root>
# Make sure adb works
adb --version
```

To build and install ExecuTorch libraries (for Android) with the Vulkan
Delegate:

```shell
# From executorch root directory
(rm -rf cmake-android-out && \
cmake . -DCMAKE_INSTALL_PREFIX=cmake-android-out \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=$ANDROID_ABI \
-DEXECUTORCH_BUILD_VULKAN=ON \
-DPYTHON_EXECUTABLE=python \
-Bcmake-android-out && \
cmake --build cmake-android-out -j16 --target install)
```

### Run the Vulkan model on device

::::{note}
Since operator support is currently limited, only binary arithmetic operators
will run on the GPU. Expect inference to be slow as the majority of operators
are being executed via Portable operators.
::::

Now, the partially delegated model can be executed on your device's GPU!

```shell
# Build a model runner binary linked with the Vulkan delegate libs
cmake --build cmake-android-out --target executor_runner -j32

# Push model to device
adb push vk_add.pte /data/local/tmp/vk_add.pte
# Push binary to device
adb push cmake-android-out/executor_runner /data/local/tmp/runner_bin

# Run the model
adb shell /data/local/tmp/runner_bin --model_path /data/local/tmp/vk_add.pte
```
Please see the [Vulkan Backend Overview](../../docs/source/backends/vulkan/vulkan-overview.md)
to learn more about the ExecuTorch Vulkan Backend.