
Commit d35cbda

Merge branch 'main' into remove-xfails
2 parents 9afe3ca + fedaa2d commit d35cbda

File tree

61 files changed, +3192 -1402 lines changed


.gitignore

Lines changed: 0 additions & 1 deletion
@@ -62,7 +62,6 @@ xcuserdata/
 /include/
 /share/
 /version.py
-*.csv
 *_etdump

 # Android

CONTRIBUTING.md

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ executorch
 │ ├── <a href="backends/qualcomm">qualcomm</a> - Qualcomm-specific backends. See <a href="docs/source/backends-qualcomm.md">doc</a>.
 │ ├── <a href="backends/transforms">transforms</a> - Transformations for backend optimization.
 │ ├── <a href="backends/vulkan">vulkan</a> - Vulkan backend for cross-platform GPU support. See <a href="docs/source/backends-vulkan.md">doc</a>.
-│ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends-xnnpack.md">doc</a>.
+│ └── <a href="backends/xnnpack">xnnpack</a> - XNNPACK backend for optimized neural network operations. See <a href="docs/source/backends/xnnpack/xnnpack-overview.md">doc</a>.
 ├── <a href="codegen">codegen</a> - Tooling to autogenerate bindings between kernels and the runtime.
 ├── <a href="configurations">configurations</a> - Configuration files.
 ├── <a href="devtools">devtools</a> - Model profiling, debugging, and inspection. Please refer to the <a href="docs/source/devtools-overview.md">tools documentation</a> for more information.

README-wheel.md

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ The `executorch` pip package is in beta.
 The prebuilt `executorch.runtime` module included in this package provides a way
 to run ExecuTorch `.pte` files, with some restrictions:
 * Only [core ATen operators](docs/source/ir-ops-set-definition.md) are linked into the prebuilt module
-* Only the [XNNPACK backend delegate](docs/source/backends-xnnpack.md) is linked into the prebuilt module.
+* Only the [XNNPACK backend delegate](docs/source/backends/xnnpack/xnnpack-overview.md) is linked into the prebuilt module.
 * \[macOS only] [Core ML](docs/source/backends/coreml/coreml-overview.md) and [MPS](docs/source/backends/mps/mps-overview.md) backend
   are also linked into the prebuilt module.
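As a usage sketch of the prebuilt module described above (assuming the documented `executorch.runtime` Python API; "model.pte" and the input shape are hypothetical placeholders for a model exported ahead of time):

import torch
from executorch.runtime import Runtime

# Load the prebuilt runtime and a serialized ExecuTorch program.
runtime = Runtime.get()
program = runtime.load_program("model.pte")  # hypothetical exported file
method = program.load_method("forward")

# Execute with an example input; only core ATen ops and the linked
# delegates (XNNPACK, plus Core ML/MPS on macOS) are available.
outputs = method.execute([torch.randn(1, 3, 224, 224)])
print(outputs[0])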

backends/arm/test/models/test_nn_modules.py

Lines changed: 77 additions & 18 deletions
@@ -17,32 +17,91 @@
 - Transformer
 """

+from typing import Callable
+
 import torch
 from executorch.backends.arm.test.common import parametrize
 from executorch.backends.arm.test.tester.test_pipeline import (
     TosaPipelineFP,
     TosaPipelineINT,
 )

+
+def make_module_wrapper(
+    name: str, module_factory: Callable[[], torch.nn.Module]
+) -> torch.nn.Module:
+    class ModuleWrapper(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self._module = module_factory()
+
+        def forward(self, *args, **kwargs):
+            return self._module(*args, **kwargs)
+
+    ModuleWrapper.__name__ = name
+    ModuleWrapper.__qualname__ = name
+    return ModuleWrapper()
+
+
 example_input = torch.rand(1, 6, 16, 16)

 module_tests = [
-    (torch.nn.Embedding(10, 10), (torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]),)),
-    (torch.nn.LeakyReLU(), (example_input,)),
-    (torch.nn.BatchNorm1d(16), (torch.rand(6, 16, 16),)),
-    (torch.nn.AdaptiveAvgPool2d((12, 12)), (example_input,)),
-    (torch.nn.ConvTranspose2d(6, 3, 2), (example_input,)),
-    (torch.nn.GRU(10, 20, 2), (torch.randn(5, 3, 10), torch.randn(2, 3, 20))),
-    (torch.nn.GroupNorm(2, 6), (example_input,)),
-    (torch.nn.InstanceNorm2d(16), (example_input,)),
-    (torch.nn.PReLU(), (example_input,)),
     (
-        torch.nn.Transformer(
-            d_model=64,
-            nhead=1,
-            num_encoder_layers=1,
-            num_decoder_layers=1,
-            dtype=torch.float32,
+        make_module_wrapper(
+            "EmbeddingModule",
+            lambda: torch.nn.Embedding(10, 10),
+        ),
+        (torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]]),),
+    ),
+    (
+        make_module_wrapper("LeakyReLUModule", torch.nn.LeakyReLU),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper("BatchNorm1dModule", lambda: torch.nn.BatchNorm1d(16)),
+        (torch.rand(6, 16, 16),),
+    ),
+    (
+        make_module_wrapper(
+            "AdaptiveAvgPool2dModule",
+            lambda: torch.nn.AdaptiveAvgPool2d((12, 12)),
+        ),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper(
+            "ConvTranspose2dModule", lambda: torch.nn.ConvTranspose2d(6, 3, 2)
+        ),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper("GRUModule", lambda: torch.nn.GRU(10, 20, 2)),
+        (torch.randn(5, 3, 10), torch.randn(2, 3, 20)),
+    ),
+    (
+        make_module_wrapper("GroupNormModule", lambda: torch.nn.GroupNorm(2, 6)),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper(
+            "InstanceNorm2dModule", lambda: torch.nn.InstanceNorm2d(16)
+        ),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper("PReLUModule", torch.nn.PReLU),
+        (example_input,),
+    ),
+    (
+        make_module_wrapper(
+            "TransformerModule",
+            lambda: torch.nn.Transformer(
+                d_model=64,
+                nhead=1,
+                num_encoder_layers=1,
+                num_decoder_layers=1,
+                dtype=torch.float32,
+            ),
         ),
         (torch.rand((10, 32, 64)), torch.rand((20, 32, 64))),
     ),
@@ -78,9 +137,9 @@ def test_nn_Modules_FP(test_data):
     "test_data",
     test_parameters,
     xfails={
-        "GRU": "RuntimeError: Node aten_linear_default with op <EdgeOpOverload: aten.linear[...]> was not decomposed or delegated.",
-        "PReLU": "RuntimeError: mul(): functions with out=... arguments don't support automatic differentiation, but one of the arguments requires grad.",
-        "Transformer": "AssertionError: Output 0 does not match reference output.",
+        "GRUModule": "RuntimeError: Node aten_linear_default with op <EdgeOpOverload: aten.linear[...]> was not decomposed or delegated.",
+        "PReLUModule": "RuntimeError: mul(): functions with out=... arguments don't support automatic differentiation, but one of the arguments requires grad.",
+        "TransformerModule": "AssertionError: Output 0 does not match reference output.",
     },
 )
 def test_nn_Modules_INT(test_data):
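Why the wrapper: the xfails keys above match on each module's type name (they change from "GRU" to "GRUModule" in lockstep with the wrapping), so giving every case a uniquely named torch.nn.Module keeps those keys stable and distinct. A minimal usage sketch, assuming only torch and the make_module_wrapper defined in this diff:

import torch
from executorch.backends.arm.test.models.test_nn_modules import make_module_wrapper

m = make_module_wrapper("GRUModule", lambda: torch.nn.GRU(10, 20, 2))
assert type(m).__name__ == "GRUModule"  # the name the xfails dictionary keys on

# The wrapper forwards positional and keyword args to the wrapped module.
output, hidden = m(torch.randn(5, 3, 10), torch.randn(2, 3, 20))
print(output.shape)  # torch.Size([5, 3, 20])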

backends/cadence/aot/ops_registrations.py

Lines changed: 6 additions & 10 deletions
@@ -53,17 +53,10 @@ def _validate_ref_impl_exists() -> None:
 # 1. be removed
 # 2. have a reference implementation added to ref_implementations.py
 _WARN_ONLY = {
-    "cadence::quantized_w8a32_linear",
-    "cadence::quantized_add",  # We should only support per_tensor variant, should remove
     "cadence::_softmax_f32_f32",
-    "cadence::requantize",  # We should only support per_tensor variant, should remove
     "cadence::quantized_softmax.per_tensor",
-    "cadence::quantized_conv2d_nchw",  # We should only support per_tensor variant, should remove
-    "cadence::quantized_relu",  # We should only support per_tensor variant, should remove
-    "cadence::quantized_conv2d_nhwc",  # We should only support per_tensor variant, should remove
     "cadence::quantized_softmax",
     "cadence::quantized_w8a32_gru",
-    "cadence::quantized_layer_norm",  # We should only support per_tensor variant, should remove
 }

 ref_impls = get_registered_ref_implementations()
@@ -2706,6 +2699,9 @@ def quantized_w8a32_linear_meta(
     # output comes in empty with shape [leading_dims, out_dim]
     src_shape = list(src.shape)
     weight_shape = weight.shape
+    assert (src_shape[-1] % 4) == 0
+    if len(src_shape) >= 2:
+        assert src_shape[-2] == 1
     assert len(weight_shape) == 2
     assert src_shape[-1] == weight_shape[-1]
     src_shape[-1] = weight_shape[0]
@@ -2720,12 +2716,12 @@ def quantized_w8a32_conv_meta(
     bias: torch.Tensor,
     b_scale: float,
 ) -> torch.Tensor:
-    # src comes in shape [batch, in_channel, in_length]
-    # weight comes in shape [out_ch, in_ch, kernel_dim]
+    # src comes in shape [batch, in_length, in_channels]
+    # weight comes in shape [kernel_dim, out_ch, in_ch]
     # output comes in empty with shape [batch, out_ch, in_length - kernel_dim + 1]
     assert len(src.shape) == 3

-    out_channels, in_channels, kernel_size = weight.shape
+    kernel_size, out_channels, in_channels = weight.shape
     assert kernel_size == 3
     assert (out_channels % 4) == 0
     assert (in_channels % 4) == 0
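The meta function only does shape inference, so the layout fix changes which dimension each assert checks. A standalone sketch of the updated logic (a hypothetical mirror function; the real one is quantized_w8a32_conv_meta above):

import torch

def w8a32_conv_out_shape(src: torch.Tensor, weight: torch.Tensor) -> list[int]:
    # src: [batch, in_length, in_channels]; weight: [kernel_dim, out_ch, in_ch]
    assert len(src.shape) == 3
    kernel_size, out_channels, in_channels = weight.shape
    assert kernel_size == 3
    assert (out_channels % 4) == 0
    assert (in_channels % 4) == 0
    batch, in_length, _ = src.shape
    return [batch, out_channels, in_length - kernel_size + 1]

print(w8a32_conv_out_shape(torch.zeros(1, 16, 8), torch.zeros(3, 4, 8)))  # [1, 4, 14]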

backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 1 addition & 1 deletion
@@ -397,7 +397,7 @@ def get_args_and_kwargs_mixed_w8a32_conv(
     )
     transposed_weights = graph_module.graph.call_function(
         torch.ops.aten.permute.default,
-        (weights_inputs[0], [2, 0, 1]),  # NCL -> NLC
+        (weights_inputs[0], [2, 0, 1]),  # NCL -> LNC
     )

     args = (
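The corrected comment can be verified directly: permute with [2, 0, 1] moves the last axis to the front, so an [out_ch, in_ch, kernel_dim] (NCL) weight becomes [kernel_dim, out_ch, in_ch] (LNC), the layout quantized_w8a32_conv_meta above now expects; the old "NLC" comment described a different ordering. A quick check with plain torch:

import torch

w = torch.zeros(4, 8, 3)         # NCL: [out_ch=4, in_ch=8, kernel_dim=3]
print(w.permute(2, 0, 1).shape)  # torch.Size([3, 4, 8]) -> LNC, not NLC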
