
Commit 61ead64

Commit message: Update [ghstack-poisoned]

2 parents: 780d883 + 4eaa345 · commit 61ead64

File tree

4 files changed: +35 −92 lines

  backends/apple/metal/runtime/shims/et_metal_ops.h
  backends/apple/metal/runtime/shims/et_metal_ops.mm
  backends/cadence/aot/compiler.py
  examples/models/llama/export_llama_lib.py

backends/apple/metal/runtime/shims/et_metal_ops.h

Lines changed: 0 additions & 13 deletions

@@ -18,19 +18,6 @@ namespace metal {
 extern "C" {
 #endif
 
-/**
- * ExecutorTorch implementation of aoti_torch_mps_addmm_out.
- * Performs matrix multiplication with bias: out = beta * self + alpha * (mat1 @
- * mat2)
- */
-AOTITorchError aoti_torch_mps_addmm_out(
-    AOTITensorHandle out,
-    AOTITensorHandle self,
-    AOTITensorHandle mat1,
-    AOTITensorHandle mat2,
-    double beta,
-    double alpha);
-
 /**
  * ExecutorTorch implementation of aoti_torch_mps_mm_out.
  * Performs simple matrix multiplication: out = self @ mat2
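
For reference, the removed declaration documented torch.addmm semantics: out = beta * self + alpha * (mat1 @ mat2). A minimal Python sketch of that math, illustrative only (shapes and values are made up and are not part of this commit):

    import torch

    # out = beta * self + alpha * (mat1 @ mat2), per the removed doc comment.
    self_t = torch.randn(3, 5)  # the bias term ("self" in the shim signature)
    mat1 = torch.randn(3, 4)
    mat2 = torch.randn(4, 5)
    beta, alpha = 1.0, 1.0

    out = beta * self_t + alpha * (mat1 @ mat2)
    ref = torch.addmm(self_t, mat1, mat2, beta=beta, alpha=alpha)
    assert torch.allclose(out, ref)  # both compute the same result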

backends/apple/metal/runtime/shims/et_metal_ops.mm

Lines changed: 9 additions & 56 deletions
@@ -229,59 +229,6 @@ AOTITorchError aoti_torch_mps_mm_out(
   }
 }
 
-AOTITorchError aoti_torch_mps_addmm_out(
-    AOTITensorHandle out,
-    AOTITensorHandle self,
-    AOTITensorHandle mat1,
-    AOTITensorHandle mat2,
-    double beta,
-    double alpha) {
-  ET_LOG(Debug, "aoti_torch_mps_addmm_out: Starting with out=%p, self=%p, mat1=%p, mat2=%p, beta=%f, alpha=%f",
-      out, self, mat1, mat2, beta, alpha);
-
-  if (!out || !self || !mat1 || !mat2) {
-    ET_LOG(Error, "aoti_torch_mps_addmm_out: null tensor handles");
-    return Error::InvalidArgument;
-  }
-
-  @autoreleasepool {
-    try {
-      // Convert AOTITensorHandle to ExecutorTorch tensors
-      auto out_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(out);
-      auto self_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(self);
-      auto mat1_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(mat1);
-      auto mat2_tensor = reinterpret_cast<executorch::runtime::etensor::Tensor*>(mat2);
-
-      ET_LOG(Debug, "aoti_torch_mps_addmm_out: Converted tensor handles to ET tensors");
-
-      // For now, just zero out the output tensor to get the right shape
-      // TODO: Implement actual matrix multiplication: out = beta * self + alpha * (mat1 @ mat2)
-
-      // Get output data pointer and size
-      float* out_data = static_cast<float*>(out_tensor->mutable_data_ptr());
-      size_t out_numel = out_tensor->numel();
-
-      if (!out_data) {
-        ET_LOG(Error, "aoti_torch_mps_addmm_out: null output data pointer");
-        return Error::InvalidArgument;
-      }
-
-      // Zero out the output tensor
-      std::memset(out_data, 0, out_numel * sizeof(float));
-
-      ET_LOG(Debug, "aoti_torch_mps_addmm_out: Zeroed output tensor with %zu elements", out_numel);
-      return Error::Ok;
-
-    } catch (const std::exception& e) {
-      ET_LOG(Error, "aoti_torch_mps_addmm_out exception: %s", e.what());
-      return Error::Internal;
-    } catch (...) {
-      ET_LOG(Error, "aoti_torch_mps_addmm_out: unknown exception");
-      return Error::Internal;
-    }
-  }
-}
-
 AOTITorchError aoti_torch_mps_convolution(
     AOTITensorHandle input,
     AOTITensorHandle weight,
@@ -743,7 +690,7 @@ AOTITorchError aoti_torch_mps_convolution(
         output_strides.data(),
         0, // storage_offset
         dtype, // dtype
-        2, // device_type (MPS)
+        13, // device_type (MPS)
         0, // device_index
         &output_tensor_handle,
         0, // layout (strided)
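
All three device_type hunks in this file make the same correction: assuming these integer codes mirror PyTorch's c10::DeviceType enum (an assumption based on upstream c10/core/DeviceType.h; the diff itself only shows the constants), 13 is MPS while the old literal 2 is MKLDNN, so the previous value did not match its "(MPS)" comment. A quick illustrative check via a private torch binding:

    import torch

    # Assumption: the shim's device codes mirror c10::DeviceType.
    # torch._C._autograd.DeviceType is a private binding of that enum,
    # so treat this as a sanity sketch, not a stable API.
    assert int(torch._C._autograd.DeviceType.MPS) == 13
    assert int(torch._C._autograd.DeviceType.MKLDNN) == 2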
@@ -859,6 +806,12 @@ AOTITorchError aoti_torch_mps__scaled_dot_product_attention_math_for_mps(
 
   ET_LOG(Debug, "aoti_torch_mps__scaled_dot_product_attention_math_for_mps: mps_dtype=%d, element_size=%zu", mps_dtype, element_size);
 
+  // Check that headSize is not zero to avoid division by zero
+  if (headSize == 0) {
+    ET_LOG(Error, "aoti_torch_mps__scaled_dot_product_attention_math_for_mps: headSize is zero");
+    throw std::runtime_error("headSize must be non-zero for scaled dot product attention");
+  }
+
   // Calculate scale factor
   double scale_factor = scale ? *scale : (1.0 / sqrt(static_cast<double>(headSize)));
   ET_LOG(Debug, "aoti_torch_mps__scaled_dot_product_attention_math_for_mps: scale_factor=%f", scale_factor);
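
The new guard protects the default-scale branch directly below it, where headSize appears in a denominator. A small Python sketch of the same rule (illustrative; names simplified from the C++):

    import math

    def sdpa_scale(head_size: int, scale: float | None = None) -> float:
        # Mirrors the guarded C++ logic: an explicit scale wins,
        # otherwise fall back to 1/sqrt(head_size).
        if head_size == 0:
            raise ValueError("headSize must be non-zero for scaled dot product attention")
        return scale if scale is not None else 1.0 / math.sqrt(head_size)

    print(sdpa_scale(64))  # 0.125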
@@ -1193,7 +1146,7 @@ AOTITorchError aoti_torch_mps__scaled_dot_product_attention_math_for_mps(
         out_strides.data(),
         0, // storage_offset
         dtype,
-        2, // device_type (MPS)
+        13, // device_type (MPS)
         0, // device_index
         &out_tensor_handle,
         0, // layout (strided)
@@ -1208,7 +1161,7 @@ AOTITorchError aoti_torch_mps__scaled_dot_product_attention_math_for_mps(
         attn_strides.data(),
         0, // storage_offset
         dtype,
-        2, // device_type (MPS)
+        13, // device_type (MPS)
         0, // device_index
         &attn_tensor_handle,
         0, // layout (strided)

backends/cadence/aot/compiler.py

Lines changed: 2 additions & 2 deletions
@@ -38,7 +38,7 @@
     ExecutorchProgramManager,
 )
 from executorch.exir.passes import ToOutVarPass
-from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass
+from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass
 from executorch.exir.program._program import to_edge
 
 from torch.export.exported_program import ExportedProgram
@@ -460,7 +460,7 @@ def _lower_ep_to_cadence_gen_etrecord(
             emit_stacktrace=False,
             to_out_var_pass=ToOutVarPass(),
             extract_delegate_segments=False,
-            sym_shape_eval_pass=HintBasedSymShapeEvalPass(),
+            sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
         ),
     )
 
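
As the pass names suggest, the swap changes how upper bounds for symbolic shapes are resolved at emit time: ConstraintBasedSymShapeEvalPass works from the exported program's range constraints rather than from example-input hints. A minimal sketch of the touched configuration (keyword name taken from the diff; all other fields left at their defaults):

    from executorch.exir import ExecutorchBackendConfig
    from executorch.exir.passes.sym_shape_eval_pass import (
        ConstraintBasedSymShapeEvalPass,
    )

    # Only the field this commit touches is shown.
    config = ExecutorchBackendConfig(
        sym_shape_eval_pass=ConstraintBasedSymShapeEvalPass(),
    )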

examples/models/llama/export_llama_lib.py

Lines changed: 24 additions & 21 deletions
@@ -874,6 +874,7 @@ def _to_edge_and_lower_llama_xnnpack(
     xnnpack_extended_ops: bool = False,
     generate_etrecord: bool = False,
     verbose: bool = False,
+    gen_tag_fn: Optional[Callable[[torch.fx.Node], Optional[str]]] = None,
 ) -> LLMEdgeManager:  # noqa: C901
     partitioners = []
 
@@ -896,9 +897,27 @@ def _to_edge_and_lower_llama_xnnpack(
     if generate_etrecord:
         builder_exported.generate_etrecord = True
 
-    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
-        partitioners
-    )
+    builder = builder_exported.pt2e_quantize(quantizers)
+    if gen_tag_fn is not None:
+        from executorch.exir.passes.external_constants_pass import (
+            delegate_external_constants_pass_unlifted,
+            external_constants_pass,
+        )
+
+        assert (
+            builder_exported.pre_autograd_graph_module is not None
+        ), "pre_autograd_graph_module shouldn't be None here"
+        delegate_external_constants_pass_unlifted(
+            module=builder_exported.pre_autograd_graph_module,
+            gen_tag_fn=gen_tag_fn,
+        )
+
+        # Also add a pass for 'to_executorch' to tag weights that aren't delegated.
+        additional_passes.append(
+            partial(external_constants_pass, gen_tag_fn=gen_tag_fn)
+        )
+
+    builder = builder.to_edge_transform_and_lower(partitioners)
     if verbose:
         print_delegation_info(builder.edge_manager.exported_program().graph_module)
 
@@ -1136,6 +1155,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
         llm_config.backend.xnnpack.enabled = True
 
     if llm_config.backend.xnnpack.enabled:
+        gen_tag_fn = None
         if (
             llm_config.export.foundation_weights_file is not None
             or llm_config.export.lora_weights_file is not None
@@ -1145,24 +1165,6 @@
                 if "lora" not in x.name
                 else llm_config.export.lora_weights_file
             )
-            from executorch.exir.passes.external_constants_pass import (
-                delegate_external_constants_pass_unlifted,
-                external_constants_pass,
-            )
-
-            assert (
-                builder_exported.pre_autograd_graph_module is not None
-            ), "pre_autograd_graph_module shouldn't be None here"
-            delegate_external_constants_pass_unlifted(
-                module=builder_exported.pre_autograd_graph_module,
-                gen_tag_fn=gen_tag_fn,
-            )
-
-            # Also add a pass for 'to_executorch' to tag weights that aren't delegated.
-            additional_passes.append(
-                partial(external_constants_pass, gen_tag_fn=gen_tag_fn)
-            )
-
         builder = _to_edge_and_lower_llama_xnnpack(
             builder_exported,
             modelname,
@@ -1173,6 +1175,7 @@ def _export_llama(llm_config: LlmConfig) -> LLMEdgeManager:  # noqa: C901
             xnnpack_extended_ops=llm_config.backend.xnnpack.extended_ops,
             generate_etrecord=llm_config.debug.generate_etrecord,
             verbose=llm_config.debug.verbose,
+            gen_tag_fn=gen_tag_fn,
         )
     elif llm_config.backend.openvino.enabled:
         builder = _to_edge_and_lower_llama_openvino(
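
Net effect: the external-constants tagging moves out of _export_llama into _to_edge_and_lower_llama_xnnpack, threaded through the new gen_tag_fn parameter, so tagging now runs between quantization and lowering. The callback itself is unchanged; a sketch of its shape, reconstructed from the retained context above (the .ptd file names are placeholders, not values from this commit):

    from typing import Optional

    import torch

    # Matches the Optional[Callable[[torch.fx.Node], Optional[str]]] signature
    # added to _to_edge_and_lower_llama_xnnpack: nodes whose names contain
    # "lora" are tagged into the LoRA weights file, everything else into the
    # foundation weights file.
    def gen_tag_fn(x: torch.fx.Node) -> Optional[str]:
        return "foundation.ptd" if "lora" not in x.name else "lora.ptd"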
