microsoft · yuslepukhin · May 9, 2026 · Apr 24, 2026 · May 6, 2026 · May 7, 2026
diff --git a/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/conv_op_builder.cc
@@ -15,6 +15,15 @@
 namespace onnxruntime {
 namespace coreml {
 
+namespace {
+
+bool IsSupportedFusedConvActivation(const std::string& name) {  // `name`: FusedConv `activation` attribute value
+  return name == "Relu" || name == "Sigmoid" || name == "Tanh" ||
+         name == "LeakyRelu" || name == "Clip" || name == "HardSigmoid";  // the six names with a MIL op mapping
+}
+
+}  // namespace
+
 class ConvOpBuilder : public BaseOpBuilder {
   void AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const override;
 
@@ -92,9 +101,83 @@
 
     AddPadTypeAndPads(*conv_op, model_builder, op_type, helper, num_spatial_dims);
 
-    AddOperationOutput(*conv_op, *node.OutputDefs()[0]);
+    const bool is_fused_conv = node.OpType() == "FusedConv";
+    if (!is_fused_conv) {
+      AddOperationOutput(*conv_op, *node.OutputDefs()[0]);
+      model_builder.AddOperation(std::move(conv_op));
+    } else {
+      // com.microsoft:FusedConv = Conv + activation. Emit the conv into an
+      // intermediate value, then apply the activation MIL op to it, i.e. the
+      // unfused Conv -> activation pair that ConvActivationFusion started from.
+      const auto output_elem_type = static_cast<int32_t>(
+          node.OutputDefs()[0]->TypeAsProto()->tensor_type().elem_type());
+      std::vector<int64_t> output_shape;
+      ORT_RETURN_IF_NOT(GetShape(*node.OutputDefs()[0], output_shape, logger),
+                        "Failed to get FusedConv output shape");
+
+      const std::string& conv_out_name = model_builder.GetUniqueName(node, "fused_conv_conv_out");
+      AddIntermediateOperationOutput(*conv_op, conv_out_name, output_elem_type, output_shape);
+      model_builder.AddOperation(std::move(conv_op));
+
+      const std::string activation = helper.Get("activation", std::string(""));
+      const auto activation_params = helper.Get("activation_params", std::vector<float>{});
+
+      std::string_view mil_op;
+      if (activation == "Relu") {
+        mil_op = "relu";
+      } else if (activation == "Sigmoid") {
+        mil_op = "sigmoid";
+      } else if (activation == "Tanh") {
+        mil_op = "tanh";
+      } else if (activation == "LeakyRelu") {
+        mil_op = "leaky_relu";
+      } else if (activation == "Clip") {
+        mil_op = "clip";
+      } else if (activation == "HardSigmoid") {
+        mil_op = "sigmoid_hard";
+      } else {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
+                               "FusedConv has unsupported activation: ", activation);
+      }
+
+      auto act_op = model_builder.CreateOperation(node, mil_op, "activation");
+      AddOperationInput(*act_op, "x", conv_out_name);
+
+      auto add_scalar = [&](std::string_view port_name, float value) {
+        if (output_elem_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) {
+          AddOperationInput(*act_op, std::string(port_name),
+                            model_builder.AddScalarConstant(act_op->type(), std::string(port_name), value));
+        } else {
+          AddOperationInput(*act_op, std::string(port_name),
+                            model_builder.AddScalarConstant(act_op->type(), std::string(port_name), MLFloat16(value)));
+        }
+      };
+
+      // Activation-specific params. ConvActivationFusion packs them into
+      // `activation_params` in this order (see conv_activation_fusion.cc:165-184):
+      //   LeakyRelu: [alpha]
+      //   Clip:      [min, max]
+      //   HardSigmoid: [alpha, beta]
+      if (activation == "LeakyRelu") {
+        const float alpha = activation_params.empty() ? 0.01f : activation_params[0];
+        add_scalar("alpha", alpha);
+      } else if (activation == "Clip") {
+        const float min_v = activation_params.size() > 0 ? activation_params[0]
+                                                         : std::numeric_limits<float>::lowest();
+        const float max_v = activation_params.size() > 1 ? activation_params[1]
+                                                         : std::numeric_limits<float>::max();
+        add_scalar("alpha", min_v);
+        add_scalar("beta", max_v);
+      } else if (activation == "HardSigmoid") {
+        const float alpha = activation_params.size() > 0 ? activation_params[0] : 0.2f;
+        const float beta = activation_params.size() > 1 ? activation_params[1] : 0.5f;
+        add_scalar("alpha", alpha);
+        add_scalar("beta", beta);
+      }
 
-    model_builder.AddOperation(std::move(conv_op));
+      AddOperationOutput(*act_op, *node.OutputDefs()[0]);
+      model_builder.AddOperation(std::move(act_op));
+    }
   } else {
     std::unique_ptr<COREML_SPEC::NeuralNetworkLayer> layer = model_builder.CreateNNLayer(node);
 
@@ -232,6 +315,36 @@
                                       const logging::Logger& logger) const {
   const auto& name = node.Name();
   const auto& input_defs = node.InputDefs();
+  const bool is_fused_conv = node.OpType() == "FusedConv";
+
+  // FusedConv composes Conv with an activation op in a single node. Only
+  // implemented for the MLProgram path; fall back to CPU in NeuralNetwork mode
+  // rather than emitting an unfused Conv and losing the activation.
+  if (is_fused_conv) {
+    if (!input_params.create_mlprogram) {
+      LOGS(logger, VERBOSE) << "FusedConv is only supported in MLProgram format";
+      return false;
+    }
+
+    // FusedConv's schema has an optional 4th input, Z, that is added to the
+    // convolution result before the activation (emitted by ORT's
+    // Conv+Add+activation fusion). The builder only emits conv + activation,
+    // so claiming such a node would silently drop the Add.
+    constexpr size_t kSumInputIndex = 3;
+    if (input_defs.size() > kSumInputIndex && input_defs[kSumInputIndex]->Exists()) {
+      LOGS(logger, VERBOSE) << "FusedConv with the optional Z (sum) input is not supported";
+      return false;
+    }
+
+    // Reject activations the builder has no MIL op for.
+    NodeAttrHelper fused_helper(node);
+    const std::string activation = fused_helper.Get("activation", std::string(""));
+    if (!IsSupportedFusedConvActivation(activation)) {
+      LOGS(logger, VERBOSE) << "FusedConv activation [" << activation
+                            << "] is not supported by the CoreML EP";
+      return false;
+    }
+  }
 
   const auto& weight_name = input_defs[1]->Name();
   const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name);

diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc
@@ -26,8 +26,11 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() {
   CreateActivationOpBuilder("Elu", op_registrations);
   CreateActivationOpBuilder("HardSigmoid", op_registrations);
 
-  // Microsoft-domain ops produced by ORT's own optimizer passes
+  // Microsoft-domain ops produced by ORT's own optimizer passes.
   CreateQuickGeluOpBuilder("QuickGelu", op_registrations);
+  // FusedConv (from ConvActivationFusion) reuses the existing ConvOpBuilder
+  // which branches on op_type internally.
+  CreateConvOpBuilder("FusedConv", op_registrations);
 
   // Unary ops
   CreateUnaryOpBuilder("Erf", op_registrations);

diff --git a/onnxruntime/test/providers/coreml/coreml_basic_test.cc b/onnxruntime/test/providers/coreml/coreml_basic_test.cc
@@ -1164,6 +1164,138 @@ TEST(CoreMLExecutionProviderTest, QuickGeluTestFp16) {
 #endif
 }
 
+namespace {
+// Build a single-node com.microsoft:FusedConv model for the tests below; the arguments become the node's
+// `activation` / `activation_params` attributes. Input X is {1, 2, 4, 4}, weight W is {3, 2, 2, 2} (constant
+// initializer, set to a simple pattern), no bias. stride=1, pad=0. Output is {1, 3, 3, 3}.
+ONNX_NAMESPACE::ModelProto MakeFusedConvModel(const std::string& activation,
+                                              const std::vector<float>& activation_params) {
+  ONNX_NAMESPACE::ModelProto model_proto;
+  model_proto.set_ir_version(ONNX_NAMESPACE::IR_VERSION);
+  auto* onnx_opset = model_proto.add_opset_import();
+  onnx_opset->set_domain("");  // "" = default ONNX domain
+  onnx_opset->set_version(13);
+  auto* ms_opset = model_proto.add_opset_import();
+  ms_opset->set_domain("com.microsoft");
+  ms_opset->set_version(1);
+
+  auto* graph_proto = model_proto.mutable_graph();
+  graph_proto->set_name("fused_conv_test");
+
+  auto add_tensor_value = [&](auto* proto, const char* name, const std::vector<int64_t>& shape) {
+    proto->set_name(name);
+    auto* tt = proto->mutable_type()->mutable_tensor_type();
+    tt->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+    for (int64_t d : shape) tt->mutable_shape()->add_dim()->set_dim_value(d);
+  };
+  add_tensor_value(graph_proto->add_input(), "X", {1, 2, 4, 4});
+  add_tensor_value(graph_proto->add_output(), "Y", {1, 3, 3, 3});
+
+  // Weight initializer: {3, 2, 2, 2} = 24 floats, deterministic pattern.
+  auto* w_init = graph_proto->add_initializer();
+  w_init->set_name("W");
+  w_init->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT);
+  for (int64_t d : {3, 2, 2, 2}) w_init->add_dims(d);
+  for (int i = 0; i < 3 * 2 * 2 * 2; ++i) {
+    w_init->add_float_data(static_cast<float>(i) * 0.05f - 0.4f);  // ramp from -0.4 up to 0.75
+  }
+
+  auto* node = graph_proto->add_node();
+  node->set_op_type("FusedConv");
+  node->set_domain("com.microsoft");
+  node->add_input("X");
+  node->add_input("W");  // only X and W are wired: no bias input
+  node->add_output("Y");
+
+  // Set pads explicitly since the CoreML conv builder's VALID-pad branch
+  // omits the 'pad' input that the MIL op requires. Conv attrs otherwise
+  // default: strides=[1,1].
+  auto* pads_attr = node->add_attribute();
+  pads_attr->set_name("pads");
+  pads_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_INTS);
+  for (int64_t v : {0, 0, 0, 0}) pads_attr->add_ints(v);
+
+  auto* act_attr = node->add_attribute();
+  act_attr->set_name("activation");
+  act_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_STRING);
+  act_attr->set_s(activation);
+
+  if (!activation_params.empty()) {  // attribute is omitted entirely when there are no params
+    auto* act_params_attr = node->add_attribute();
+    act_params_attr->set_name("activation_params");
+    act_params_attr->set_type(ONNX_NAMESPACE::AttributeProto_AttributeType_FLOATS);
+    for (float v : activation_params) act_params_attr->add_floats(v);
+  }
+
+  return model_proto;
+}
+
+void RunFusedConvTest(const std::string& activation,                 // FusedConv `activation` attribute
+                      const std::vector<float>& activation_params,  // `activation_params`; may be empty
+                      [[maybe_unused]] std::string_view log_id) {   // only read in the __APPLE__ branch
+  auto model_proto = MakeFusedConvModel(activation, activation_params);
+  std::string model_data;
+  ASSERT_TRUE(model_proto.SerializeToString(&model_data));
+  gsl::span<const std::byte> model_span{reinterpret_cast<const std::byte*>(model_data.data()), model_data.size()};
+
+#if defined(__APPLE__)  // Apple: run the model via RunAndVerifyOutputsWithEP
+  std::vector<float> x_data(1 * 2 * 4 * 4);
+  for (size_t i = 0; i < x_data.size(); ++i) x_data[i] = static_cast<float>(i) * 0.1f - 1.5f;  // about -1.5 .. 1.6
+  OrtValue ml_value_x;
+  AllocatorPtr allocator = CPUAllocator::DefaultInstance();
+  CreateMLValue<float>(allocator, {1, 2, 4, 4}, x_data, &ml_value_x);
+
+  NameMLValMap feeds;
+  feeds.insert(std::make_pair("X", ml_value_x));
+
+  RunAndVerifyOutputsWithEP(model_span, std::string(log_id),
+                            MakeCoreMLExecutionProvider("MLProgram"),
+                            feeds,
+                            EPVerificationParams{ExpectedEPNodeAssignment::All});
+#else  // non-Apple: TestModelLoad only (presumably load + node-assignment check, no execution)
+  TestModelLoad(model_span, MakeCoreMLExecutionProvider("MLProgram"), ExpectedEPNodeAssignment::All);
+#endif
+}
+}  // namespace
+
+TEST(CoreMLExecutionProviderTest, FusedConvTestRelu) {
+  // Param-less activation. Exercises the Conv -> activation wiring when the node
+  // carries no `activation_params` attribute (an empty vector means it is never added).
+  RunFusedConvTest("Relu", {}, "FusedConvTestRelu_MLProgram");
+}
+
+TEST(CoreMLExecutionProviderTest, FusedConvTestHardSigmoid) {
+  // Two-param activation (alpha, beta) with non-default values (defaults are 0.2 / 0.5) - catches any
+  // activation_params-wiring bug. Depends on the HardSigmoid CoreML builder landed in #28182.
+  // NOTE(review): the FusedConv branch emits `sigmoid_hard` directly - confirm this dependency still holds.
+  RunFusedConvTest("HardSigmoid", {0.15f, 0.55f}, "FusedConvTestHardSigmoid_MLProgram");
+}
+
+TEST(CoreMLExecutionProviderTest, FusedConvTestClip) {
+  // Two-param activation where params map to alpha=min, beta=max in CoreML's
+  // clip op. Covers the [min, max] ordering of `activation_params`.
+  RunFusedConvTest("Clip", {-0.5f, 0.5f}, "FusedConvTestClip_MLProgram");
+}
+
+TEST(CoreMLExecutionProviderTest, FusedConvTestLeakyRelu) {
+  // Single-param activation (alpha = 0.1, not the 0.01 fallback). Heavily used by YOLOv3 - a CPU-optimized
+  // YOLOv3 graph contains 72 Conv->LeakyRelu fusions, all of which would
+  // otherwise fall back to CPU and fragment the CoreML partition.
+  RunFusedConvTest("LeakyRelu", {0.1f}, "FusedConvTestLeakyRelu_MLProgram");
+}
+
+TEST(CoreMLExecutionProviderTest, FusedConvTestSigmoid) {
+  // Param-less Sigmoid activation. Distinct from the Relu test only in the
+  // emitted MIL op (`sigmoid` vs `relu`); guards against regressions in the
+  // activation -> MIL op-name dispatch.
+  RunFusedConvTest("Sigmoid", {}, "FusedConvTestSigmoid_MLProgram");
+}
+
+TEST(CoreMLExecutionProviderTest, FusedConvTestTanh) {
+  // Param-less Tanh activation; like the Sigmoid test it differs from Relu only
+  // in the emitted MIL op (`tanh`), covering the last param-less dispatch entry.
+  RunFusedConvTest("Tanh", {}, "FusedConvTestTanh_MLProgram");
+}
 #endif  // !(ORT_MINIMAL_BUILD)
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md
@@ -53,3 +53,4 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution
 |ai.onnx:Transpose||
 |ai.onnx:Unsqueeze||
 |com.microsoft:QuickGelu|Produced by ORT's `QuickGeluFusion` optimizer pass. Decomposed into `mul` / `sigmoid` / `mul`.|
+|com.microsoft:FusedConv|Produced by ORT's `ConvActivationFusion` pass. Decomposed into `conv` + the fused activation (`Relu`, `Sigmoid`, `Tanh`, `LeakyRelu`, `Clip`, `HardSigmoid`).|