apache · WeichenXu123 · Mar 3, 2023 · Mar 6, 2023 · Mar 6, 2023 · Mar 6, 2023
diff --git a/connector/connect/common/src/main/protobuf/spark/connect/base.proto b/connector/connect/common/src/main/protobuf/spark/connect/base.proto
@@ -24,6 +24,7 @@ import "spark/connect/commands.proto";
 import "spark/connect/expressions.proto";
 import "spark/connect/relations.proto";
 import "spark/connect/types.proto";
+import "spark/connect/ml.proto";
 
 option java_multiple_files = true;
 option java_package = "org.apache.spark.connect.proto";
@@ -36,6 +37,7 @@ message Plan {
   oneof op_type {
     Relation root = 1;
     Command command = 2;
+    MlCommand ml_command = 3;
   }
 }
 
@@ -261,6 +263,9 @@ message ExecutePlanResponse {
     // Special case for executing SQL commands.
     SqlCommandResult sql_command_result = 5;
 
+    // ML command response
+    MlCommandResponse ml_command_result = 100;
+
     // Support arbitrary result objects.
     google.protobuf.Any extension = 999;
   }

diff --git a/connector/connect/common/src/main/protobuf/spark/connect/ml.proto b/connector/connect/common/src/main/protobuf/spark/connect/ml.proto
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+package spark.connect;
+
+import "spark/connect/expressions.proto";
+import "spark/connect/relations.proto";
+import "spark/connect/ml_common.proto";
+
+option java_multiple_files = true;
+option java_package = "org.apache.spark.connect.proto";
+
+
+// MlEvaluator describes an ML evaluator by its registry name, param settings and uid
+message MlEvaluator {
+  // The name of the evaluator in the registry
+  string name = 1;
+  // Param settings for the evaluator (user-supplied and default values, see MlParams)
+  MlParams params = 2;
+  // Unique id of the evaluator
+  string uid = 3;
+}
+
+
+// MlCommand is a type container in which exactly one ML command (a oneof member below) is set
+message MlCommand {
+  oneof ml_command_type {
+    // call `estimator.fit` and return a model
+    Fit fit = 1;
+    // get a model attribute
+    FetchModelAttr fetch_model_attr = 2;
+    // get a model summary attribute
+    FetchModelSummaryAttr fetch_model_summary_attr = 3;
+    // load a model
+    LoadModel load_model = 4;
+    // save a model
+    SaveModel save_model = 5;
+    // call `evaluator.evaluate`
+    Evaluate evaluate = 6;
+    // save an estimator or transformer
+    SaveStage save_stage = 7;
+    // load an estimator or transformer
+    LoadStage load_stage = 8;
+    // save an evaluator
+    SaveEvaluator save_evaluator = 9;
+    // load an evaluator
+    LoadEvaluator load_evaluator = 10;
+    // copy a model; returns the new model reference id
+    CopyModel copy_model = 11;
+    // delete the server side model object identified by a model reference id
+    DeleteModel delete_model = 12;
+  }
+
+  message Fit { // payload of `fit`: an estimator plus the dataset relation it is fitted on
+    MlStage estimator = 1;
+    Relation dataset = 2;
+  }
+
+  message Evaluate { // payload of `evaluate`
+    MlEvaluator evaluator = 1; // NOTE(review): no dataset field, unlike Fit - confirm intent
+  }
+
+  message LoadModel { // payload of `load_model`
+    string name = 1; // presumably the model's name in the registry - TODO confirm
+    string path = 2; // loading path
+  }
+
+  message SaveModel { // payload of `save_model`
+    int64 model_ref_id = 1; // reference id of the server side model object to save
+    string path = 2; // saving path
+    bool overwrite = 3; // presumably: replace existing data at `path` - TODO confirm
+    map<string, string> options = 4; // saving options
+  }
+
+  message LoadStage { // payload of `load_stage`
+    string name = 1; // presumably the stage's name in the registry - TODO confirm
+    string path = 2; // loading path
+    MlStage.StageType type = 3; // kind of stage to load, see MlStage.StageType
+  }
+
+  message SaveStage { // payload of `save_stage`
+    MlStage stage = 1; // the estimator or transformer to save
+    string path = 2; // saving path
+    bool overwrite = 3; // presumably: replace existing data at `path` - TODO confirm
+    map<string, string> options = 4; // saving options
+  }
+
+  message LoadEvaluator { // payload of `load_evaluator`
+    string name = 1; // presumably the evaluator's name in the registry - TODO confirm
+    string path = 2; // loading path
+  }
+
+  message SaveEvaluator { // payload of `save_evaluator`
+    MlEvaluator evaluator = 1; // the evaluator to save
+    string path = 2; // saving path
+    bool overwrite = 3; // presumably: replace existing data at `path` - TODO confirm
+    map<string, string> options = 4; // saving options
+  }
+
+  message FetchModelAttr { // payload of `fetch_model_attr`
+    int64 model_ref_id = 1; // reference id of the server side model object
+    string name = 2; // name of the model attribute to get
+  }
+
+  message FetchModelSummaryAttr { // payload of `fetch_model_summary_attr`
+    int64 model_ref_id = 1; // reference id of the server side model object
+    string name = 2; // name of the model summary attribute to get
+    MlParams params = 3; // NOTE(review): role of these params is undocumented - confirm
+
+    // Evaluation dataset used to compute
+    // the summary attribute.
+    // If not set, the attribute is taken from
+    // model.summary (i.e. the summary on the training dataset)
+    optional Relation evaluation_dataset = 4;
+  }
+
+  message CopyModel { // payload of `copy_model`
+    int64 model_ref_id = 1; // reference id of the model to copy
+  }
+
+  message DeleteModel { // payload of `delete_model`
+    int64 model_ref_id = 1; // reference id of the server side model object to delete
+  }
+}
+
+
+message MlCommandResponse { // ML command response, see ExecutePlanResponse.ml_command_result
+  oneof ml_command_response_type { // at most one of the result kinds below is set
+    Expression.Literal literal = 1;
+    ModelInfo model_info = 2;
+    Vector vector = 3;
+    Matrix matrix = 4;
+    MlStage stage = 5;
+  }
+  message ModelInfo { // describes a model by its reference id, uid and param settings
+    int64 model_ref_id = 1; // model reference id, as accepted by the MlCommand model commands
+    string model_uid = 2; // uid of the model
+    MlParams params = 3; // param settings of the model
+  }
+}
+
+
+message Vector { // a vector of doubles, carried in either dense or sparse form
+  oneof one_of { // NOTE(review): generic oneof name; a descriptive one would read better
+    Dense dense = 1;
+    Sparse sparse = 2;
+  }
+  message Dense {
+    repeated double value = 1; // the vector entries
+  }
+  message Sparse {
+    int32 size = 1; // presumably the full length of the vector - TODO confirm
+    repeated double index = 2; // NOTE(review): looks like integer positions but typed double
+    repeated double value = 3; // presumably value[i] is the entry at index[i] - TODO confirm
+  }
+}
+
+message Matrix { // a matrix of doubles, carried in either dense or sparse form
+  oneof one_of { // NOTE(review): generic oneof name; a descriptive one would read better
+    Dense dense = 1;
+    Sparse sparse = 2;
+  }
+  message Dense {
+    int32 num_rows = 1; // number of rows
+    int32 num_cols = 2; // number of columns
+    repeated double value = 3; // matrix entries; presumably num_rows * num_cols - TODO confirm
+    bool is_transposed = 4; // presumably marks a transposed storage layout - TODO confirm
+  }
+  message Sparse {
+    int32 num_rows = 1; // number of rows
+    int32 num_cols = 2; // number of columns
+    repeated double colptr = 3; // NOTE(review): looks like integer offsets but typed double
+    repeated double row_index = 4; // NOTE(review): looks like integer indices but typed double
+    repeated double value = 5; // the stored entries
+    bool is_transposed = 6; // presumably marks a transposed storage layout - TODO confirm
+  }
+}
diff --git a/connector/connect/common/src/main/protobuf/spark/connect/ml_common.proto b/connector/connect/common/src/main/protobuf/spark/connect/ml_common.proto
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+package spark.connect;
+
+import "spark/connect/expressions.proto";
+
+option java_multiple_files = true;
+option java_package = "org.apache.spark.connect.proto";
+
+
+// MlParams stores the param settings of an
+// ML Estimator / Transformer / Model / Evaluator
+message MlParams {
+  // user-supplied params, as a map from string key to literal value
+  map<string, Expression.Literal> params = 1;
+  // default params, as a map from string key to literal value
+  map<string, Expression.Literal> default_params = 2;
+}
+
+// MlStage stores the data of an ML stage (an Estimator or a Transformer)
+message MlStage {
+  // The name of the stage in the registry
+  string name = 1;
+  // Param settings for the stage
+  MlParams params = 2;
+  // Unique id of the stage
+  string uid = 3;
+  StageType type = 4; // kind of stage, one of the StageType values below
+  enum StageType { // NOTE(review): value names carry no type prefix - check for name clashes
+    UNSPECIFIED = 0; // zero value, i.e. the proto3 default when `type` is not set
+    ESTIMATOR = 1;
+    TRANSFORMER = 2;
+  }
+}
diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto
@@ -23,6 +23,7 @@ import "google/protobuf/any.proto";
 import "spark/connect/expressions.proto";
 import "spark/connect/types.proto";
 import "spark/connect/catalog.proto";
+import "spark/connect/ml_common.proto";
 
 option java_multiple_files = true;
 option java_package = "org.apache.spark.connect.proto";
@@ -82,13 +83,50 @@ message Relation {
     // Catalog API (experimental / unstable)
     Catalog catalog = 200;
 
+    // ML relation
+    MlRelation ml_relation = 300;
+
     // This field is used to mark extensions to the protocol. When plugins generate arbitrary
     // relations they can add them here. During the planning the correct resolution is done.
     google.protobuf.Any extension = 998;
     Unknown unknown = 999;
   }
 }
 
+message MlRelation { // ML relation, referenced from Relation as `ml_relation = 300`
+  oneof ml_relation_type { // at most one of the relation kinds below is set
+    ModelTransform model_transform = 1;
+    FeatureTransform feature_transform = 2;
+    ModelAttr model_attr = 3;
+    ModelSummaryAttr model_summary_attr = 4;
+  }
+  message ModelTransform {
+    Relation input = 1; // input relation
+    int64 model_ref_id = 2; // model reference id; presumably the model applied to `input`
+    MlParams params = 3; // NOTE(review): role of these params is undocumented - confirm
+  }
+  message FeatureTransform {
+    Relation input = 1; // input relation
+    MlStage transformer = 2; // presumably a stage of type TRANSFORMER - TODO confirm
+  }
+  message ModelAttr {
+    int64 model_ref_id = 1; // model reference id
+    string name = 2; // presumably the name of the model attribute - TODO confirm
+  }
+  message ModelSummaryAttr {
+    int64 model_ref_id = 1; // model reference id
+    string name = 2; // presumably the name of the summary attribute - TODO confirm
+    MlParams params = 3; // NOTE(review): role of these params is undocumented - confirm
+
+    // Evaluation dataset used to compute
+    // the summary attribute.
+    // If not set, the attribute is taken from
+    // model.summary (i.e. the summary on the training dataset)
+    optional Relation evaluation_dataset = 4;
+  }
+}
+
+
 // Used for testing purposes only.
 message Unknown {}