Merged
Changes from all commits
70 commits
1979169
Revert "[SPARK-41369][CONNECT] Remove unneeded connect server deps"
HyukjinKwon Dec 9, 2022
24a588c
[MINOR][CONNECT][DOCS] Document parallelism=1 in Spark Connect testing
HyukjinKwon Dec 9, 2022
3212fa9
[SPARK-41439][CONNECT][PYTHON] Implement `DataFrame.melt` and `DataFr…
beliefer Dec 9, 2022
bd37b10
[SPARK-41377][BUILD] Fix spark-version-info.properties not found on W…
GauthamBanasandra Dec 9, 2022
83c12c2
[SPARK-39948][BUILD] Exclude hive-vector-code-gen dependency
zhouyifan279 Dec 9, 2022
bc32392
[MINOR][DOC] Fix typo in SqlBaseLexer.g4
Dec 9, 2022
fec210b
[SPARK-41395][SQL] `InterpretedMutableProjection` should use `setDeci…
bersprockets Dec 9, 2022
0cfda39
[SPARK-41446][CONNECT][PYTHON] Make `createDataFrame` support schema …
zhengruifeng Dec 9, 2022
dd0bd07
[SPARK-40270][PS][FOLLOWUP] Skip test_style when pandas <1.3.0
Yikun Dec 9, 2022
be52d67
[SPARK-41458][BUILD][YARN][SHUFFLE] Correctly transform the SPI servi…
pan3793 Dec 9, 2022
acdacf4
[SPARK-41402][SQL][CONNECT] Override prettyName of StringDecode
HyukjinKwon Dec 9, 2022
fc3c0f1
[SPARK-41450][BUILD] Fix shading in `core` module
pan3793 Dec 9, 2022
928eab6
[SPARK-41462][SQL] Date and timestamp type can up cast to TimestampNTZ
gengliangwang Dec 9, 2022
e371c53
[SPARK-41466][BUILD] Change Scala Style configuration to catch AnyFun…
HyukjinKwon Dec 9, 2022
29a7011
[SPARK-41225][CONNECT][PYTHON][FOLLOW-UP] Disable unsupported functions
grundprinzip Dec 9, 2022
9bdaed1
[SPARK-41456][SQL] Improve the performance of try_cast
gengliangwang Dec 9, 2022
deabae7
[SPARK-41329][CONNECT] Resolve circular imports in Spark Connect
HyukjinKwon Dec 10, 2022
435f6b1
[SPARK-41414][CONNECT][PYTHON] Implement date/timestamp functions
xinrong-meng Dec 10, 2022
c78f935
[SPARK-41474][PROTOBUF][BUILD] Exclude `proto` files from `spark-prot…
dongjoon-hyun Dec 10, 2022
3084cc5
[SPARK-41475][CONNECT] Fix lint-scala command error and typo
dengziming Dec 10, 2022
bf4981f
[SPARK-41457][PYTHON][TESTS] Refactor type annotations and dependency…
HyukjinKwon Dec 10, 2022
ef9113f
[SPARK-41467][BUILD] Upgrade httpclient from 4.5.13 to 4.5.14
panbingkun Dec 10, 2022
6972341
[SPARK-41417][CORE][SQL] Rename `_LEGACY_ERROR_TEMP_0019` to `INVALID…
LuciferYang Dec 10, 2022
c4af4b0
[SPARK-41476][INFRA] Prevent `README.md` from triggering CIs
dongjoon-hyun Dec 10, 2022
92655db
[SPARK-41443][SQL] Assign a name to the error class _LEGACY_ERROR_TEM…
panbingkun Dec 10, 2022
d4dca2d
[SPARK-41459][SQL] fix thrift server operation log output is empty
idealspark Dec 10, 2022
1faf26b
[SPARK-41460][CORE] Introduce `IsolatedThreadSafeRpcEndpoint` to exte…
Ngone51 Dec 10, 2022
fcdd6dc
[SPARK-41461][BUILD][CORE] Support user configurable protoc executabl…
Dec 11, 2022
af33722
[SPARK-41404][SQL] Refactor `ColumnVectorUtils#toBatch` to make `Col…
LuciferYang Dec 11, 2022
f92c827
[SPARK-41008][MLLIB] Follow-up isotonic regression features deduplica…
ahmed-mahran Dec 11, 2022
a404261
[SPARK-41479][K8S][DOCS] Add `IPv4 and IPv6` section to K8s document
dongjoon-hyun Dec 11, 2022
7cf348c
[SPARK-41477][CONNECT][PYTHON] Correctly infer the datatype of litera…
zhengruifeng Dec 12, 2022
ef9c8e0
[SPARK-41439][CONNECT][PYTHON][FOLLOWUP] Make unpivot of `connect/dat…
beliefer Dec 12, 2022
7e7bc94
[SPARK-41187][CORE] LiveExecutor MemoryLeak in AppStatusListener when…
Dec 12, 2022
5d52bb3
[SPARK-41486][SQL][TESTS] Upgrade `MySQL` docker image to 8.0.31 to s…
dongjoon-hyun Dec 12, 2022
cd2f786
[SPARK-41463][SQL][TESTS] Ensure error class names contain only capit…
bozhang2820 Dec 12, 2022
b8a91da
[SPARK-41484][CONNECT][PYTHON] Implement `collection` functions: E~M
zhengruifeng Dec 12, 2022
3952833
[MINOR][PYTHON][DOCS] Correct the type hint for `from_csv`
zhengruifeng Dec 12, 2022
63226ca
[SPARK-41492][CONNECT][PYTHON] Implement MISC functions
zhengruifeng Dec 12, 2022
1b2d700
[SPARK-41468][SQL] Fix PlanExpression handling in EquivalentExpressions
peter-toth Dec 12, 2022
1f4c8e4
[SPARK-40775][SQL] Fix duplicate description entries for V2 file scans
Kimahriman Dec 12, 2022
7801666
[SPARK-41448] Make consistent MR job IDs in FileBatchWriter and FileF…
Dec 12, 2022
e43b4be
[SPARK-41378][SQL][FOLLOWUP] DS V2 ColStats follow up
huaxingao Dec 12, 2022
435b57b
[SPARK-41412][CONNECT][FOLLOW-UP] Fix test_cast to pass with ANSI mod…
HyukjinKwon Dec 12, 2022
7cb8288
[SPARK-41491][SQL][TESTS] Update postgres docker image to 15.1
williamhyun Dec 12, 2022
c3f46d5
[SPARK-41360][CORE] Avoid BlockManager re-registration if the executo…
Ngone51 Dec 12, 2022
6034e2c
[SPARK-41378][SQL][FOLLOWUP] Use toAttributeMap before comparison
dongjoon-hyun Dec 12, 2022
d2212e3
[SPARK-41495][CONNECT][PYTHON] Implement `collection` functions: P~Z
zhengruifeng Dec 13, 2022
c16c9b6
[SPARK-41502][K8S][TESTS] Upgrade the minimum Minikube version to 1.28.0
dongjoon-hyun Dec 13, 2022
3275555
[SPARK-41504][K8S][R][TESTS] Update R version to 4.1.2 in Dockerfile …
dongjoon-hyun Dec 13, 2022
f590f87
[SPARK-41461][BUILD][CORE][CONNECT][PROTOBUF] Unify the environment v…
Dec 13, 2022
5720e82
[SPARK-41499][BUILD] Upgrade Protobuf version to 3.21.11
gengliangwang Dec 13, 2022
af8dd41
[SPARK-33782][K8S][CORE] Place spark.files, spark.jars and spark.file…
pralabhkumar Dec 13, 2022
9b69331
[SPARK-41481][CORE][SQL] Reuse `INVALID_TYPED_LITERAL` instead of `_L…
LuciferYang Dec 13, 2022
27f4d1e
[SPARK-41468][SQL][FOLLOWUP] Handle NamedLambdaVariables in Equivalen…
peter-toth Dec 13, 2022
0e2d604
[SPARK-41406][SQL] Refactor error message for `NUM_COLUMNS_MISMATCH` …
panbingkun Dec 13, 2022
3809ccd
[SPARK-41478][SQL] Assign a name to the error class _LEGACY_ERROR_TEM…
panbingkun Dec 13, 2022
e857b7a
[SPARK-39601][YARN] AllocationFailure should not be treated as exitCa…
pan3793 Dec 13, 2022
a2ceff2
[SPARK-41360][CORE][BUILD][FOLLOW-UP] Exclude BlockManagerMessages.Re…
HyukjinKwon Dec 13, 2022
7e9b88b
[SPARK-27561][SQL] Support implicit lateral column alias resolution o…
anchovYu Dec 13, 2022
d00771f
[SPARK-39601][YARN][FOLLOWUP] YarnClusterSchedulerBackend should call…
pan3793 Dec 13, 2022
e2474f6
[SPARK-41482][BUILD] Upgrade dropwizard metrics 4.2.13
LuciferYang Dec 13, 2022
e29ada0
[SPARK-41062][SQL] Rename `UNSUPPORTED_CORRELATED_REFERENCE` to `CORR…
itholic Dec 13, 2022
a75bc84
[SPARK-41412][CONNECT][TESTS][FOLLOW-UP] Exclude binary casting to ma…
HyukjinKwon Dec 13, 2022
cdc73ad
[SPARK-41506][CONNECT][PYTHON] Refactor LiteralExpression to support …
zhengruifeng Dec 14, 2022
ea53dc8
[SPARK-41506][CONNECT][TESTS][FOLLOW-UP] Import BinaryType in pyspark…
HyukjinKwon Dec 14, 2022
1b3a444
[SPARK-27561][SQL][FOLLOWUP] Move the two rules for Later column alia…
gengliangwang Dec 14, 2022
4e8980e
[SPARK-41409][CORE][SQL] Rename `_LEGACY_ERROR_TEMP_1043` to `WRONG_N…
LuciferYang Dec 14, 2022
5b50834
[SPARK-41248][SQL] Add "spark.sql.json.enablePartialResults" to enabl…
sadikovi Dec 14, 2022
0fd1f85
[SPARK-41514][K8S][DOCS] Add `PVC-oriented executor pod allocation` d…
dongjoon-hyun Dec 14, 2022
1 change: 1 addition & 0 deletions appveyor.yml
@@ -28,6 +28,7 @@ only_commits:
files:
- appveyor.yml
- dev/appveyor-install-dependencies.ps1
- build/spark-build-info.ps1
- R/
- sql/core/src/main/scala/org/apache/spark/sql/api/r/
- core/src/main/scala/org/apache/spark/api/r/
46 changes: 46 additions & 0 deletions build/spark-build-info.ps1
@@ -0,0 +1,46 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This script generates the build info for Spark and places it into the spark-version-info.properties file.
# Arguments:
# ResourceDir - The target directory where the properties file will be created. [./core/target/extra-resources]
# SparkVersion - The current version of Spark
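# Example invocation (illustrative only; the directory and version below are placeholders):
#   .\build\spark-build-info.ps1 core\target\extra-resources 3.4.0-SNAPSHOT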

param(
# The resource directory.
[Parameter(Position = 0)]
[String]
$ResourceDir,

# The Spark version.
[Parameter(Position = 1)]
[String]
$SparkVersion
)

$null = New-Item -Type Directory -Force $ResourceDir
$SparkBuildInfoPath = $ResourceDir.TrimEnd('\').TrimEnd('/') + '\spark-version-info.properties'

$SparkBuildInfoContent =
"version=$SparkVersion
user=$($Env:USERNAME)
revision=$(git rev-parse HEAD)
branch=$(git rev-parse --abbrev-ref HEAD)
date=$([DateTime]::UtcNow | Get-Date -UFormat +%Y-%m-%dT%H:%M:%SZ)
url=$(git config --get remote.origin.url)"

Set-Content -Path $SparkBuildInfoPath -Value $SparkBuildInfoContent
5 changes: 5 additions & 0 deletions common/network-yarn/pom.xml
@@ -136,6 +136,11 @@
<goals>
<goal>shade</goal>
</goals>
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer"/>
</transformers>
</configuration>
</execution>
</executions>
</plugin>
@@ -70,10 +70,6 @@ case class AvroScan(

override def hashCode(): Int = super.hashCode()

override def description(): String = {
super.description() + ", PushedFilters: " + pushedFilters.mkString("[", ", ", "]")
}

override def getMetaData(): Map[String, String] = {
super.getMetaData() ++ Map("PushedFilters" -> seqToString(pushedFilters))
}
6 changes: 3 additions & 3 deletions connector/connect/README.md
@@ -32,15 +32,15 @@ for example, compiling `connect` module on CentOS 6 or CentOS 7 which the defaul
specifying the user-defined `protoc` and `protoc-gen-grpc-java` binary files as follows:

```bash
export CONNECT_PROTOC_EXEC_PATH=/path-to-protoc-exe
export SPARK_PROTOC_EXEC_PATH=/path-to-protoc-exe
export CONNECT_PLUGIN_EXEC_PATH=/path-to-protoc-gen-grpc-java-exe
./build/mvn -Phive -Puser-defined-protoc clean package
```

or

```bash
export CONNECT_PROTOC_EXEC_PATH=/path-to-protoc-exe
export SPARK_PROTOC_EXEC_PATH=/path-to-protoc-exe
export CONNECT_PLUGIN_EXEC_PATH=/path-to-protoc-gen-grpc-java-exe
./build/sbt -Puser-defined-protoc clean package
```
@@ -82,7 +82,7 @@ To use the release version of Spark Connect:

```bash
# Run all Spark Connect Python tests as a module.
./python/run-tests --module pyspark-connect
./python/run-tests --module pyspark-connect --parallelism 1
```


4 changes: 2 additions & 2 deletions connector/connect/common/pom.xml
@@ -193,7 +193,7 @@
<profile>
<id>user-defined-protoc</id>
<properties>
<connect.protoc.executable.path>${env.CONNECT_PROTOC_EXEC_PATH}</connect.protoc.executable.path>
<spark.protoc.executable.path>${env.SPARK_PROTOC_EXEC_PATH}</spark.protoc.executable.path>
<connect.plugin.executable.path>${env.CONNECT_PLUGIN_EXEC_PATH}</connect.plugin.executable.path>
</properties>
<build>
@@ -203,7 +203,7 @@
<artifactId>protobuf-maven-plugin</artifactId>
<version>0.6.1</version>
<configuration>
<protocExecutable>${connect.protoc.executable.path}</protocExecutable>
<protocExecutable>${spark.protoc.executable.path}</protocExecutable>
<pluginId>grpc-java</pluginId>
<pluginExecutable>${connect.plugin.executable.path}</pluginExecutable>
<protoSourceRoot>src/main/protobuf</protoSourceRoot>
@@ -77,9 +77,7 @@ message Expression {
int32 year_month_interval = 20;
int64 day_time_interval = 21;

Array array = 22;
Struct struct = 23;
Map map = 24;
DataType typed_null = 22;
}

// whether the literal type should be treated as a nullable type. Applies to
@@ -107,25 +105,6 @@ message Expression {
int32 days = 2;
int64 microseconds = 3;
}

message Struct {
// A possibly heterogeneously typed list of literals
repeated Literal fields = 1;
}

message Array {
// A homogeneously typed list of literals
repeated Literal values = 1;
}

message Map {
repeated Pair pairs = 1;

message Pair {
Literal key = 1;
Literal value = 2;
}
}
}

// An unresolved attribute that is not explicitly bound to a specific column, but the column
@@ -20,6 +20,7 @@ syntax = 'proto3';
package spark.connect;

import "spark/connect/expressions.proto";
import "spark/connect/types.proto";

option java_multiple_files = true;
option java_package = "org.apache.spark.connect.proto";
@@ -54,6 +55,7 @@ message Relation {
Tail tail = 22;
WithColumns with_columns = 23;
Hint hint = 24;
Unpivot unpivot = 25;

// NA functions
NAFill fill_na = 90;
@@ -304,6 +306,17 @@ message LocalRelation {
// Local collection data serialized into Arrow IPC streaming format which contains
// the schema of the data.
bytes data = 1;

// (Optional) The user-provided schema.
//
// The server side will update the column names and data types according to this schema.
oneof schema {

DataType datatype = 2;

// The server will use the Catalyst parser to parse this string into a DataType.
string datatype_str = 3;
}
}

// Relation of type [[Sample]] that samples a fraction of the dataset.
@@ -570,3 +583,21 @@ message Hint {
// (Optional) Hint parameters.
repeated Expression.Literal parameters = 3;
}

// Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns set.
message Unpivot {
// (Required) The input relation.
Relation input = 1;

// (Required) Id columns.
repeated Expression ids = 2;

// (Optional) Value columns to unpivot.
repeated Expression values = 3;

// (Required) Name of the variable column.
string variable_column_name = 4;

// (Required) Name of the value column.
string value_column_name = 5;
}
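The `Unpivot` message above carries the same fields as the server-side `Dataset.unpivot`/`melt` API it is expected to map to. A minimal sketch of the wide-to-long transformation it encodes, with made-up column names and data (not part of this diff):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// Wide format: one row per id, one column per metric.
val wide = Seq((1, 10.0, 20.0), (2, 30.0, 40.0)).toDF("id", "m1", "m2")

// Long format: one row per (id, metric) pair. The arguments line up with the
// proto fields: ids, values, variable_column_name, value_column_name.
val longDf = wide.unpivot(
  ids = Array(col("id")),
  values = Array(col("m1"), col("m2")),
  variableColumnName = "metric",
  valueColumnName = "value")
// longDf columns: id, metric, value
```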
67 changes: 67 additions & 0 deletions connector/connect/server/pom.xml
@@ -55,6 +55,12 @@
<groupId>org.apache.spark</groupId>
<artifactId>spark-connect-common_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
@@ -106,19 +112,80 @@
<artifactId>spark-tags_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- #if scala-2.13 --><!--
<dependency>
<groupId>org.scala-lang.modules</groupId>
<artifactId>scala-parallel-collections_${scala.binary.version}</artifactId>
</dependency>
--><!-- #endif scala-2.13 -->
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>failureaccess</artifactId>
<version>${guava.failureaccess.version}</version>
</dependency>
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>${protobuf.version}</version>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-netty</artifactId>
<version>${io.grpc.version}</version>
</dependency>
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-protobuf</artifactId>
<version>${io.grpc.version}</version>
</dependency>
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-services</artifactId>
<version>${io.grpc.version}</version>
</dependency>
<dependency>
<groupId>io.grpc</groupId>
<artifactId>grpc-stub</artifactId>
<version>${io.grpc.version}</version>
</dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-codec-http2</artifactId>
<version>${netty.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-handler-proxy</artifactId>
<version>${netty.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>io.netty</groupId>
<artifactId>netty-transport-native-unix-common</artifactId>
<version>${netty.version}</version>
<scope>provided</scope>
</dependency>
<dependency> <!-- necessary for Java 9+ -->
<groupId>org.apache.tomcat</groupId>
<artifactId>annotations-api</artifactId>
<version>${tomcat.annotations.api.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.scalacheck</groupId>
<artifactId>scalacheck_${scala.binary.version}</artifactId>
@@ -38,9 +38,10 @@ private[spark] object Connect {

val CONNECT_GRPC_ARROW_MAX_BATCH_SIZE =
ConfigBuilder("spark.connect.grpc.arrow.maxBatchSize")
.doc("When using Apache Arrow, limit the maximum size of one arrow batch that " +
"can be sent from server side to client side. Currently, we conservatively use 70% " +
"of it because the size is not accurate but estimated.")
.doc(
"When using Apache Arrow, limit the maximum size of one arrow batch that " +
"can be sent from server side to client side. Currently, we conservatively use 70% " +
"of it because the size is not accurate but estimated.")
.version("3.4.0")
.bytesConf(ByteUnit.MiB)
.createWithDefaultString("4m")
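The batch-size limit documented above is a regular Spark conf, so it can be overridden at startup. A hypothetical override in Scala (the value `16m` is arbitrary; the default remains `4m`):

```scala
import org.apache.spark.SparkConf

// Illustrative only: raise the cap on a single Arrow batch sent to Connect clients.
val conf = new SparkConf()
  .set("spark.connect.grpc.arrow.maxBatchSize", "16m")
```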
@@ -719,6 +719,53 @@ package object dsl {
.build()
}

def unpivot(
ids: Seq[Expression],
values: Seq[Expression],
variableColumnName: String,
valueColumnName: String): Relation = {
Relation
.newBuilder()
.setUnpivot(
Unpivot
.newBuilder()
.setInput(logicalPlan)
.addAllIds(ids.asJava)
.addAllValues(values.asJava)
.setVariableColumnName(variableColumnName)
.setValueColumnName(valueColumnName))
.build()
}

def unpivot(
ids: Seq[Expression],
variableColumnName: String,
valueColumnName: String): Relation = {
Relation
.newBuilder()
.setUnpivot(
Unpivot
.newBuilder()
.setInput(logicalPlan)
.addAllIds(ids.asJava)
.setVariableColumnName(variableColumnName)
.setValueColumnName(valueColumnName))
.build()
}

def melt(
ids: Seq[Expression],
values: Seq[Expression],
variableColumnName: String,
valueColumnName: String): Relation =
unpivot(ids, values, variableColumnName, valueColumnName)

def melt(
ids: Seq[Expression],
variableColumnName: String,
valueColumnName: String): Relation =
unpivot(ids, variableColumnName, valueColumnName)

private def createSetOperation(
left: Relation,
right: Relation,
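The new `unpivot`/`melt` helpers in the test DSL build an `Unpivot` relation through the generated protobuf builders. A minimal sketch of the equivalent construction outside the DSL, where `input`, `idExpr` and `valueExpr` are assumed to already exist in the caller's scope:

```scala
import org.apache.spark.connect.proto.{Expression, Relation, Unpivot}

// Builds the same Unpivot relation the dsl `unpivot(ids, values, ...)` helper emits.
def unpivotRelation(
    input: Relation,
    idExpr: Expression,
    valueExpr: Expression): Relation =
  Relation
    .newBuilder()
    .setUnpivot(
      Unpivot
        .newBuilder()
        .setInput(input)
        .addIds(idExpr)
        .addValues(valueExpr)
        .setVariableColumnName("variable")
        .setValueColumnName("value"))
    .build()
```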