From 9981c3185a4733ff83f79d7b68960eda7853f605 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Sun, 22 May 2022 21:41:49 +0800 Subject: [PATCH 1/3] Upgrade parquet to 1.12.3 --- dev/deps/spark-deps-hadoop-2-hive-2.3 | 12 ++++++------ dev/deps/spark-deps-hadoop-3-hive-2.3 | 12 ++++++------ docs/sql-data-sources-parquet.md | 4 ++-- pom.xml | 15 +++++++++++++-- project/SparkBuild.scala | 3 +++ 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/dev/deps/spark-deps-hadoop-2-hive-2.3 b/dev/deps/spark-deps-hadoop-2-hive-2.3 index 14da1045ef73..a42371585c9b 100644 --- a/dev/deps/spark-deps-hadoop-2-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-2-hive-2.3 @@ -225,12 +225,12 @@ orc-shims/1.7.4//orc-shims-1.7.4.jar oro/2.0.8//oro-2.0.8.jar osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar paranamer/2.8//paranamer-2.8.jar -parquet-column/1.12.2//parquet-column-1.12.2.jar -parquet-common/1.12.2//parquet-common-1.12.2.jar -parquet-encoding/1.12.2//parquet-encoding-1.12.2.jar -parquet-format-structures/1.12.2//parquet-format-structures-1.12.2.jar -parquet-hadoop/1.12.2//parquet-hadoop-1.12.2.jar -parquet-jackson/1.12.2//parquet-jackson-1.12.2.jar +parquet-column/1.12.3//parquet-column-1.12.3.jar +parquet-common/1.12.3//parquet-common-1.12.3.jar +parquet-encoding/1.12.3//parquet-encoding-1.12.3.jar +parquet-format-structures/1.12.3//parquet-format-structures-1.12.3.jar +parquet-hadoop/1.12.3//parquet-hadoop-1.12.3.jar +parquet-jackson/1.12.3//parquet-jackson-1.12.3.jar pickle/1.2//pickle-1.2.jar protobuf-java/2.5.0//protobuf-java-2.5.0.jar py4j/0.10.9.5//py4j-0.10.9.5.jar diff --git a/dev/deps/spark-deps-hadoop-3-hive-2.3 b/dev/deps/spark-deps-hadoop-3-hive-2.3 index 525c01307a42..cd37e50ce6f7 100644 --- a/dev/deps/spark-deps-hadoop-3-hive-2.3 +++ b/dev/deps/spark-deps-hadoop-3-hive-2.3 @@ -214,12 +214,12 @@ orc-shims/1.7.4//orc-shims-1.7.4.jar oro/2.0.8//oro-2.0.8.jar osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar paranamer/2.8//paranamer-2.8.jar 
-parquet-column/1.12.2//parquet-column-1.12.2.jar -parquet-common/1.12.2//parquet-common-1.12.2.jar -parquet-encoding/1.12.2//parquet-encoding-1.12.2.jar -parquet-format-structures/1.12.2//parquet-format-structures-1.12.2.jar -parquet-hadoop/1.12.2//parquet-hadoop-1.12.2.jar -parquet-jackson/1.12.2//parquet-jackson-1.12.2.jar +parquet-column/1.12.3//parquet-column-1.12.3.jar +parquet-common/1.12.3//parquet-common-1.12.3.jar +parquet-encoding/1.12.3//parquet-encoding-1.12.3.jar +parquet-format-structures/1.12.3//parquet-format-structures-1.12.3.jar +parquet-hadoop/1.12.3//parquet-hadoop-1.12.3.jar +parquet-jackson/1.12.3//parquet-jackson-1.12.3.jar pickle/1.2//pickle-1.2.jar protobuf-java/2.5.0//protobuf-java-2.5.0.jar py4j/0.10.9.5//py4j-0.10.9.5.jar diff --git a/docs/sql-data-sources-parquet.md b/docs/sql-data-sources-parquet.md index 0c3b5cc61561..2189892c9283 100644 --- a/docs/sql-data-sources-parquet.md +++ b/docs/sql-data-sources-parquet.md @@ -257,7 +257,7 @@ REFRESH TABLE my_table; Since Spark 3.2, columnar encryption is supported for Parquet tables with Apache Parquet 1.12+. -Parquet uses the envelope encryption practice, where file parts are encrypted with "data encryption keys" (DEKs), and the DEKs are encrypted with "master encryption keys" (MEKs). The DEKs are randomly generated by Parquet for each encrypted file/column. The MEKs are generated, stored and managed in a Key Management Service (KMS) of user’s choice. The Parquet Maven [repository](https://repo1.maven.org/maven2/org/apache/parquet/parquet-hadoop/1.12.0/) has a jar with a mock KMS implementation that allows to run column encryption and decryption using a spark-shell only, without deploying a KMS server (download the `parquet-hadoop-tests.jar` file and place it in the Spark `jars` folder): +Parquet uses the envelope encryption practice, where file parts are encrypted with "data encryption keys" (DEKs), and the DEKs are encrypted with "master encryption keys" (MEKs). 
The DEKs are randomly generated by Parquet for each encrypted file/column. The MEKs are generated, stored and managed in a Key Management Service (KMS) of the user’s choice. The Parquet Maven [repository](https://repo1.maven.org/maven2/org/apache/parquet/parquet-hadoop/1.12.3/) has a jar with a mock KMS implementation that allows running column encryption and decryption using only a spark-shell, without deploying a KMS server (download the `parquet-hadoop-tests.jar` file and place it in the Spark `jars` folder):
@@ -349,7 +349,7 @@ df2 = spark.read.parquet("/path/to/table.parquet.encrypted") #### KMS Client -The InMemoryKMS class is provided only for illustration and simple demonstration of Parquet encryption functionality. **It should not be used in a real deployment**. The master encryption keys must be kept and managed in a production-grade KMS system, deployed in user's organization. Rollout of Spark with Parquet encryption requires implementation of a client class for the KMS server. Parquet provides a plug-in [interface](https://github.com/apache/parquet-mr/blob/apache-parquet-1.12.0/parquet-hadoop/src/main/java/org/apache/parquet/crypto/keytools/KmsClient.java) for development of such classes, +The InMemoryKMS class is provided only for illustration and simple demonstration of Parquet encryption functionality. **It should not be used in a real deployment**. The master encryption keys must be kept and managed in a production-grade KMS system, deployed in the user's organization. Rollout of Spark with Parquet encryption requires implementation of a client class for the KMS server. Parquet provides a plug-in [interface](https://github.com/apache/parquet-mr/blob/apache-parquet-1.12.3/parquet-hadoop/src/main/java/org/apache/parquet/crypto/keytools/KmsClient.java) for development of such classes,
{% highlight java %} diff --git a/pom.xml b/pom.xml index d5618c8f1034..1efe2b10bca7 100644 --- a/pom.xml +++ b/pom.xml @@ -131,7 +131,7 @@ 3.2.0 10.14.2.0 - 1.12.2 + 1.12.3 1.7.4 9.4.46.v20220331 4.0.3 @@ -341,6 +341,17 @@ false + + staged + hive-staged-releases + https://repository.apache.org/content/repositories/staging/ + + true + + + true + + @@ -2357,7 +2368,7 @@ ${hive.group} hive-service-rpc - + org.apache.parquet parquet-hadoop-bundle diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 3309b72a5578..036a30b4d0e0 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -285,6 +285,9 @@ object SparkBuild extends PomBuild { DefaultMavenRepository, Resolver.mavenLocal, Resolver.file("ivyLocal", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns) + ) ++ Seq( + "hive-staged-releases-mirror" at "https://repository.apache.org/content/repositories/staging/", + Resolver.file("local", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns) ), externalResolvers := resolvers.value, otherResolvers := SbtPomKeys.mvnLocalRepository(dotM2 => Seq(Resolver.file("dotM2", dotM2))).value, From e4976c2d2d80690bafbdaf38950cd67cb09f3836 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Mon, 23 May 2022 06:50:59 +0800 Subject: [PATCH 2/3] Fix --- .../sql/execution/datasources/parquet/ParquetFilterSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala index be081dadb2fa..aca2d783e8ac 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala @@ -28,7 +28,7 @@ import scala.reflect.runtime.universe.TypeTag import 
org.apache.hadoop.fs.Path import org.apache.parquet.filter2.predicate.{FilterApi, FilterPredicate, Operators} import org.apache.parquet.filter2.predicate.FilterApi._ -import org.apache.parquet.filter2.predicate.Operators.{Column => _, _} +import org.apache.parquet.filter2.predicate.Operators.{Column => _, Eq, Gt, GtEq, Lt, LtEq, NotEq, UserDefinedByInstance} import org.apache.parquet.hadoop.{ParquetFileReader, ParquetInputFormat, ParquetOutputFormat} import org.apache.parquet.hadoop.util.HadoopInputFile import org.apache.parquet.schema.MessageType From 6a7907a82fc2501064d3b9339c850d6b1adab439 Mon Sep 17 00:00:00 2001 From: Yuming Wang Date: Fri, 27 May 2022 07:05:43 +0800 Subject: [PATCH 3/3] Remove --- pom.xml | 11 ----------- project/SparkBuild.scala | 3 --- 2 files changed, 14 deletions(-) diff --git a/pom.xml b/pom.xml index 1efe2b10bca7..20ebbaa76370 100644 --- a/pom.xml +++ b/pom.xml @@ -341,17 +341,6 @@ false - - staged - hive-staged-releases - https://repository.apache.org/content/repositories/staging/ - - true - - - true - - diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala index 036a30b4d0e0..3309b72a5578 100644 --- a/project/SparkBuild.scala +++ b/project/SparkBuild.scala @@ -285,9 +285,6 @@ object SparkBuild extends PomBuild { DefaultMavenRepository, Resolver.mavenLocal, Resolver.file("ivyLocal", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns) - ) ++ Seq( - "hive-staged-releases-mirror" at "https://repository.apache.org/content/repositories/staging/", - Resolver.file("local", file(Path.userHome.absolutePath + "/.ivy2/local"))(Resolver.ivyStylePatterns) ), externalResolvers := resolvers.value, otherResolvers := SbtPomKeys.mvnLocalRepository(dotM2 => Seq(Resolver.file("dotM2", dotM2))).value,