[HUDI-5579] Fixing Kryo registration to be properly wired into Spark sessions #7702
Changes from all commits
Changes to org.apache.spark.HoodieSparkKryoRegistrar (Scala):

@@ -19,11 +19,12 @@
 package org.apache.spark

+import com.esotericsoftware.kryo.Kryo
+import com.esotericsoftware.kryo.serializers.JavaSerializer
 import org.apache.hudi.client.model.HoodieInternalRow
-import org.apache.hudi.common.model.{HoodieKey, HoodieSparkRecord}
-import org.apache.hudi.common.util.HoodieCommonKryoProvider
+import org.apache.hudi.common.config.SerializableConfiguration
+import org.apache.hudi.common.model.HoodieSparkRecord
+import org.apache.hudi.common.util.HoodieCommonKryoRegistrar
 import org.apache.hudi.config.HoodieWriteConfig
-import org.apache.spark.internal.config.ConfigBuilder
 import org.apache.spark.serializer.KryoRegistrator

 /**
@@ -42,22 +43,31 @@ import org.apache.spark.serializer.KryoRegistrator
  * or renamed (w/o correspondingly updating such usages)</li>
  * </ol>
  */
-class HoodieSparkKryoProvider extends HoodieCommonKryoProvider {
-  override def registerClasses(): Array[Class[_]] = {
+class HoodieSparkKryoRegistrar extends HoodieCommonKryoRegistrar with KryoRegistrator {
+  override def registerClasses(kryo: Kryo): Unit = {
     ///////////////////////////////////////////////////////////////////////////
     // NOTE: DO NOT REORDER REGISTRATIONS
     ///////////////////////////////////////////////////////////////////////////
-    val classes = super[HoodieCommonKryoProvider].registerClasses()
-    classes ++ Array(
-      classOf[HoodieWriteConfig],
-      classOf[HoodieSparkRecord],
-      classOf[HoodieInternalRow]
-    )
+    super[HoodieCommonKryoRegistrar].registerClasses(kryo)
+
+    kryo.register(classOf[HoodieWriteConfig])
+
+    kryo.register(classOf[HoodieSparkRecord])
+    kryo.register(classOf[HoodieInternalRow])
+
+    // NOTE: Hadoop's configuration is not a serializable object by itself, and hence
+    // we're relying on [[SerializableConfiguration]] wrapper to work it around
+    kryo.register(classOf[SerializableConfiguration], new JavaSerializer())
   }
 }

-object HoodieSparkKryoProvider {
+object HoodieSparkKryoRegistrar {

+  // NOTE: We're copying definition of the config introduced in Spark 3.0
+  // (to stay compatible w/ Spark 2.4)
+  private val KRYO_USER_REGISTRATORS = "spark.kryo.registrator"
+
   def register(conf: SparkConf): SparkConf = {
-    conf.registerKryoClasses(new HoodieSparkKryoProvider().registerClasses())
+    conf.set(KRYO_USER_REGISTRATORS, Seq(classOf[HoodieSparkKryoRegistrar].getName).mkString(","))
   }
 }

Review thread on the conf.set(KRYO_USER_REGISTRATORS, ...) line:

Contributor: Does …

Author: We need to convert it to a string, so I kept it generic so that we can drop in one more class. Not strictly necessary, though.
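To make the wiring concrete, here is a minimal sketch of how an application could plug the registrar into its own Spark setup, assuming the HoodieSparkKryoRegistrar object and its register(conf) helper exactly as introduced in the diff above; the surrounding application and session scaffolding is illustrative and not part of this PR.

```scala
import org.apache.spark.{HoodieSparkKryoRegistrar, SparkConf}
import org.apache.spark.sql.SparkSession

object HudiKryoWiringSketch {
  def main(args: Array[String]): Unit = {
    // Kryo has to be the active serializer, otherwise the registrator is never consulted.
    val conf = new SparkConf()
      .setAppName("hudi-kryo-wiring-sketch")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    // Sets spark.kryo.registrator to org.apache.spark.HoodieSparkKryoRegistrar,
    // so every executor's Kryo instance picks up Hudi's registrations.
    HoodieSparkKryoRegistrar.register(conf)

    val spark = SparkSession.builder().config(conf).getOrCreate()
    // ... run Hudi reads/writes here ...
    spark.stop()
  }
}
```

Calling register(conf) before the session is created matters, since serializer-related settings are read when the context and executors start up.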
Changes to org.apache.hudi.testutils.HoodieClientTestUtils (Java):

@@ -18,6 +18,7 @@
 package org.apache.hudi.testutils;

+import org.apache.hudi.HoodieSparkUtils;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.client.SparkRDDReadClient;
 import org.apache.hudi.common.engine.HoodieEngineContext;

@@ -92,10 +93,18 @@ public class HoodieClientTestUtils {
    */
   public static SparkConf getSparkConfForTest(String appName) {
     SparkConf sparkConf = new SparkConf().setAppName(appName)
-        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[4]")
+        .setMaster("local[4]")
+        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+        .set("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar")
+        .set("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
         .set("spark.sql.shuffle.partitions", "4")
         .set("spark.default.parallelism", "4");

+    if (HoodieSparkUtils.gteqSpark3_2()) {
+      sparkConf.set("spark.sql.catalog.spark_catalog",
+          "org.apache.spark.sql.hudi.catalog.HoodieCatalog");
+    }
+
     String evlogDir = System.getProperty("SPARK_EVLOG_DIR");
     if (evlogDir != null) {
       sparkConf.set("spark.eventLog.enabled", "true");

Review thread on the getSparkConfForTest configuration:

Contributor: Can we also move these common options into a tool method?

Author: This is exactly the method you're referring to (used in tests).
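For comparison, here is a rough Scala rendition of what the updated getSparkConfForTest wires together for a local test session; the config keys and the gteqSpark3_2 gate are taken from the diff above, while the surrounding object and method are only a sketch.

```scala
import org.apache.hudi.HoodieSparkUtils
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object LocalHudiTestSessionSketch {
  def buildLocalTestSession(appName: String): SparkSession = {
    val conf = new SparkConf()
      .setAppName(appName)
      .setMaster("local[4]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryo.registrator", "org.apache.spark.HoodieSparkKryoRegistrar")
      .set("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
      .set("spark.sql.shuffle.partitions", "4")
      .set("spark.default.parallelism", "4")

    // The HoodieCatalog is only wired in for Spark 3.2 and above.
    if (HoodieSparkUtils.gteqSpark3_2()) {
      conf.set("spark.sql.catalog.spark_catalog",
        "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    }

    SparkSession.builder().config(conf).getOrCreate()
  }
}
```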
Changes to org.apache.hudi.common.util.HoodieCommonKryoRegistrar (Java):

@@ -18,6 +18,7 @@
 package org.apache.hudi.common.util;

+import com.esotericsoftware.kryo.Kryo;
 import org.apache.hudi.common.HoodieJsonPayload;
 import org.apache.hudi.common.model.AWSDmsAvroPayload;
 import org.apache.hudi.common.model.DefaultHoodieRecordPayload;

@@ -36,6 +37,8 @@
 import org.apache.hudi.common.model.debezium.PostgresDebeziumAvroPayload;
 import org.apache.hudi.metadata.HoodieMetadataPayload;

+import java.util.Arrays;
+
 /**
  * NOTE: PLEASE READ CAREFULLY BEFORE CHANGING
  *

@@ -52,14 +55,13 @@
  * or renamed (w/o correspondingly updating such usages)</li>
  * </ol>
  */
-public class HoodieCommonKryoProvider {
+public class HoodieCommonKryoRegistrar {

-  public Class<?>[] registerClasses() {
+  public void registerClasses(Kryo kryo) {
     ///////////////////////////////////////////////////////////////////////////
     // NOTE: DO NOT REORDER REGISTRATIONS
     ///////////////////////////////////////////////////////////////////////////

-    return new Class<?>[] {
+    Arrays.stream(new Class<?>[] {
         HoodieAvroRecord.class,
         HoodieAvroIndexedRecord.class,
         HoodieEmptyRecord.class,

@@ -81,7 +83,8 @@ public Class<?>[] registerClasses() {

         HoodieRecordLocation.class,
         HoodieRecordGlobalLocation.class
-    };
+    })
+        .forEachOrdered(kryo::register);
   }

 }

Review thread on the Arrays.stream(...).forEachOrdered(kryo::register) change:

Contributor: A stateless function (one that does not have any side effects) is always a better choice, especially for a tool method; personally, I prefer the old way we handled this.

Author: Agree in principle, but here we're actually aligning it with the interface of Spark's KryoRegistrator.
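As a side note on the "DO NOT REORDER REGISTRATIONS" warning that both registrars carry: Kryo assigns registration ids in the order register(...) is called, and those ids are what end up in the serialized bytes, so reordering silently changes the wire format. Below is a tiny standalone illustration of that behavior (not Hudi code, just the Kryo library; class names are made up for the example).

```scala
import com.esotericsoftware.kryo.Kryo

// Standalone illustration: registration order determines the numeric ids
// that Kryo writes into serialized data.
object RegistrationOrderSketch extends App {
  final case class A(x: Int)
  final case class B(y: String)

  val kryo = new Kryo()
  val idA = kryo.register(classOf[A]).getId
  val idB = kryo.register(classOf[B]).getId

  // Swapping the two register(...) calls would swap these ids, so bytes
  // written under one ordering could not be read back under the other.
  println(s"A -> $idA, B -> $idB")
}
```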
Review thread on the KRYO_USER_REGISTRATORS constant:

Contributor: Guess we can make it public so that there is no need to hard-code the option key spark.kryo.registrator everywhere.

Author: We actually won't be able to use it everywhere, so I'd rather stick with the Spark option for consistency (which is the way we handle every other option as well).
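On the earlier point about keeping the value generic: the Seq(...).mkString(",") in register(conf) reflects that Spark treats spark.kryo.registrator as a comma-separated list of registrator class names, which is what leaves room to "drop in one more class" later. If an application already sets its own registrator, it could in principle append Hudi's instead of overwriting it; the helper below is purely hypothetical and not part of this PR.

```scala
import org.apache.spark.SparkConf

// Hypothetical helper (not part of this PR): append a registrator to whatever
// is already configured under spark.kryo.registrator instead of replacing it.
object KryoRegistratorConfSketch {
  private val KRYO_USER_REGISTRATORS = "spark.kryo.registrator"

  def appendRegistrator(conf: SparkConf, registratorClassName: String): SparkConf = {
    val existing = conf.getOption(KRYO_USER_REGISTRATORS).toSeq
    // Spark parses the value as a comma-separated list of class names.
    conf.set(KRYO_USER_REGISTRATORS, (existing :+ registratorClassName).mkString(","))
  }
}

// Usage sketch:
//   KryoRegistratorConfSketch.appendRegistrator(conf, "org.apache.spark.HoodieSparkKryoRegistrar")
```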