[SPARK-41396][SQL][PROTOBUF] OneOf field support and recursion checks #38922
@@ -157,6 +157,8 @@ private[sql] class ProtobufDeserializer( | |
|
|
||
| case (null, NullType) => (updater, ordinal, _) => updater.setNullAt(ordinal) | ||
|
|
||
| case (MESSAGE, NullType) => (updater, ordinal, _) => updater.setNullAt(ordinal) | ||
|
|
||
| // TODO: we can avoid boxing if future version of Protobuf provide primitive accessors. | ||
| case (BOOLEAN, BooleanType) => | ||
| (updater, ordinal, value) => updater.setBoolean(ordinal, value.asInstanceOf[Boolean]) | ||
|
|
@@ -235,7 +237,7 @@ private[sql] class ProtobufDeserializer( | |
| writeRecord(new RowUpdater(row), value.asInstanceOf[DynamicMessage]) | ||
| updater.set(ordinal, row) | ||
|
|
||
| case (MESSAGE, ArrayType(st: StructType, containsNull)) => | ||
| case (MESSAGE, ArrayType(st: DataType, containsNull)) => | ||
|
||
| newArrayWriter(protoType, protoPath, catalystPath, st, containsNull) | ||
|
|
||
| case (ENUM, StringType) => | ||
|
|
||
@@ -38,6 +38,12 @@ private[sql] class ProtobufOptions( | |
|
|
||
| val parseMode: ParseMode = | ||
| parameters.get("mode").map(ParseMode.fromString).getOrElse(FailFastMode) | ||
|
|
||
| val circularReferenceType: String = parameters.getOrElse("circularReferenceType", "FIELD_NAME") | ||
|
||
|
|
||
| // User can choose a circularReferenceDepth of 0, 1, or 2. | ||
| // Going beyond 3 levels of recursion is not allowed. | ||
| val circularReferenceDepth: Int = parameters.getOrElse("circularReferenceDepth", "-1").toInt | ||
|
||
| } | ||
|
|
||
| private[sql] object ProtobufOptions { | ||
|
|
||
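As a usage sketch (mirroring the recursion test later in this PR; `df` and `descFilePath` are placeholders, and the option names are the ones defined above):

```scala
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.protobuf.functions

// Sketch only: `df` has a binary column "value" with serialized protobuf messages and
// `descFilePath` points to a compiled descriptor set.
val options = new java.util.HashMap[String, String]()
options.put("circularReferenceType", "FIELD_NAME") // or "FIELD_TYPE"
options.put("circularReferenceDepth", "1")         // the default "-1" keeps recursive fields disallowed

val parsed = df.select(
  functions.from_protobuf(col("value"), "OneOfEventWithRecursion", descFilePath, options)
    .as("sample"))
```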
@@ -40,19 +40,26 @@ object SchemaConverters { | |
| * | ||
| * @since 3.4.0 | ||
| */ | ||
| def toSqlType(descriptor: Descriptor): SchemaType = { | ||
| toSqlTypeHelper(descriptor) | ||
| def toSqlType( | ||
| descriptor: Descriptor, | ||
| protobufOptions: ProtobufOptions = ProtobufOptions(Map.empty)): SchemaType = { | ||
| toSqlTypeHelper(descriptor, protobufOptions) | ||
| } | ||
|
|
||
| def toSqlTypeHelper(descriptor: Descriptor): SchemaType = ScalaReflectionLock.synchronized { | ||
| def toSqlTypeHelper( | ||
| descriptor: Descriptor, | ||
| protobufOptions: ProtobufOptions): SchemaType = ScalaReflectionLock.synchronized { | ||
|
Contributor
Not related to this PR, but why would we lock here?
Contributor
Yeah, I just noticed. Not sure if we need it.
||
| SchemaType( | ||
| StructType(descriptor.getFields.asScala.flatMap(structFieldFor(_, Set.empty)).toArray), | ||
| StructType(descriptor.getFields.asScala.flatMap( | ||
| structFieldFor(_, Map.empty, Map.empty, protobufOptions: ProtobufOptions)).toArray), | ||
| nullable = true) | ||
| } | ||
|
|
||
| def structFieldFor( | ||
| fd: FieldDescriptor, | ||
| existingRecordNames: Set[String]): Option[StructField] = { | ||
| existingRecordNames: Map[String, Int], | ||
|
Contributor
Can we add comments to explain what the map key and value mean here?
+1
Contributor
Author
@cloud-fan Added a comment.
||
| existingRecordTypes: Map[String, Int], | ||
|
||
| protobufOptions: ProtobufOptions): Option[StructField] = { | ||
| import com.google.protobuf.Descriptors.FieldDescriptor.JavaType._ | ||
| val dataType = fd.getJavaType match { | ||
| case INT => Some(IntegerType) | ||
|
|
@@ -81,9 +88,19 @@ object SchemaConverters { | |
| fd.getMessageType.getFields.forEach { field => | ||
| field.getName match { | ||
| case "key" => | ||
| keyType = structFieldFor(field, existingRecordNames).get.dataType | ||
| keyType = | ||
| structFieldFor( | ||
| field, | ||
| existingRecordNames, | ||
| existingRecordTypes, | ||
| protobufOptions).get.dataType | ||
| case "value" => | ||
| valueType = structFieldFor(field, existingRecordNames).get.dataType | ||
| valueType = | ||
| structFieldFor( | ||
| field, | ||
| existingRecordNames, | ||
| existingRecordTypes, | ||
| protobufOptions).get.dataType | ||
| } | ||
| } | ||
| return Option( | ||
|
|
@@ -92,14 +109,40 @@ object SchemaConverters { | |
| MapType(keyType, valueType, valueContainsNull = false).defaultConcreteType, | ||
| nullable = false)) | ||
| case MESSAGE => | ||
| if (existingRecordNames.contains(fd.getFullName)) { | ||
| throw QueryCompilationErrors.foundRecursionInProtobufSchema(fd.toString()) | ||
| // Setting the circularReferenceDepth to 0 allows the field to be recursed once, setting | ||
| // it to 1 allows it to be recursed twice, and setting it to 2 allows it to be recursed | ||
| // thrice. A circularReferenceDepth value greater than 2 is not allowed. If not | ||
| // specified, it will default to -1, which disables recursive fields. | ||
| if (protobufOptions.circularReferenceType.equals("FIELD_TYPE")) { | ||
| if (existingRecordTypes.contains(fd.getType.name()) && | ||
| (protobufOptions.circularReferenceDepth < 0 || | ||
| protobufOptions.circularReferenceDepth >= 3)) { | ||
| throw QueryCompilationErrors.foundRecursionInProtobufSchema(fd.toString()) | ||
| } else if (existingRecordTypes.contains(fd.getType.name()) && | ||
|
||
| (existingRecordTypes.getOrElse(fd.getType.name(), 0) | ||
| <= protobufOptions.circularReferenceDepth)) { | ||
| return Some(StructField(fd.getName, NullType, nullable = false)) | ||
| } | ||
| } else { | ||
| if (existingRecordNames.contains(fd.getFullName) && | ||
| (protobufOptions.circularReferenceDepth < 0 || | ||
| protobufOptions.circularReferenceDepth >= 3)) { | ||
| throw QueryCompilationErrors.foundRecursionInProtobufSchema(fd.toString()) | ||
| } else if (existingRecordNames.contains(fd.getFullName) && | ||
| existingRecordNames.getOrElse(fd.getFullName, 0) | ||
| <= protobufOptions.circularReferenceDepth) { | ||
| return Some(StructField(fd.getName, NullType, nullable = false)) | ||
| } | ||
| } | ||
| val newRecordNames = existingRecordNames + fd.getFullName | ||
|
|
||
| val newRecordNames = existingRecordNames + | ||
| (fd.getFullName -> (existingRecordNames.getOrElse(fd.getFullName, 0) + 1)) | ||
| val newRecordTypes = existingRecordTypes + | ||
| (fd.getType.name() -> (existingRecordTypes.getOrElse(fd.getType.name(), 0) + 1)) | ||
|
|
||
| Option( | ||
| fd.getMessageType.getFields.asScala | ||
| .flatMap(structFieldFor(_, newRecordNames)) | ||
| .flatMap(structFieldFor(_, newRecordNames, newRecordTypes, protobufOptions)) | ||
| .toSeq) | ||
| .filter(_.nonEmpty) | ||
| .map(StructType.apply) | ||
|
|
||
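For illustration, a rough sketch of how the new `toSqlType` signature is exercised with a bounded recursion depth (these are `private[sql]` internals, not user-facing API; the import locations are assumptions, and `ProtobufOptions(Map(...))` is constructed the same way as the default parameter above):

```scala
import org.apache.spark.sql.protobuf.utils.{ProtobufOptions, ProtobufUtils, SchemaConverters}

// `testFileDesc` is a placeholder path to the compiled descriptor set used by the tests.
val descriptor = ProtobufUtils.buildDescriptor(testFileDesc, "OneOfEventWithRecursion")

// Per the comment above: depth 0 lets a field recurse once, 1 twice, 2 thrice; once the
// budget is exhausted the field is emitted as a NullType ("void") column instead of recursing.
val schemaType = SchemaConverters.toSqlType(
  descriptor,
  ProtobufOptions(Map("circularReferenceDepth" -> "1")))
```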
@@ -170,4 +170,40 @@ message timeStampMsg { | |
| message durationMsg { | ||
| string key = 1; | ||
| Duration duration = 2; | ||
| } | ||
| } | ||
|
|
||
| message OneOfEvent { | ||
|
Are you testing both OneOf and recursion in the same message? Could you split them into separate messages?
Contributor
Author
@rangadi I see a lot of use cases for a "payload" oneof field with recursive fields in it, so I thought combining OneOf with recursion would be a good test. Will separate them.
Combined one is fine, we could keep it. Better to have simpler separate tests as well.
Nice.
||
| string key = 1; | ||
| oneof payload { | ||
|
What do one-of fields look like in the Spark schema? Could you give an example? I could not see the schema in the unit tests.
Contributor
Author
@rangadi The oneof field here is of message type, and a oneof is converted to a struct type (see the schema sketch after the OneOfEvent message below).
||
| int32 col_1 = 2; | ||
| string col_2 = 3; | ||
| int64 col_3 = 4; | ||
| } | ||
| repeated string col_4 = 5; | ||
| } | ||
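To make the oneof-to-schema mapping above concrete: judging from the jsonSchema used in the OneOfEvent round-trip test below, the oneof members are flattened into the enclosing struct as ordinary nullable columns rather than nested under a `payload` struct. A hand-written sketch of that schema (reconstructed from the test, not generated output):

```scala
import org.apache.spark.sql.types._

val oneOfEventSchema = StructType(Seq(
  StructField("key", StringType, nullable = true),
  StructField("col_1", IntegerType, nullable = true), // oneof payload member
  StructField("col_2", StringType, nullable = true),  // oneof payload member
  StructField("col_3", LongType, nullable = true),    // oneof payload member
  StructField("col_4", ArrayType(StringType, containsNull = false), nullable = false)
))
```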
|
|
||
| message OneOfEventWithRecursion { | ||
| string key = 1; | ||
| oneof payload { | ||
| EventRecursiveA recursiveA = 3; | ||
| EventRecursiveB recursiveB = 6; | ||
| } | ||
| string value = 7; | ||
| } | ||
|
|
||
| message EventRecursiveA { | ||
| OneOfEventWithRecursion recursiveA = 1; | ||
| string key = 2; | ||
| } | ||
|
|
||
| message EventRecursiveB { | ||
| string key = 1; | ||
| string value = 2; | ||
| OneOfEventWithRecursion recursiveA = 3; | ||
| } | ||
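For the recursive messages above, the circularReferenceDepth option decides where the derived schema stops. Condensed by hand from the jsonSchema in the recursion test below, this is roughly the shape produced for OneOfEventWithRecursion with circularReferenceDepth = 1 (`void` marks a truncated recursive field; `(...)` elides repeated structure):

```
root
 |-- key: string
 |-- recursiveA: struct
 |    |-- recursiveA: struct
 |    |    |-- key: string
 |    |    |-- recursiveA: void        <-- recursion cut off here; deeper data is dropped
 |    |    |-- recursiveB: struct (...)
 |    |    |-- value: string
 |    |-- key: string
 |-- recursiveB: struct (...)
 |-- value: string
```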
|
|
||
| message Status { | ||
| int32 id = 1; | ||
| Timestamp trade_time = 2; | ||
| Status status = 3; | ||
| } | ||
@@ -26,11 +26,11 @@ import com.google.protobuf.{ByteString, DynamicMessage} | |
| import org.apache.spark.sql.{Column, QueryTest, Row} | ||
| import org.apache.spark.sql.AnalysisException | ||
| import org.apache.spark.sql.functions.{lit, struct} | ||
| import org.apache.spark.sql.protobuf.protos.SimpleMessageProtos.SimpleMessageRepeated | ||
| import org.apache.spark.sql.protobuf.protos.SimpleMessageProtos.{EventRecursiveA, EventRecursiveB, OneOfEvent, OneOfEventWithRecursion, SimpleMessageRepeated} | ||
|
||
| import org.apache.spark.sql.protobuf.protos.SimpleMessageProtos.SimpleMessageRepeated.NestedEnum | ||
| import org.apache.spark.sql.protobuf.utils.ProtobufUtils | ||
| import org.apache.spark.sql.test.SharedSparkSession | ||
| import org.apache.spark.sql.types.{DayTimeIntervalType, IntegerType, StringType, StructField, StructType, TimestampType} | ||
| import org.apache.spark.sql.types.{DataType, DayTimeIntervalType, IntegerType, StringType, StructField, StructType, TimestampType} | ||
|
|
||
| class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with ProtobufTestBase | ||
| with Serializable { | ||
|
|
@@ -417,7 +417,7 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Prot | |
| .show() | ||
| } | ||
| assert(e.getMessage.contains( | ||
| "Found recursive reference in Protobuf schema, which can not be processed by Spark:" | ||
| "Found recursive reference in Protobuf schema, which can not be processed by Spark" | ||
| )) | ||
| } | ||
| } | ||
|
|
@@ -453,7 +453,7 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Prot | |
| .show() | ||
| } | ||
| assert(e.getMessage.contains( | ||
| "Found recursive reference in Protobuf schema, which can not be processed by Spark:" | ||
| "Found recursive reference in Protobuf schema, which can not be processed by Spark" | ||
| )) | ||
| } | ||
| } | ||
|
|
@@ -693,4 +693,173 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Prot | |
| errorClass = "CANNOT_CONSTRUCT_PROTOBUF_DESCRIPTOR", | ||
| parameters = Map("descFilePath" -> testFileDescriptor)) | ||
| } | ||
|
|
||
| test("Verify OneOf field between from_protobuf -> to_protobuf and struct -> from_protobuf") { | ||
| val descriptor = ProtobufUtils.buildDescriptor(testFileDesc, "OneOfEvent") | ||
| val oneOfEvent = OneOfEvent.newBuilder() | ||
| .setKey("key") | ||
| .setCol1(123) | ||
| .setCol3(109202L) | ||
| .setCol2("col2value") | ||
| .addCol4("col4value").build() | ||
|
|
||
| val df = Seq(oneOfEvent.toByteArray).toDF("value") | ||
|
|
||
| checkWithFileAndClassName("OneOfEvent") { | ||
| case (name, descFilePathOpt) => | ||
| val fromProtoDf = df.select( | ||
| from_protobuf_wrapper($"value", name, descFilePathOpt) as 'sample) | ||
| val toDf = fromProtoDf.select( | ||
| to_protobuf_wrapper($"sample", name, descFilePathOpt) as 'toProto) | ||
| val toFromDf = toDf.select( | ||
| from_protobuf_wrapper($"toProto", name, descFilePathOpt) as 'fromToProto) | ||
| checkAnswer(fromProtoDf, toFromDf) | ||
| val actualFieldNames = fromProtoDf.select("sample.*").schema.fields.toSeq.map(f => f.name) | ||
| descriptor.getFields.asScala.map(f => { | ||
| assert(actualFieldNames.contains(f.getName)) | ||
| }) | ||
|
|
||
| val eventFromSpark = OneOfEvent.parseFrom( | ||
| toDf.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0)) | ||
| // OneOf field: the last set value(by order) will overwrite all previous ones. | ||
| assert(eventFromSpark.getCol2.equals("col2value")) | ||
| assert(eventFromSpark.getCol3 == 0) | ||
| val expectedFields = descriptor.getFields.asScala.map(f => f.getName) | ||
| eventFromSpark.getDescriptorForType.getFields.asScala.map(f => { | ||
| assert(expectedFields.contains(f.getName)) | ||
| }) | ||
|
|
||
| val jsonSchema = | ||
| """{"type":"struct","fields":[{"name":"sample","type":{"type":"struct","fields": | ||
| |[{"name":"key","type":"string","nullable":true},{"name":"col_1","type":"integer", | ||
| |"nullable":true},{"name":"col_2","type":"string","nullable":true},{"name":"col_3", | ||
| |"type":"long","nullable":true},{"name":"col_4","type":{"type":"array", | ||
| |"elementType":"string","containsNull":false},"nullable":false}]},"nullable":true}]} | ||
| |{"type":"struct","fields":[{"name":"sample","type":{"type":"struct","fields": | ||
| |[{"name":"key","type":"string","nullable":true},{"name":"col_1","type":"integer", | ||
| |"nullable":true},{"name":"col_2","type":"string","nullable":true},{"name":"col_3", | ||
| |"type":"long","nullable":true},{"name":"col_4","type":{"type":"array", | ||
| |"elementType":"string","containsNull":false},"nullable":false}]}, | ||
| |"nullable":true}]}""".stripMargin | ||
| val schema = DataType.fromJson(jsonSchema).asInstanceOf[StructType] | ||
| val data = Seq(Row(Row("key", 123, "col2value", 109202L, Seq("col4value")))) | ||
| val dataDf = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) | ||
| val dataDfToProto = dataDf.select( | ||
| to_protobuf_wrapper($"sample", name, descFilePathOpt) as 'toProto) | ||
|
|
||
| val eventFromSparkSchema = OneOfEvent.parseFrom( | ||
| dataDfToProto.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0)) | ||
| assert(eventFromSparkSchema.getCol2.isEmpty) | ||
| assert(eventFromSparkSchema.getCol3 == 109202L) | ||
| eventFromSparkSchema.getDescriptorForType.getFields.asScala.map(f => { | ||
| assert(expectedFields.contains(f.getName)) | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| test("Verify OneOf field with recursive fields between from_protobuf -> to_protobuf " + | ||
| "and struct -> from_protobuf") { | ||
| val descriptor = ProtobufUtils.buildDescriptor(testFileDesc, "OneOfEventWithRecursion") | ||
|
|
||
| val recursiveANested = EventRecursiveA.newBuilder() | ||
| .setKey("keyNested3").build() | ||
| val oneOfEventNested = OneOfEventWithRecursion.newBuilder() | ||
| .setKey("keyNested2") | ||
| .setValue("valueNested2") | ||
| .setRecursiveA(recursiveANested).build() | ||
| val recursiveA = EventRecursiveA.newBuilder().setKey("recursiveAKey") | ||
| .setRecursiveA(oneOfEventNested).build() | ||
| val recursiveB = EventRecursiveB.newBuilder() | ||
| .setKey("recursiveBKey") | ||
| .setValue("recursiveBvalue").build() | ||
| val oneOfEventWithRecursion = OneOfEventWithRecursion.newBuilder() | ||
| .setKey("key1") | ||
| .setValue("value1") | ||
| .setRecursiveB(recursiveB) | ||
| .setRecursiveA(recursiveA).build() | ||
|
|
||
| val df = Seq(oneOfEventWithRecursion.toByteArray).toDF("value") | ||
|
|
||
| val options = new java.util.HashMap[String, String]() | ||
| options.put("circularReferenceDepth", "1") | ||
|
|
||
| val fromProtoDf = df.select( | ||
| functions.from_protobuf($"value", | ||
| "OneOfEventWithRecursion", | ||
| testFileDesc, options) as 'sample) | ||
|
|
||
| val toDf = fromProtoDf.select( | ||
| functions.to_protobuf($"sample", "OneOfEventWithRecursion", testFileDesc) as 'toProto) | ||
| val toFromDf = toDf.select( | ||
| functions.from_protobuf($"toProto", | ||
| "OneOfEventWithRecursion", | ||
| testFileDesc, | ||
| options) as 'fromToProto) | ||
|
|
||
| checkAnswer(fromProtoDf, toFromDf) | ||
|
|
||
| val actualFieldNames = fromProtoDf.select("sample.*").schema.fields.toSeq.map(f => f.name) | ||
| descriptor.getFields.asScala.map(f => { | ||
| assert(actualFieldNames.contains(f.getName)) | ||
| }) | ||
|
|
||
| val eventFromSpark = OneOfEventWithRecursion.parseFrom( | ||
| toDf.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0)) | ||
|
|
||
| assert(eventFromSpark.getRecursiveA.getRecursiveA.getKey.equals("keyNested2")) | ||
| assert(eventFromSpark.getRecursiveA.getRecursiveA.getValue.equals("valueNested2")) | ||
| assert(eventFromSpark.getRecursiveA.getRecursiveA.getRecursiveA.getKey.isEmpty) | ||
|
|
||
| val expectedFields = descriptor.getFields.asScala.map(f => f.getName) | ||
| eventFromSpark.getDescriptorForType.getFields.asScala.map(f => { | ||
| assert(expectedFields.contains(f.getName)) | ||
| }) | ||
|
|
||
| val jsonSchema = | ||
| """{"type":"struct","fields":[{"name":"sample","type":{"type":"struct","fields": | ||
| |[{"name":"key","type":"string","nullable":true},{"name":"recursiveA","type": | ||
| |{"type":"struct","fields":[{"name":"recursiveA","type":{"type":"struct","fields": | ||
| |[{"name":"key","type":"string","nullable":true},{"name":"recursiveA","type":"void", | ||
| |"nullable":true},{"name":"recursiveB","type":{"type":"struct","fields":[{"name":"key", | ||
| |"type":"string","nullable":true},{"name":"value","type":"string","nullable":true}, | ||
| |{"name":"recursiveA","type":{"type":"struct","fields":[{"name":"key","type":"string", | ||
| |"nullable":true},{"name":"recursiveA","type":"void","nullable":true},{"name":"recursiveB", | ||
| |"type":"void","nullable":true},{"name":"value","type":"string","nullable":true}]}, | ||
| |"nullable":true}]},"nullable":true},{"name":"value","type":"string","nullable":true}]}, | ||
| |"nullable":true},{"name":"key","type":"string","nullable":true}]},"nullable":true}, | ||
| |{"name":"recursiveB","type":{"type":"struct","fields":[{"name":"key","type":"string", | ||
| |"nullable":true},{"name":"value","type":"string","nullable":true},{"name":"recursiveA", | ||
| |"type":{"type":"struct","fields":[{"name":"key","type":"string","nullable":true}, | ||
| |{"name":"recursiveA","type":{"type":"struct","fields":[{"name":"recursiveA","type": | ||
| |{"type":"struct","fields":[{"name":"key","type":"string","nullable":true}, | ||
| |{"name":"recursiveA","type":"void","nullable":true},{"name":"recursiveB","type":"void", | ||
| |"nullable":true},{"name":"value","type":"string","nullable":true}]},"nullable":true}, | ||
| |{"name":"key","type":"string","nullable":true}]},"nullable":true},{"name":"recursiveB", | ||
| |"type":"void","nullable":true},{"name":"value","type":"string","nullable":true}]}, | ||
| |"nullable":true}]},"nullable":true},{"name":"value","type":"string","nullable":true}]}, | ||
| |"nullable":true}]}""".stripMargin | ||
| val schema = DataType.fromJson(jsonSchema).asInstanceOf[StructType] | ||
| val data = Seq( | ||
| Row( | ||
| Row("key1", | ||
| Row( | ||
| Row("keyNested2", null, null, "valueNested2"), | ||
| "recursiveAKey"), | ||
| null, | ||
| "value1") | ||
| ) | ||
| ) | ||
| val dataDf = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) | ||
| val dataDfToProto = dataDf.select( | ||
| functions.to_protobuf($"sample", "OneOfEventWithRecursion", testFileDesc) as 'toProto) | ||
|
|
||
| val eventFromSparkSchema = OneOfEventWithRecursion.parseFrom( | ||
| dataDfToProto.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0)) | ||
| assert(eventFromSpark.getRecursiveA.getRecursiveA.getKey.equals("keyNested2")) | ||
| assert(eventFromSpark.getRecursiveA.getRecursiveA.getValue.equals("valueNested2")) | ||
| assert(eventFromSpark.getRecursiveA.getRecursiveA.getRecursiveA.getKey.isEmpty) | ||
| eventFromSparkSchema.getDescriptorForType.getFields.asScala.map(f => { | ||
| assert(expectedFields.contains(f.getName)) | ||
| }) | ||
| } | ||
|
Could you add a test that clearly shows the expected schema, similar to my comment here: https://github.com/apache/spark/pull/38922/files#r1051292604? It is not easy to see from these tests what schema a depth of 0 or 2 results in.
||
| } | ||
What is this for? For handling limited recursion?
Yes, correct.
Could you add a comment noting that we might be dropping data here? It will not be easy to see for a future reader.
We could have an option to error out if the actual data has more recursion than configured.
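To make the data-dropping concern concrete, here is a standalone sketch (plain Scala, no Spark APIs; all names are invented for illustration) of what the `(MESSAGE, NullType)` converter effectively does when limited recursion truncates a field: the incoming value is ignored and null is written, so anything nested beyond the configured depth is silently lost.

```scala
object NullTypeConverterSketch {
  // Minimal stand-in for Spark's RowUpdater, just enough to show the behaviour.
  final class FakeRowUpdater(val values: Array[Any]) {
    def setNullAt(ordinal: Int): Unit = values(ordinal) = null
  }

  def main(args: Array[String]): Unit = {
    val row = new FakeRowUpdater(Array[Any]("key1", "nested message beyond the depth limit"))
    // Equivalent of: case (MESSAGE, NullType) => (updater, ordinal, _) => updater.setNullAt(ordinal)
    val converter: (FakeRowUpdater, Int, Any) => Unit =
      (updater, ordinal, _) => updater.setNullAt(ordinal)
    converter(row, 1, "this nested value is discarded")
    println(row.values.toSeq) // List(key1, null) -- the nested payload did not survive
  }
}
```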