@@ -39,7 +39,7 @@ private[protobuf] case class ProtobufDataToCatalyst(
override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType)

override lazy val dataType: DataType = {
val dt = SchemaConverters.toSqlType(messageDescriptor).dataType
val dt = SchemaConverters.toSqlType(messageDescriptor, protobufOptions).dataType
parseMode match {
// With PermissiveMode, the output Catalyst row might contain columns of null values for
// corrupt records, even if some of the columns are not nullable in the user-provided schema.
@@ -157,6 +157,8 @@ private[sql] class ProtobufDeserializer(

case (null, NullType) => (updater, ordinal, _) => updater.setNullAt(ordinal)

case (MESSAGE, NullType) => (updater, ordinal, _) => updater.setNullAt(ordinal)

What is this for? For handling limited recursion?

Contributor Author
yes, correct.

Could you add a comment noting that we might be dropping data here? It will not be easy for a future reader to see.
We could also have an option to error out if the actual data has more recursion than configured.


// TODO: we can avoid boxing if future version of Protobuf provide primitive accessors.
case (BOOLEAN, BooleanType) =>
(updater, ordinal, value) => updater.setBoolean(ordinal, value.asInstanceOf[Boolean])
@@ -235,7 +237,7 @@ private[sql] class ProtobufDeserializer(
writeRecord(new RowUpdater(row), value.asInstanceOf[DynamicMessage])
updater.set(ordinal, row)

case (MESSAGE, ArrayType(st: StructType, containsNull)) =>
case (MESSAGE, ArrayType(st: DataType, containsNull)) =>

Contributor
It's not clear why we need to make this change. It does not look OneOf-related. Could you clarify why this is needed? Does it cover any specific case?

Contributor Author
@mposdev21 Actually, we don't need this here at all; we can add it at the top along with the other ArrayType cases.

newArrayWriter(protoType, protoPath, catalystPath, st, containsNull)

case (ENUM, StringType) =>
@@ -38,6 +38,12 @@ private[sql] class ProtobufOptions(

val parseMode: ParseMode =
parameters.get("mode").map(ParseMode.fromString).getOrElse(FailFastMode)

val circularReferenceType: String = parameters.getOrElse("circularReferenceType", "FIELD_NAME")

@SandishKumarHN @baganokodo2022 moving the discussion here (for threading).

> Besides, can we also support a "CircularReferenceType" option with an enum value of [FIELD_NAME, FIELD_TYPE]? The reason is that navigation can go very deep before the same fully-qualified FIELD_NAME is encountered again, while FIELD_TYPE stops recursive navigation much faster. ...

I didn't quite follow the motivation here. Could you give concrete examples for the two different cases?

Contributor Author
@rangadi we already know about the field_name recursive check: using fd.getFullName we detect the recursion and throw an error. Another option is to detect recursion through the field type. Example below.

message A {
  B b = 1;
}

message B {
  A c = 1;
}

In the case of the field_name recursive check the path is A.B.C, so no recursion is found.
In the case of the field_type recursive check the path is MESSAGE.MESSAGE.MESSAGE, so recursion is found and we either throw an error or drop fields beyond the configured recursive depth.
But the field_type check will also throw an error for the case below, since the path is MESSAGE.MESSAGE.MESSAGE.MESSAGE:

message A {
  B b = 1;
}

message B {
  D d = 1;
}

message D {
  E e = 1;
}

message E {
  int32 key = 1;
}

@baganokodo2022's argument is that the field_type-based check gives users an option to cut off recursion more quickly: with a complex nested schema, a recursive field_name may only be found very deep, and we might hit an OOM before reaching it. The field_type-based check finds the circular reference sooner.

@baganokodo2022 please correct me if I'm wrong.
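
For readers following the thread, here is a minimal sketch of the two lookups being debated (not the PR code; the helper name and the surrounding traversal are assumed): the FIELD_NAME check keys the seen-count map on the field's fully-qualified name, while the FIELD_TYPE check keys it on the message type the field points to.

```scala
import com.google.protobuf.Descriptors.FieldDescriptor

// Hypothetical helper, for illustration only: pick the recursion-detection key
// for a message-typed field under the two proposed modes.
def recursionKey(fd: FieldDescriptor, circularReferenceType: String): String =
  circularReferenceType match {
    case "FIELD_NAME" => fd.getFullName                // e.g. "A.b"
    case "FIELD_TYPE" => fd.getMessageType.getFullName // e.g. "B"
  }

// The schema traversal would then count occurrences of the chosen key, e.g.:
//   val newSeen = seen + (key -> (seen.getOrElse(key, 0) + 1))
```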

@rangadi rangadi Dec 8, 2022
> in the case of field_name recursive check it is A.B.C no recursion.

The first example is clearly recursion. What is 'C' here?

> but it will also throw an error for the below case with the field_type check, since it will be MESSAGE.MESSAGE.MESSAGE.MESSAGE

Why is this recursion?

Are our unit tests showing these cases?

Contributor Author
I would have @baganokodo2022 give more details on the field-type case.

We have not yet added unit tests for the field-type case; I would like to discuss this before adding them.

> thread would be A.B.A.aa.D.d.A.aaa.E

What is this thread?

Given this discussion, let's write down the functionality and examples before we implement, so that we are all on the same page.

Contributor Author
@rangadi fd.fullName is able to detect the recursive field even with different field names. I added a unit test; now I'm confused:
"Fail for recursion field with different field names"

:) yeah, field names should not matter at all.
We can do a video chat to clarify all this.

Contributor Author
@SandishKumarHN SandishKumarHN Dec 9, 2022
@rangadi @baganokodo2022 thanks for the quick meeting. The conclusion was to use the descriptor type's full name; I have added unit tests with some complex schemas.

val recordName = fd.getMessageType.getFullName


// User can choose a circularReferenceDepth of 0, 1, or 2.
// Going beyond 3 levels of recursion is not allowed.
val circularReferenceDepth: Int = parameters.getOrElse("circularReferenceDepth", "-1").toInt

Suggestion for renaming this option: "recursive.fields.max.depth"

circularReferenceDepth sounds very much like a code variable name.

If we go with that, we could rename the variable as well to 'recursiveFieldMaxDepth' (but this is your choice).
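
For context, here is a minimal usage sketch of how a caller would pass this option through from_protobuf, mirroring the test further down in this PR; the DataFrame `df`, the descriptor file path `descFilePath`, and the implicits import are assumptions for the example.

```scala
import scala.collection.JavaConverters._

import org.apache.spark.sql.protobuf.functions.from_protobuf

// Assumes `spark.implicits._` is in scope for the $"..." column syntax and that
// `df` has a binary column "value" holding serialized OneOfEventWithRecursion.
val options = Map("circularReferenceDepth" -> "1").asJava // allow one level of recursion

val parsed = df.select(
  from_protobuf($"value", "OneOfEventWithRecursion", descFilePath, options) as "sample")
```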

}

private[sql] object ProtobufOptions {
@@ -40,19 +40,26 @@ object SchemaConverters {
*
* @since 3.4.0
*/
def toSqlType(descriptor: Descriptor): SchemaType = {
toSqlTypeHelper(descriptor)
def toSqlType(
descriptor: Descriptor,
protobufOptions: ProtobufOptions = ProtobufOptions(Map.empty)): SchemaType = {
toSqlTypeHelper(descriptor, protobufOptions)
}

def toSqlTypeHelper(descriptor: Descriptor): SchemaType = ScalaReflectionLock.synchronized {
def toSqlTypeHelper(
descriptor: Descriptor,
protobufOptions: ProtobufOptions): SchemaType = ScalaReflectionLock.synchronized {

Contributor
not related to this PR, but why would we lock ScalaReflectionLock here?

Yeah, I just noticed. Not sure if we need it.
@SandishKumarHN could we remove this in a follow up?

SchemaType(
StructType(descriptor.getFields.asScala.flatMap(structFieldFor(_, Set.empty)).toArray),
StructType(descriptor.getFields.asScala.flatMap(
structFieldFor(_, Map.empty, Map.empty, protobufOptions: ProtobufOptions)).toArray),
nullable = true)
}

def structFieldFor(
fd: FieldDescriptor,
existingRecordNames: Set[String]): Option[StructField] = {
existingRecordNames: Map[String, Int],

Contributor
Can we add a comment to explain what the map key and value mean here?

+1

Contributor Author
@cloud-fan added a comment.

existingRecordTypes: Map[String, Int],

@SandishKumarHN since it is going to be either FIELD_NAME or FIELD_TYPE, do we need to keep both maps?

protobufOptions: ProtobufOptions): Option[StructField] = {
import com.google.protobuf.Descriptors.FieldDescriptor.JavaType._
val dataType = fd.getJavaType match {
case INT => Some(IntegerType)
@@ -81,9 +88,19 @@ object SchemaConverters {
fd.getMessageType.getFields.forEach { field =>
field.getName match {
case "key" =>
keyType = structFieldFor(field, existingRecordNames).get.dataType
keyType =
structFieldFor(
field,
existingRecordNames,
existingRecordTypes,
protobufOptions).get.dataType
case "value" =>
valueType = structFieldFor(field, existingRecordNames).get.dataType
valueType =
structFieldFor(
field,
existingRecordNames,
existingRecordTypes,
protobufOptions).get.dataType
}
}
return Option(
@@ -92,14 +109,40 @@
MapType(keyType, valueType, valueContainsNull = false).defaultConcreteType,
nullable = false))
case MESSAGE =>
if (existingRecordNames.contains(fd.getFullName)) {
throw QueryCompilationErrors.foundRecursionInProtobufSchema(fd.toString())
// Setting the circularReferenceDepth to 0 allows the field to be recursed once, setting
// it to 1 allows it to be recursed twice, and setting it to 2 allows it to be recursed
// thrice. A circularReferenceDepth value greater than 2 is not allowed. If not
// specified, it defaults to -1, which disables recursive fields.
if (protobufOptions.circularReferenceType.equals("FIELD_TYPE")) {
if (existingRecordTypes.contains(fd.getType.name()) &&
(protobufOptions.circularReferenceDepth < 0 ||
protobufOptions.circularReferenceDepth >= 3)) {
throw QueryCompilationErrors.foundRecursionInProtobufSchema(fd.toString())
} else if (existingRecordTypes.contains(fd.getType.name()) &&

name or full name?
also what keeps track of the recursion depth?

Contributor Author
@rangadi we have two maps with incremental counters, one for the field_name-based check and one for the field_type-based check.

@baganokodo2022 baganokodo2022 Dec 8, 2022
@SandishKumarHN and @rangadi, should we error out on -1, the default value, unless users specifically override it?
0 (tolerance) -> drop all recursed fields once encountered
1 (tolerance) -> allow the same field name (type) to be entered twice
2 (tolerance) -> allow the same field name (type) to be entered 3 times

Thoughts?

In my back-ported branch,

```scala
        val recordName = circularReferenceType match {
          case CircularReferenceTypes.FIELD_NAME =>
            fd.getFullName
          case CircularReferenceTypes.FIELD_TYPE =>
            fd.getFullName().substring(0, fd.getFullName().lastIndexOf("."))
        }

        if (circularReferenceTolerance < 0 && existingRecordNames(recordName) > 0) {
          // no tolerance on circular reference
          logError(s"circular reference in protobuf schema detected [no tolerance] - ${recordName}")
          throw new IllegalStateException(
            s"circular reference in protobuf schema detected [no tolerance] - ${recordName}")
        }

        if (existingRecordNames(recordName) > (circularReferenceTolerance max 0)) {
          // stop navigation and drop the repetitive field
          logInfo(s"circular reference in protobuf schema detected [max tolerance breached] " +
            s"field dropped - ${recordName} = ${existingRecordNames(recordName)}")
          Some(NullType)
        } else {
          val newRecordNames: Map[String, Int] = existingRecordNames +
            (recordName -> (1 + existingRecordNames(recordName)))
          Option(
            fd.getMessageType.getFields.asScala
              .flatMap(structFieldFor(_, newRecordNames, protobufOptions))
              .toSeq)
            .filter(_.nonEmpty)
            .map(StructType.apply)
        }
```

(existingRecordTypes.getOrElse(fd.getType.name(), 0)
<= protobufOptions.circularReferenceDepth)) {
return Some(StructField(fd.getName, NullType, nullable = false))
}
} else {
if (existingRecordNames.contains(fd.getFullName) &&
(protobufOptions.circularReferenceDepth < 0 ||
protobufOptions.circularReferenceDepth >= 3)) {
throw QueryCompilationErrors.foundRecursionInProtobufSchema(fd.toString())
} else if (existingRecordNames.contains(fd.getFullName) &&
existingRecordNames.getOrElse(fd.getFullName, 0)
<= protobufOptions.circularReferenceDepth) {
return Some(StructField(fd.getName, NullType, nullable = false))
}
}
val newRecordNames = existingRecordNames + fd.getFullName

val newRecordNames = existingRecordNames +
(fd.getFullName -> (existingRecordNames.getOrElse(fd.getFullName, 0) + 1))
val newRecordTypes = existingRecordTypes +
(fd.getType.name() -> (existingRecordTypes.getOrElse(fd.getType.name(), 0) + 1))

Option(
fd.getMessageType.getFields.asScala
.flatMap(structFieldFor(_, newRecordNames))
.flatMap(structFieldFor(_, newRecordNames, newRecordTypes, protobufOptions))
.toSeq)
.filter(_.nonEmpty)
.map(StructType.apply)
Binary file modified connector/protobuf/src/test/resources/protobuf/functions_suite.desc
@@ -170,4 +170,40 @@ message timeStampMsg {
message durationMsg {
string key = 1;
Duration duration = 2;
}
}

message OneOfEvent {

Are you testing both OneOf and recursion in the same message? Could you split them into separate messages?

Contributor Author
@rangadi I see a lot of use cases for a "payload" OneOf field with recursive fields inside it, so I thought combining OneOf with recursion would be a good test. Will separate them.

The combined one is fine, we can keep it. Better to have simpler separate tests as well.

nice

string key = 1;
oneof payload {

What do one-of fields look like in the Spark schema? Could you give an example? I could not see the schema in the unit tests.

Contributor Author
@rangadi the "Oneof" field is of message type, Oneof will be converted to a struct type.

int32 col_1 = 2;
string col_2 = 3;
int64 col_3 = 4;
}
repeated string col_4 = 5;
}

message OneOfEventWithRecursion {
string key = 1;
oneof payload {
EventRecursiveA recursiveA = 3;
EventRecursiveB recursiveB = 6;
}
string value = 7;
}

message EventRecursiveA {
OneOfEventWithRecursion recursiveA = 1;
string key = 2;
}

message EventRecursiveB {
string key = 1;
string value = 2;
OneOfEventWithRecursion recursiveA = 3;
}

message Status {
int32 id = 1;
Timestamp trade_time = 2;
Status status = 3;
}
@@ -26,11 +26,11 @@ import com.google.protobuf.{ByteString, DynamicMessage}
import org.apache.spark.sql.{Column, QueryTest, Row}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.functions.{lit, struct}
import org.apache.spark.sql.protobuf.protos.SimpleMessageProtos.SimpleMessageRepeated
import org.apache.spark.sql.protobuf.protos.SimpleMessageProtos.{EventRecursiveA, EventRecursiveB, OneOfEvent, OneOfEventWithRecursion, SimpleMessageRepeated}

Are there tests for recursive fields?

Contributor Author
@rangadi yes:
"Handle recursive fields in Protobuf schema, C->D->Array(C)" and
"Handle recursive fields in Protobuf schema, A->B->A"

Could we move that to different tests?

Contributor Author
@rangadi I didn't understand. These are already two different tests.

import org.apache.spark.sql.protobuf.protos.SimpleMessageProtos.SimpleMessageRepeated.NestedEnum
import org.apache.spark.sql.protobuf.utils.ProtobufUtils
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types.{DayTimeIntervalType, IntegerType, StringType, StructField, StructType, TimestampType}
import org.apache.spark.sql.types.{DataType, DayTimeIntervalType, IntegerType, StringType, StructField, StructType, TimestampType}

class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with ProtobufTestBase
with Serializable {
@@ -417,7 +417,7 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Prot
.show()
}
assert(e.getMessage.contains(
"Found recursive reference in Protobuf schema, which can not be processed by Spark:"
"Found recursive reference in Protobuf schema, which can not be processed by Spark"
))
}
}
@@ -453,7 +453,7 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Prot
.show()
}
assert(e.getMessage.contains(
"Found recursive reference in Protobuf schema, which can not be processed by Spark:"
"Found recursive reference in Protobuf schema, which can not be processed by Spark"
))
}
}
@@ -693,4 +693,173 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Prot
errorClass = "CANNOT_CONSTRUCT_PROTOBUF_DESCRIPTOR",
parameters = Map("descFilePath" -> testFileDescriptor))
}

test("Verify OneOf field between from_protobuf -> to_protobuf and struct -> from_protobuf") {
val descriptor = ProtobufUtils.buildDescriptor(testFileDesc, "OneOfEvent")
val oneOfEvent = OneOfEvent.newBuilder()
.setKey("key")
.setCol1(123)
.setCol3(109202L)
.setCol2("col2value")
.addCol4("col4value").build()

val df = Seq(oneOfEvent.toByteArray).toDF("value")

checkWithFileAndClassName("OneOfEvent") {
case (name, descFilePathOpt) =>
val fromProtoDf = df.select(
from_protobuf_wrapper($"value", name, descFilePathOpt) as 'sample)
val toDf = fromProtoDf.select(
to_protobuf_wrapper($"sample", name, descFilePathOpt) as 'toProto)
val toFromDf = toDf.select(
from_protobuf_wrapper($"toProto", name, descFilePathOpt) as 'fromToProto)
checkAnswer(fromProtoDf, toFromDf)
val actualFieldNames = fromProtoDf.select("sample.*").schema.fields.toSeq.map(f => f.name)
descriptor.getFields.asScala.map(f => {
assert(actualFieldNames.contains(f.getName))
})

val eventFromSpark = OneOfEvent.parseFrom(
toDf.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0))
// OneOf field: the last set value (by order) will overwrite all previous ones.
assert(eventFromSpark.getCol2.equals("col2value"))
assert(eventFromSpark.getCol3 == 0)
val expectedFields = descriptor.getFields.asScala.map(f => f.getName)
eventFromSpark.getDescriptorForType.getFields.asScala.map(f => {
assert(expectedFields.contains(f.getName))
})

val jsonSchema =
"""{"type":"struct","fields":[{"name":"sample","type":{"type":"struct","fields":
|[{"name":"key","type":"string","nullable":true},{"name":"col_1","type":"integer",
|"nullable":true},{"name":"col_2","type":"string","nullable":true},{"name":"col_3",
|"type":"long","nullable":true},{"name":"col_4","type":{"type":"array",
|"elementType":"string","containsNull":false},"nullable":false}]},"nullable":true}]}
|{"type":"struct","fields":[{"name":"sample","type":{"type":"struct","fields":
|[{"name":"key","type":"string","nullable":true},{"name":"col_1","type":"integer",
|"nullable":true},{"name":"col_2","type":"string","nullable":true},{"name":"col_3",
|"type":"long","nullable":true},{"name":"col_4","type":{"type":"array",
|"elementType":"string","containsNull":false},"nullable":false}]},
|"nullable":true}]}""".stripMargin
val schema = DataType.fromJson(jsonSchema).asInstanceOf[StructType]
val data = Seq(Row(Row("key", 123, "col2value", 109202L, Seq("col4value"))))
val dataDf = spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
val dataDfToProto = dataDf.select(
to_protobuf_wrapper($"sample", name, descFilePathOpt) as 'toProto)

val eventFromSparkSchema = OneOfEvent.parseFrom(
dataDfToProto.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0))
assert(eventFromSparkSchema.getCol2.isEmpty)
assert(eventFromSparkSchema.getCol3 == 109202L)
eventFromSparkSchema.getDescriptorForType.getFields.asScala.map(f => {
assert(expectedFields.contains(f.getName))
})
}
}

test("Verify OneOf field with recursive fields between from_protobuf -> to_protobuf " +
"and struct -> from_protobuf") {
val descriptor = ProtobufUtils.buildDescriptor(testFileDesc, "OneOfEventWithRecursion")

val recursiveANested = EventRecursiveA.newBuilder()
.setKey("keyNested3").build()
val oneOfEventNested = OneOfEventWithRecursion.newBuilder()
.setKey("keyNested2")
.setValue("valueNested2")
.setRecursiveA(recursiveANested).build()
val recursiveA = EventRecursiveA.newBuilder().setKey("recursiveAKey")
.setRecursiveA(oneOfEventNested).build()
val recursiveB = EventRecursiveB.newBuilder()
.setKey("recursiveBKey")
.setValue("recursiveBvalue").build()
val oneOfEventWithRecursion = OneOfEventWithRecursion.newBuilder()
.setKey("key1")
.setValue("value1")
.setRecursiveB(recursiveB)
.setRecursiveA(recursiveA).build()

val df = Seq(oneOfEventWithRecursion.toByteArray).toDF("value")

val options = new java.util.HashMap[String, String]()
options.put("circularReferenceDepth", "1")

val fromProtoDf = df.select(
functions.from_protobuf($"value",
"OneOfEventWithRecursion",
testFileDesc, options) as 'sample)

val toDf = fromProtoDf.select(
functions.to_protobuf($"sample", "OneOfEventWithRecursion", testFileDesc) as 'toProto)
val toFromDf = toDf.select(
functions.from_protobuf($"toProto",
"OneOfEventWithRecursion",
testFileDesc,
options) as 'fromToProto)

checkAnswer(fromProtoDf, toFromDf)

val actualFieldNames = fromProtoDf.select("sample.*").schema.fields.toSeq.map(f => f.name)
descriptor.getFields.asScala.map(f => {
assert(actualFieldNames.contains(f.getName))
})

val eventFromSpark = OneOfEventWithRecursion.parseFrom(
toDf.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0))

assert(eventFromSpark.getRecursiveA.getRecursiveA.getKey.equals("keyNested2"))
assert(eventFromSpark.getRecursiveA.getRecursiveA.getValue.equals("valueNested2"))
assert(eventFromSpark.getRecursiveA.getRecursiveA.getRecursiveA.getKey.isEmpty)

val expectedFields = descriptor.getFields.asScala.map(f => f.getName)
eventFromSpark.getDescriptorForType.getFields.asScala.map(f => {
assert(expectedFields.contains(f.getName))
})

val jsonSchema =
"""{"type":"struct","fields":[{"name":"sample","type":{"type":"struct","fields":
|[{"name":"key","type":"string","nullable":true},{"name":"recursiveA","type":
|{"type":"struct","fields":[{"name":"recursiveA","type":{"type":"struct","fields":
|[{"name":"key","type":"string","nullable":true},{"name":"recursiveA","type":"void",
|"nullable":true},{"name":"recursiveB","type":{"type":"struct","fields":[{"name":"key",
|"type":"string","nullable":true},{"name":"value","type":"string","nullable":true},
|{"name":"recursiveA","type":{"type":"struct","fields":[{"name":"key","type":"string",
|"nullable":true},{"name":"recursiveA","type":"void","nullable":true},{"name":"recursiveB",
|"type":"void","nullable":true},{"name":"value","type":"string","nullable":true}]},
|"nullable":true}]},"nullable":true},{"name":"value","type":"string","nullable":true}]},
|"nullable":true},{"name":"key","type":"string","nullable":true}]},"nullable":true},
|{"name":"recursiveB","type":{"type":"struct","fields":[{"name":"key","type":"string",
|"nullable":true},{"name":"value","type":"string","nullable":true},{"name":"recursiveA",
|"type":{"type":"struct","fields":[{"name":"key","type":"string","nullable":true},
|{"name":"recursiveA","type":{"type":"struct","fields":[{"name":"recursiveA","type":
|{"type":"struct","fields":[{"name":"key","type":"string","nullable":true},
|{"name":"recursiveA","type":"void","nullable":true},{"name":"recursiveB","type":"void",
|"nullable":true},{"name":"value","type":"string","nullable":true}]},"nullable":true},
|{"name":"key","type":"string","nullable":true}]},"nullable":true},{"name":"recursiveB",
|"type":"void","nullable":true},{"name":"value","type":"string","nullable":true}]},
|"nullable":true}]},"nullable":true},{"name":"value","type":"string","nullable":true}]},
|"nullable":true}]}""".stripMargin
val schema = DataType.fromJson(jsonSchema).asInstanceOf[StructType]
val data = Seq(
Row(
Row("key1",
Row(
Row("keyNested2", null, null, "valueNested2"),
"recursiveAKey"),
null,
"value1")
)
)
val dataDf = spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
val dataDfToProto = dataDf.select(
functions.to_protobuf($"sample", "OneOfEventWithRecursion", testFileDesc) as 'toProto)

val eventFromSparkSchema = OneOfEventWithRecursion.parseFrom(
dataDfToProto.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0))
assert(eventFromSpark.getRecursiveA.getRecursiveA.getKey.equals("keyNested2"))
assert(eventFromSpark.getRecursiveA.getRecursiveA.getValue.equals("valueNested2"))
assert(eventFromSpark.getRecursiveA.getRecursiveA.getRecursiveA.getKey.isEmpty)
eventFromSparkSchema.getDescriptorForType.getFields.asScala.map(f => {
assert(expectedFields.contains(f.getName))
})
}

Could you add a test that clearly shows the expected schema, similar to my comment here: https://github.com/apache/spark/pull/38922/files#r1051292604

It is not easy to see from these tests what schema a depth setting of 0 or 2 results in.

}
2 changes: 1 addition & 1 deletion core/src/main/resources/error/error-classes.json
@@ -1016,7 +1016,7 @@
},
"RECURSIVE_PROTOBUF_SCHEMA" : {
"message" : [
"Found recursive reference in Protobuf schema, which can not be processed by Spark: <fieldDescriptor>"
"Found recursive reference in Protobuf schema, which can not be processed by Spark by default: <fieldDescriptor>. try setting the option `circularReferenceDepth` as 0 or 1 or 2. Going beyond 3 levels of recursion is not allowed."
]
},
"RENAME_SRC_PATH_NOT_FOUND" : {