apache · SandishKumarHN · Dec 5, 2022 · Dec 7, 2022 · Dec 7, 2022 · Dec 7, 2022
diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala
@@ -39,7 +39,7 @@ private[protobuf] case class ProtobufDataToCatalyst(
   override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType)
 
   override lazy val dataType: DataType = {
-    val dt = SchemaConverters.toSqlType(messageDescriptor).dataType
+    val dt = SchemaConverters.toSqlType(messageDescriptor, protobufOptions).dataType
     parseMode match {
       // With PermissiveMode, the output Catalyst row might contain columns of null values for
       // corrupt records, even if some of the columns are not nullable in the user-provided schema.

diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala
@@ -157,6 +157,8 @@ private[sql] class ProtobufDeserializer(
 
       case (null, NullType) => (updater, ordinal, _) => updater.setNullAt(ordinal)
 
+      case (MESSAGE, NullType) => (updater, ordinal, _) => updater.setNullAt(ordinal)
+
       // TODO: we can avoid boxing if future version of Protobuf provide primitive accessors.
       case (BOOLEAN, BooleanType) =>
         (updater, ordinal, value) => updater.setBoolean(ordinal, value.asInstanceOf[Boolean])
@@ -235,7 +237,7 @@ private[sql] class ProtobufDeserializer(
           writeRecord(new RowUpdater(row), value.asInstanceOf[DynamicMessage])
           updater.set(ordinal, row)
 
-      case (MESSAGE, ArrayType(st: StructType, containsNull)) =>
+      case (MESSAGE, ArrayType(st: DataType, containsNull)) =>
         newArrayWriter(protoType, protoPath, catalystPath, st, containsNull)
 
       case (ENUM, StringType) =>

diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala
@@ -38,6 +38,12 @@ private[sql] class ProtobufOptions(
 
   val parseMode: ParseMode =
     parameters.get("mode").map(ParseMode.fromString).getOrElse(FailFastMode)
+
+  // Setting the circularReferenceDepth to 0 allows the field to be recursed once, setting
+  // it to 1 allows it to be recursed twice, and setting it to 2 allows it to be recursed
+  // thrice. A circularReferenceDepth value greater than 2 is not allowed. If it is not
+  // specified, it will default to -1, which disables recursive fields.
+  val circularReferenceDepth: Int = parameters.getOrElse("circularReferenceDepth", "-1").toInt
 }
 
 private[sql] object ProtobufOptions {

diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala
@@ -40,19 +40,25 @@ object SchemaConverters {
    *
    * @since 3.4.0
    */
-  def toSqlType(descriptor: Descriptor): SchemaType = {
-    toSqlTypeHelper(descriptor)
+  def toSqlType(
+      descriptor: Descriptor,
+      protobufOptions: ProtobufOptions = ProtobufOptions(Map.empty)): SchemaType = {
+    toSqlTypeHelper(descriptor, protobufOptions)
   }
 
-  def toSqlTypeHelper(descriptor: Descriptor): SchemaType = ScalaReflectionLock.synchronized {
+  def toSqlTypeHelper(
+      descriptor: Descriptor,
+      protobufOptions: ProtobufOptions): SchemaType = ScalaReflectionLock.synchronized {
     SchemaType(
-      StructType(descriptor.getFields.asScala.flatMap(structFieldFor(_, Set.empty)).toArray),
+      StructType(descriptor.getFields.asScala.flatMap(
+        structFieldFor(_, Map.empty, protobufOptions: ProtobufOptions)).toArray),
       nullable = true)
   }
 
   def structFieldFor(
       fd: FieldDescriptor,
-      existingRecordNames: Set[String]): Option[StructField] = {
+      existingRecordNames: Map[String, Int],
+      protobufOptions: ProtobufOptions): Option[StructField] = {
     import com.google.protobuf.Descriptors.FieldDescriptor.JavaType._
     val dataType = fd.getJavaType match {
       case INT => Some(IntegerType)
@@ -81,9 +87,17 @@ object SchemaConverters {
         fd.getMessageType.getFields.forEach { field =>
           field.getName match {
             case "key" =>
-              keyType = structFieldFor(field, existingRecordNames).get.dataType
+              keyType =
+                structFieldFor(
+                  field,
+                  existingRecordNames,
+                  protobufOptions).get.dataType
             case "value" =>
-              valueType = structFieldFor(field, existingRecordNames).get.dataType
+              valueType =
+                structFieldFor(
+                  field,
+                  existingRecordNames,
+                  protobufOptions).get.dataType
           }
         }
         return Option(
@@ -92,14 +106,26 @@ object SchemaConverters {
             MapType(keyType, valueType, valueContainsNull = false).defaultConcreteType,
             nullable = false))
       case MESSAGE =>
-        if (existingRecordNames.contains(fd.getFullName)) {
+        // Setting the circularReferenceDepth to 0 allows the field to be recursed once, setting
+        // it to 1 allows it to be recursed twice, and setting it to 2 allows it to be recursed
+        // thrice. A circularReferenceDepth value greater than 2 is not allowed. If it is not
+        // specified, it will default to -1, which disables recursive fields.
+        val recordName = fd.getMessageType.getFullName
-        val recordName = fd.getMessageType.getFullName
+        val recordName = fd.getFullName
-        val recordName = fd.getMessageType.getFullName
+        val recordName = fd.getFullName
+        if (existingRecordNames.contains(recordName) &&
+          protobufOptions.circularReferenceDepth < 0 ) {
           throw QueryCompilationErrors.foundRecursionInProtobufSchema(fd.toString())
+        } else if (existingRecordNames.contains(recordName) &&
+          existingRecordNames.getOrElse(recordName, 0)
+            > protobufOptions.circularReferenceDepth) {
+          return Some(StructField(fd.getName, NullType, nullable = false))
         }
-        val newRecordNames = existingRecordNames + fd.getFullName
+
+        val newRecordNames = existingRecordNames +
+          (recordName -> (existingRecordNames.getOrElse(recordName, 0) + 1))
 
         Option(
           fd.getMessageType.getFields.asScala
-            .flatMap(structFieldFor(_, newRecordNames))
+            .flatMap(structFieldFor(_, newRecordNames, protobufOptions))
             .toSeq)
           .filter(_.nonEmpty)
           .map(StructType.apply)

diff --git a/connector/protobuf/src/test/resources/protobuf/functions_suite.desc b/connector/protobuf/src/test/resources/protobuf/functions_suite.desc
diff --git a/connector/protobuf/src/test/resources/protobuf/functions_suite.proto b/connector/protobuf/src/test/resources/protobuf/functions_suite.proto
@@ -170,4 +170,118 @@ message timeStampMsg {
 message durationMsg {
   string key = 1;
   Duration duration = 2;
-}
+}
+
+message OneOfEvent {
+  string key = 1;
+  oneof payload {
+    int32 col_1 = 2;
+    string col_2 = 3;
+    int64 col_3 = 4;
+  }
+  repeated string col_4 = 5;
+}
+
+message EventWithRecursion {
+  int32 key = 1;
+  messageA a = 2;
+}
+message messageA {
+  EventWithRecursion a = 1;
+  messageB b = 2;
+}
+message messageB {
+  EventWithRecursion aa = 1;
+  messageC c = 2;
+}
+message messageC {
+  EventWithRecursion aaa = 1;
+  int32 key= 2;
+}
+
+message Employee {
+  string firstName = 1;
+  string lastName = 2;
+  oneof role {
+    IC ic = 3;
+    EM em = 4;
+    EM2 em2 = 5;
+    Director dir = 6;
+    SeniorDirector sDir = 7;
+    VP vp = 8;
+    SVP svp = 9;
+    CTO cto = 10;
+    CEO ceo = 11;
+  }
+}
+
+message IC {
+  repeated string skills = 1;
+  Employee icManager = 2;		// EM or EM2 or Director..
+}
+
+message EM {
+  int64 teamsize = 1;
+  Employee emManager = 2;		// EM2 or Director..
+}
+
+message EM2 {
+  int64 teamsize = 1;
+  Employee em2Manager = 2;		// Director or Senior Director..
+}
+
+message Director {
+  int64 teamsize = 1;
+  Employee dirManager = 2;		// Senior Director or VP..
+}
+
+message SeniorDirector {
+  int64 teamsize = 1;
+  Employee sdManager = 2;		// VP or SVP...
+}
+
+message VP {
+  int64 teamsize = 1;
+  Employee vpManager = 2;		// SVP or CTO...
+}
+
+message SVP {
+  int64 teamsize = 1;
+  Employee svpManager = 2;		// CTO or CEO
+}
+
+message CTO {
+  int64 teamsize = 1;
+  Employee ctoManager = 2;		// CEO
+}
+
+message CEO {
+  int64 teamsize = 1;
+  Employee ceoManager = 2;		// null
+}
+
+message OneOfEventWithRecursion {
+  string key = 1;
+  oneof payload {
+    EventRecursiveA recursiveA = 3;
+    EventRecursiveB recursiveB = 6;
+  }
+  string value = 7;
+}
+
+message EventRecursiveA {
+  OneOfEventWithRecursion recursiveA = 1;
+  string key = 2;
+}
+
+message EventRecursiveB {
+  string key = 1;
+  string value = 2;
+  OneOfEventWithRecursion recursiveA = 3;
+}
+
+message Status {
+  int32 id = 1;
+  Timestamp trade_time = 2;
+  Status status = 3;
+}