apache · shujingyang-db · Dec 11, 2023 · Dec 11, 2023 · Dec 11, 2023 · Dec 12, 2023
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
@@ -25,6 +25,7 @@ import javax.xml.stream.events._
 import javax.xml.transform.stream.StreamSource
 import javax.xml.validation.Schema
 
+import scala.annotation.tailrec
 import scala.collection.mutable.ArrayBuffer
 import scala.jdk.CollectionConverters._
 import scala.util.Try
@@ -35,7 +36,21 @@ import org.apache.spark.SparkUpgradeException
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.ExprUtils
-import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, BadRecordException, CaseInsensitiveMap, DateFormatter, DropMalformedMode, FailureSafeParser, GenericArrayData, MapData, ParseMode, PartialResultArrayException, PartialResultException, PermissiveMode, TimestampFormatter}
+import org.apache.spark.sql.catalyst.util.{
+  ArrayBasedMapData,
+  BadRecordException,
+  CaseInsensitiveMap,
+  DateFormatter,
+  DropMalformedMode,
+  FailureSafeParser,
+  GenericArrayData,
+  MapData,
+  ParseMode,
+  PartialResultArrayException,
+  PartialResultException,
+  PermissiveMode,
+  TimestampFormatter
+}
 import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
 import org.apache.spark.sql.catalyst.xml.StaxXmlParser.convertStream
 import org.apache.spark.sql.errors.QueryExecutionErrors
@@ -69,6 +84,7 @@ class StaxXmlParser(
 
   private val decimalParser = ExprUtils.getDecimalParser(options.locale)
 
+  private val caseSensitive = SQLConf.get.caseSensitiveAnalysis
 
   /**
    * Parses a single XML string and turns it into either one resulting row or no row (if the
@@ -85,7 +101,7 @@ class StaxXmlParser(
   }
 
   private def getFieldNameToIndex(schema: StructType): Map[String, Int] = {
-    if (SQLConf.get.caseSensitiveAnalysis) {
+    if (caseSensitive) {
       schema.map(_.name).zipWithIndex.toMap
     } else {
       CaseInsensitiveMap(schema.map(_.name).zipWithIndex.toMap)
@@ -201,27 +217,30 @@ class StaxXmlParser(
       case (_: EndElement, _: DataType) => null
       case (c: Characters, ArrayType(st, _)) =>
         // For `ArrayType`, it needs to return the type of element. The values are merged later.
+        parser.next
         convertTo(c.getData, st)
       case (c: Characters, st: StructType) =>
-        // If a value tag is present, this can be an attribute-only element whose values is in that
-        // value tag field. Or, it can be a mixed-type element with both some character elements
-        // and other complex structure. Character elements are ignored.
-        val attributesOnly = st.fields.forall { f =>
-          f.name == options.valueTag || f.name.startsWith(options.attributePrefix)
-        }
-        if (attributesOnly) {
-          // If everything else is an attribute column, there's no complex structure.
-          // Just return the value of the character element, or null if we don't have a value tag
-          st.find(_.name == options.valueTag).map(
-            valueTag => convertTo(c.getData, valueTag.dataType)).orNull
-        } else {
-          // Otherwise, ignore this character element, and continue parsing the following complex
-          // structure
-          parser.next
-          parser.peek match {
-            case _: EndElement => null // no struct here at all; done
-            case _ => convertObject(parser, st)
-          }
+        parser.next
+        parser.peek match {
+          case _: EndElement =>
+            // It couldn't be an array of value tags
+            // as the opening tag is immediately followed by a closing tag.
+            if (c.isWhiteSpace) {
+              return null
+            }
+            val indexOpt = getFieldNameToIndex(st).get(options.valueTag)
+            indexOpt match {
+              case Some(index) =>
+                convertTo(c.getData, st.fields(index).dataType)
+              case None => null
+            }
+          case _ =>
+            val row = convertObject(parser, st)
+            if (!c.isWhiteSpace) {
+              addOrUpdate(row.toSeq(st).toArray, st, options.valueTag, c.getData, addToTail = false)
+            } else {
+              row
+            }
         }
       case (_: Characters, _: StringType) =>
         convertTo(StaxXmlParserUtils.currentStructureAsString(parser), StringType)
@@ -237,6 +256,7 @@ class StaxXmlParser(
           case _ => convertField(parser, dataType, attributes)
         }
       case (c: Characters, dt: DataType) =>
+        parser.next
         convertTo(c.getData, dt)
       case (e: XMLEvent, dt: DataType) =>
         throw new IllegalArgumentException(
@@ -262,7 +282,12 @@ class StaxXmlParser(
         case e: StartElement =>
           kvPairs +=
             (UTF8String.fromString(StaxXmlParserUtils.getName(e.asStartElement.getName, options)) ->
-             convertField(parser, valueType))
+            convertField(parser, valueType))
+        case c: Characters if !c.isWhiteSpace =>
+          // Create a value tag field for it
+          kvPairs +=
+          // TODO: We don't support an array value tags in map yet.
+          (UTF8String.fromString(options.valueTag) -> convertTo(c.getData, valueType))
         case _: EndElement =>
           shouldStop = StaxXmlParserUtils.checkEndElement(parser)
         case _ => // do nothing
@@ -343,8 +368,9 @@ class StaxXmlParser(
     val row = new Array[Any](schema.length)
     val nameToIndex = getFieldNameToIndex(schema)
     // If there are attributes, then we process them first.
-    convertAttributes(rootAttributes, schema).toSeq.foreach { case (f, v) =>
-      nameToIndex.get(f).foreach { row(_) = v }
+    convertAttributes(rootAttributes, schema).toSeq.foreach {
+      case (f, v) =>
+        nameToIndex.get(f).foreach { row(_) = v }
     }
 
     val wildcardColName = options.wildcardColName
@@ -405,15 +431,11 @@ class StaxXmlParser(
             badRecordException = badRecordException.orElse(Some(e))
         }
 
-        case c: Characters if !c.isWhiteSpace && isRootAttributesOnly =>
-          nameToIndex.get(options.valueTag) match {
-            case Some(index) =>
-              row(index) = convertTo(c.getData, schema(index).dataType)
-            case None => // do nothing
-          }
+        case c: Characters if !c.isWhiteSpace =>
+          addOrUpdate(row, schema, options.valueTag, c.getData)
 
         case _: EndElement =>
-          shouldStop = StaxXmlParserUtils.checkEndElement(parser)
+          shouldStop = parseAndCheckEndElement(row, schema, parser)
 
         case _ => // do nothing
       }
@@ -576,6 +598,54 @@ class StaxXmlParser(
       castTo(data, FloatType).asInstanceOf[Float]
     }
   }
+
+  @tailrec
+  private def parseAndCheckEndElement(
+      row: Array[Any],
+      schema: StructType,
+      parser: XMLEventReader): Boolean = {
+    parser.peek match {
+      case _: EndElement | _: EndDocument => true
+      case _: StartElement => false
+      case c: Characters if !c.isWhiteSpace =>
+        parser.nextEvent()
+        addOrUpdate(row, schema, options.valueTag, c.getData)
+        parseAndCheckEndElement(row, schema, parser)
+      case _ =>
+        parser.nextEvent()
+        parseAndCheckEndElement(row, schema, parser)
+    }
+  }
+
+  private def addOrUpdate(
+      row: Array[Any],
+      schema: StructType,
+      name: String,
+      data: String,
+      addToTail: Boolean = true): InternalRow = {
+    schema.getFieldIndex(name) match {
+      case Some(index) =>
+        schema(index).dataType match {
+          case ArrayType(elementType, _) =>
+            val value = convertTo(data, elementType)
+            val result = if (row(index) == null) {
+              ArrayBuffer(value)
+            } else {
+              val genericArrayData = row(index).asInstanceOf[GenericArrayData]
+              if (addToTail) {
+                genericArrayData.toArray(elementType) :+ value
+              } else {
+                value +: genericArrayData.toArray(elementType)
+              }
+            }
+            row(index) = new GenericArrayData(result)
+          case dataType =>
+            row(index) = convertTo(data, dataType)
+        }
+      case None => // do nothing
+    }
+    InternalRow.fromSeq(row.toIndexedSeq)
+  }
 }
 
 /**

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala
@@ -164,7 +164,6 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
     }
   }
 
-  @tailrec
   private def inferField(parser: XMLEventReader): DataType = {
     parser.peek match {
       case _: EndElement => NullType
@@ -182,18 +181,25 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
           case _ => inferField(parser)
         }
       case c: Characters if !c.isWhiteSpace =>
-        // This could be the characters of a character-only element, or could have mixed
-        // characters and other complex structure
         val characterType = inferFrom(c.getData)
         parser.nextEvent()
         parser.peek match {
           case _: StartElement =>
-            // Some more elements follow; so ignore the characters.
-            // Use the schema of the rest
-            inferObject(parser).asInstanceOf[StructType]
+            // Some more elements follow;
+            // This is a mix of values and other elements
+            val innerType = inferObject(parser).asInstanceOf[StructType]
+            addOrUpdateValueTagType(innerType, characterType)
           case _ =>
-            // That's all, just the character-only body; use that as the type
-            characterType
+            val fieldType = inferField(parser)
+            fieldType match {
+              case st: StructType => addOrUpdateValueTagType(st, characterType)
+              case _: NullType => characterType
+              case _: DataType =>
+                // The field type couldn't be an array type
+                new StructType()
+                .add(options.valueTag, addOrUpdateType(Some(characterType), fieldType))
+
+            }
         }
       case e: XMLEvent =>
         throw new IllegalArgumentException(s"Failed to parse data with unexpected event $e")
@@ -229,17 +235,19 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
     val nameToDataType =
       collection.mutable.TreeMap.empty[String, DataType](caseSensitivityOrdering)
 
-    def addOrUpdateType(fieldName: String, newType: DataType): Unit = {
-      val oldTypeOpt = nameToDataType.get(fieldName)
-      oldTypeOpt match {
-        // If the field name exists in the map,
-        // merge the type and infer the combined field as an array type if necessary
-        case Some(oldType) if !oldType.isInstanceOf[ArrayType] =>
-          nameToDataType.update(fieldName, ArrayType(compatibleType(oldType, newType)))
-        case Some(oldType) =>
-          nameToDataType.update(fieldName, compatibleType(oldType, newType))
-        case None =>
-          nameToDataType.put(fieldName, newType)
+    @tailrec
+    def inferAndCheckEndElement(parser: XMLEventReader): Boolean = {
+      parser.peek match {
+        case _: EndElement | _: EndDocument => true
+        case _: StartElement => false
+        case c: Characters if !c.isWhiteSpace =>
+          val characterType = inferFrom(c.getData)
+          parser.nextEvent()
+          addOrUpdateType(nameToDataType, options.valueTag, characterType)
+          inferAndCheckEndElement(parser)
+        case _ =>
+          parser.nextEvent()
+          inferAndCheckEndElement(parser)
       }
     }
 
@@ -248,7 +256,7 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
       StaxXmlParserUtils.convertAttributesToValuesMap(rootAttributes, options)
     rootValuesMap.foreach {
       case (f, v) =>
-        addOrUpdateType(f, inferFrom(v))
+        addOrUpdateType(nameToDataType, f, inferFrom(v))
     }
     var shouldStop = false
     while (!shouldStop) {
@@ -281,29 +289,19 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
           }
           // Add the field and datatypes so that we can check if this is ArrayType.
           val field = StaxXmlParserUtils.getName(e.asStartElement.getName, options)
-          addOrUpdateType(field, inferredType)
+          addOrUpdateType(nameToDataType, field, inferredType)
 
         case c: Characters if !c.isWhiteSpace =>
           // This can be an attribute-only object
           val valueTagType = inferFrom(c.getData)
-          addOrUpdateType(options.valueTag, valueTagType)
+          addOrUpdateType(nameToDataType, options.valueTag, valueTagType)
 
         case _: EndElement =>
-          shouldStop = StaxXmlParserUtils.checkEndElement(parser)
+          shouldStop = inferAndCheckEndElement(parser)
 
         case _ => // do nothing
       }
     }
-    // A structure object is an attribute-only element
-    // if it only consists of attributes and valueTags.
-    // If not, we will remove the valueTag field from the schema
-    val attributesOnly = nameToDataType.forall {
-      case (fieldName, _) =>
-        fieldName == options.valueTag || fieldName.startsWith(options.attributePrefix)
-    }
-    if (!attributesOnly) {
-      nameToDataType -= options.valueTag
-    }
 
     // Note: other code relies on this sorting for correctness, so don't remove it!
     StructType(nameToDataType.map{
@@ -534,4 +532,75 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
       }
     }
   }
+
+  /**
+   * This helper function merges the data type of value tags and inner elements.
+   * It could only be structure data. Consider the following case,
+   * <a>
+   *   value1
+   *   <b>1</b>
+   *   value2
+   * </a>
+   * Input: ''a struct<b int, _VALUE string>'' and ''_VALUE string''
+   * Return: ''a struct<b int, _VALUE array<string>>''
+   * @param objectType inner elements' type
+   * @param valueTagType value tag's type
+   */
+  private[xml] def addOrUpdateValueTagType(
+      objectType: DataType,
+      valueTagType: DataType): DataType = {
+    (objectType, valueTagType) match {
+      case (st: StructType, _) =>
+        val valueTagIndexOpt = st.getFieldIndex(options.valueTag)
+
+        valueTagIndexOpt match {
+          // If the field name exists in the inner elements,
+          // merge the type and infer the combined field as an array type if necessary
+          case Some(index) if !st(index).dataType.isInstanceOf[ArrayType] =>
+            updateStructField(
+              st,
+              index,
+              ArrayType(compatibleType(st(index).dataType, valueTagType)))
+          case Some(index) =>
+            updateStructField(st, index, compatibleType(st(index).dataType, valueTagType))
+          case None =>
+            st.add(options.valueTag, valueTagType)
+        }
+      case _ =>
+        throw new IllegalStateException(
+          "illegal state when merging value tags types in schema inference"
+        )
+    }
+  }
+
+  private def updateStructField(
+      structType: StructType,
+      index: Int,
+      newType: DataType): StructType = {
+    val newFields: Array[StructField] =
+      structType.fields.updated(index, structType.fields(index).copy(dataType = newType))
+    StructType(newFields)
+  }
+
+  private def addOrUpdateType(
+      nameToDataType: collection.mutable.TreeMap[String, DataType],
+      fieldName: String,
+      newType: DataType): Unit = {
+    val oldTypeOpt = nameToDataType.get(fieldName)
+    val mergedType = addOrUpdateType(oldTypeOpt, newType)
+    nameToDataType.put(fieldName, mergedType)
+  }
+
+  private def addOrUpdateType(oldTypeOpt: Option[DataType], newType: DataType): DataType = {
+    oldTypeOpt match {
+      // If the field name already exists,
+      // merge the type and infer the combined field as an array type if necessary
+      case Some(oldType) if !oldType.isInstanceOf[ArrayType] && !newType.isInstanceOf[NullType] =>
+        ArrayType(compatibleType(oldType, newType))
+      case Some(oldType) =>
+        compatibleType(oldType, newType)
+      case None =>
+        newType
+    }
+  }
 }