-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-46382][SQL] XML: Capture values interspersed between elements #44318
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 6 commits
52bb404
68c5eac
4f63617
815859f
4be69e3
02193a8
0e79565
f60a758
4f3acc0
3de09f4
775052a
306cbe6
2b1fc93
bc89b57
cc27944
6599147
0fa042d
51cdf54
dcae962
0eb8aeb
a5c3fbc
32bd9fe
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -25,6 +25,7 @@ import javax.xml.stream.events._ | |||||
| import javax.xml.transform.stream.StreamSource | ||||||
| import javax.xml.validation.Schema | ||||||
|
|
||||||
| import scala.annotation.tailrec | ||||||
| import scala.collection.mutable.ArrayBuffer | ||||||
| import scala.jdk.CollectionConverters._ | ||||||
| import scala.util.Try | ||||||
|
|
@@ -35,7 +36,21 @@ import org.apache.spark.SparkUpgradeException | |||||
| import org.apache.spark.internal.Logging | ||||||
| import org.apache.spark.sql.catalyst.InternalRow | ||||||
| import org.apache.spark.sql.catalyst.expressions.ExprUtils | ||||||
| import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, BadRecordException, CaseInsensitiveMap, DateFormatter, DropMalformedMode, FailureSafeParser, GenericArrayData, MapData, ParseMode, PartialResultArrayException, PartialResultException, PermissiveMode, TimestampFormatter} | ||||||
| import org.apache.spark.sql.catalyst.util.{ | ||||||
| ArrayBasedMapData, | ||||||
| BadRecordException, | ||||||
| CaseInsensitiveMap, | ||||||
| DateFormatter, | ||||||
| DropMalformedMode, | ||||||
| FailureSafeParser, | ||||||
| GenericArrayData, | ||||||
| MapData, | ||||||
| ParseMode, | ||||||
| PartialResultArrayException, | ||||||
| PartialResultException, | ||||||
| PermissiveMode, | ||||||
| TimestampFormatter | ||||||
| } | ||||||
| import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT | ||||||
| import org.apache.spark.sql.catalyst.xml.StaxXmlParser.convertStream | ||||||
| import org.apache.spark.sql.errors.QueryExecutionErrors | ||||||
|
|
@@ -62,6 +77,7 @@ class StaxXmlParser( | |||||
|
|
||||||
| private val decimalParser = ExprUtils.getDecimalParser(options.locale) | ||||||
|
|
||||||
| private val caseSensitive = SQLConf.get.caseSensitiveAnalysis | ||||||
|
|
||||||
| /** | ||||||
| * Parses a single XML string and turns it into either one resulting row or no row (if the | ||||||
|
|
@@ -78,7 +94,7 @@ class StaxXmlParser( | |||||
| } | ||||||
|
|
||||||
| private def getFieldNameToIndex(schema: StructType): Map[String, Int] = { | ||||||
| if (SQLConf.get.caseSensitiveAnalysis) { | ||||||
| if (caseSensitive) { | ||||||
| schema.map(_.name).zipWithIndex.toMap | ||||||
| } else { | ||||||
| CaseInsensitiveMap(schema.map(_.name).zipWithIndex.toMap) | ||||||
|
|
@@ -194,27 +210,30 @@ class StaxXmlParser( | |||||
| case (_: EndElement, _: DataType) => null | ||||||
| case (c: Characters, ArrayType(st, _)) => | ||||||
| // For `ArrayType`, it needs to return the type of element. The values are merged later. | ||||||
| parser.next | ||||||
| convertTo(c.getData, st) | ||||||
| case (c: Characters, st: StructType) => | ||||||
| // If a value tag is present, this can be an attribute-only element whose values is in that | ||||||
| // value tag field. Or, it can be a mixed-type element with both some character elements | ||||||
| // and other complex structure. Character elements are ignored. | ||||||
| val attributesOnly = st.fields.forall { f => | ||||||
| f.name == options.valueTag || f.name.startsWith(options.attributePrefix) | ||||||
| } | ||||||
| if (attributesOnly) { | ||||||
| // If everything else is an attribute column, there's no complex structure. | ||||||
| // Just return the value of the character element, or null if we don't have a value tag | ||||||
| st.find(_.name == options.valueTag).map( | ||||||
| valueTag => convertTo(c.getData, valueTag.dataType)).orNull | ||||||
| } else { | ||||||
| // Otherwise, ignore this character element, and continue parsing the following complex | ||||||
| // structure | ||||||
| parser.next | ||||||
| parser.peek match { | ||||||
| case _: EndElement => null // no struct here at all; done | ||||||
| case _ => convertObject(parser, st) | ||||||
| } | ||||||
| parser.next | ||||||
| parser.peek match { | ||||||
| case _: EndElement => | ||||||
| // It couldn't be an array of value tags | ||||||
| // as the opening tag is immediately followed by a closing tag. | ||||||
| if (isEmptyString(c)) { | ||||||
| return null | ||||||
| } | ||||||
| val indexOpt = getFieldNameToIndex(st).get(options.valueTag) | ||||||
| indexOpt match { | ||||||
| case Some(index) => | ||||||
| convertTo(c.getData, st.fields(index).dataType) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this be returning a
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, I get that. It looks like the assumption is that |
||||||
| case None => null | ||||||
| } | ||||||
| case _ => | ||||||
| val row = convertObject(parser, st) | ||||||
|
Comment on lines
+237
to
+238
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will this handle values separated by comment or cdata? If so, we don't need
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for bringing this up! I added some test cases for comments. We still need this branch as |
||||||
| if (!isEmptyString(c)) { | ||||||
|
||||||
| if (!isEmptyString(c)) { | |
| if (!c.isWhiteSpace) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why addToTail is false here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is because in this case, we encounter the interspersed value first and then the nested objects. We want to make sure that the value tag appears before the nested objects
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is parser.next not required here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We don't need to move the next event. currentStructureAsString will move the parser pointer.
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| case c: Characters if !isEmptyString(c) => | |
| case c: Characters if !c.isWhiteSpace => |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
| case c: Characters if !isEmptyString(c) => | |
| case c: Characters if !c.isWhiteSpace => |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is same as c.isWhiteSpace
shujingyang-db marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,6 @@ import javax.xml.namespace.QName | |
| import javax.xml.stream.{EventFilter, XMLEventReader, XMLInputFactory, XMLStreamConstants} | ||
| import javax.xml.stream.events._ | ||
|
|
||
| import scala.annotation.tailrec | ||
| import scala.jdk.CollectionConverters._ | ||
|
|
||
| object StaxXmlParserUtils { | ||
|
|
@@ -70,7 +69,6 @@ object StaxXmlParserUtils { | |
| /** | ||
| * Checks if current event points the EndElement. | ||
| */ | ||
| @tailrec | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why was this removed?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch! Adding it back |
||
| def checkEndElement(parser: XMLEventReader): Boolean = { | ||
| parser.peek match { | ||
| case _: EndElement | _: EndDocument => true | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -159,7 +159,7 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean) | |||||
| parser.peek match { | ||||||
| case _: EndElement => NullType | ||||||
| case _: StartElement => inferObject(parser) | ||||||
| case c: Characters if c.isWhiteSpace => | ||||||
| case c: Characters if isEmptyString(c) => | ||||||
|
||||||
| case c: Characters if isEmptyString(c) => | |
| case c: Characters if c.isWhiteSpace => |
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For this case, can't we return inferObject(parser)?
In inferObject(parser), the case for StructType can be updated to "unnest" StructType with just valueTag.
Without this, there is lot of code duplication logic for valueTag.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMHO inferObject can't do this. This branch handles both primitive types and nested objects. If we return inferObject(parser), the primitive types will be inferred as a structFields of valueTag
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a test case for this scenario?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A valueTag that locates after a closing tag in the inner element and before the closing tag in the outer element will cover this scenario.
<a>
value2
<b>1</b>
value3
</a>
We covered this case in the most our test cases
Outdated
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
while the case for valueTag is unlikely to change, its better to add case sensitivity logic to it to make it consistent with other fields. Can be a separate PR. Not a high prio.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for answering this question! I create a Jira ticket for it
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Won't st(index).dataType will be of ArrayType?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this branch handles this case of array type. If it's an array, we will merge the element types.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lets add this test case scenario where Array<LongType> is updated to Array<DoubleType>:
<ROW>
<a>
1
<b>2</b>
3
<b>4</b>
5.0
</a>
</ROW>
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,18 @@ | ||
| <?xml version="1.0"?> | ||
| <ROWSET> | ||
| <ROW> | ||
| value1 | ||
| <array> | ||
| value2 | ||
| <b>1</b> | ||
| value3 | ||
| </array> | ||
| <array> | ||
| value4 | ||
| <b>2</b> | ||
| value5 | ||
| <c>3</c> | ||
| value6 | ||
| </array> | ||
| </ROW> | ||
| </ROWSET> |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Lets not allow any whitespace values for
valueTag.