Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions docs/sql-data-sources-xml.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ Data source options of XML can be set via:
<tr>
<td><code>inferSchema</code></td>
<td><code>true</code></td>
<td>If true, attempts to infer an appropriate type for each resulting DataFrame column. If false, all resulting columns are of string type. Default is true. XML built-in functions ignore this option.</td>
<td>If true, attempts to infer an appropriate type for each resulting DataFrame column. If false, all resulting columns are of string type.</td>
<td>read</td>
</tr>

Expand All @@ -108,7 +108,7 @@ Data source options of XML can be set via:
<tr>
<td><code>attributePrefix</code></td>
<td><code>_</code></td>
<td>The prefix for attributes to differentiate attributes from elements. This will be the prefix for field names. Default is _. Can be empty for reading XML, but not for writing.</td>
<td>The prefix for attributes to differentiate attributes from elements. This will be the prefix for field names. Can be empty for reading XML, but not for writing.</td>
<td>read/write</td>
</tr>

Expand Down Expand Up @@ -235,5 +235,12 @@ Data source options of XML can be set via:
<td>write</td>
</tr>

<tr>
<td><code>validateName</code></td>
<td><code>true</code></td>
<td>If true, throws an error when XML element name validation fails. For example, SQL field names can have spaces, but XML element names cannot.</td>
<td>write</td>
</tr>

</table>
Other generic options can be found in <a href="https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html"> Generic File Source Options</a>.
2 changes: 2 additions & 0 deletions python/pyspark/sql/connect/readwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,6 +792,7 @@ def xml(
timestampFormat: Optional[str] = None,
compression: Optional[str] = None,
encoding: Optional[str] = None,
validateName: Optional[bool] = None,
) -> None:
self.mode(mode)
self._set_opts(
Expand All @@ -806,6 +807,7 @@ def xml(
timestampFormat=timestampFormat,
compression=compression,
encoding=encoding,
validateName=validateName,
)
self.format("xml").save(path)

Expand Down
2 changes: 2 additions & 0 deletions python/pyspark/sql/readwriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2096,6 +2096,7 @@ def xml(
timestampFormat: Optional[str] = None,
compression: Optional[str] = None,
encoding: Optional[str] = None,
validateName: Optional[bool] = None,
) -> None:
r"""Saves the content of the :class:`DataFrame` in XML format at the specified path.

Expand Down Expand Up @@ -2155,6 +2156,7 @@ def xml(
timestampFormat=timestampFormat,
compression=compression,
encoding=encoding,
validateName=validateName,
)
self._jwrite.xml(path)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class StaxXmlGenerator(
val factory = XMLOutputFactory.newInstance()
// to_xml disables structure validation to allow multiple root tags
factory.setProperty(WstxOutputProperties.P_OUTPUT_VALIDATE_STRUCTURE, validateStructure)
factory.setProperty(WstxOutputProperties.P_OUTPUT_VALIDATE_NAMES, options.validateName)
val xmlWriter = factory.createXMLStreamWriter(writer)
if (!indentDisabled) {
val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ class XmlOptions(
// setting indent to "" disables indentation in the generated XML.
// Each row will be written in a new line.
val indent = parameters.getOrElse(INDENT, DEFAULT_INDENT)
// Whether XML element names are validated on write. Read from the `validateName` option
// and defaults to true when the option is not supplied.
val validateName = getBool(VALIDATE_NAME, true)

/**
* Infer columns with all valid date entries as date type (otherwise inferred as string or
Expand Down Expand Up @@ -210,6 +211,7 @@ object XmlOptions extends DataSourceOptions {
val TIME_ZONE = newOption("timeZone")
val INDENT = newOption("indent")
val PREFERS_DECIMAL = newOption("prefersDecimal")
val VALIDATE_NAME = newOption("validateName")
// Options with alternative
val ENCODING = "encoding"
val CHARSET = "charset"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,13 @@
*/
package org.apache.spark.sql.execution.datasources.xml

import java.io.EOFException
import java.io.{EOFException, File}
import java.nio.charset.{StandardCharsets, UnsupportedCharsetException}
import java.nio.file.{Files, Path, Paths}
import java.sql.{Date, Timestamp}
import java.time.{Instant, LocalDateTime}
import java.util.TimeZone
import javax.xml.stream.XMLStreamException

import scala.collection.immutable.ArraySeq
import scala.collection.mutable
Expand Down Expand Up @@ -2828,4 +2829,55 @@ class XmlSuite
}
}
}

// Exercises the `validateName` write option. With validation left at its default, writing a
// column whose name is not a legal XML element name must fail; with `validateName` set to
// false the write must succeed and the produced file content is compared verbatim.
test("XML Validate Name") {
  val data = Seq(Row("Random String"))

  // Writes a single-column DataFrame whose column is called `fieldName`.
  //  - validateName = true: the write is expected to throw a SparkException whose
  //    cause-of-cause is an XMLStreamException and whose message contains `errorMsg`.
  //  - validateName = false: the write is expected to succeed; `errorMsg` is not used and the
  //    single XML part file is read back and checked against the expected document.
  def checkValidation(
      fieldName: String,
      errorMsg: String,
      validateName: Boolean = true): Unit = {
    val schema = StructType(Seq(StructField(fieldName, StringType)))
    val df = spark.createDataFrame(data.asJava, schema)

    withTempDir { dir =>
      val path = dir.getCanonicalPath
      if (validateName) {
        val e = intercept[SparkException] {
          df.write
            .option("rowTag", "ROW")
            .mode(SaveMode.Overwrite)
            .xml(path)
        }

        // Null-safe walk of the cause chain so that a shorter chain is reported as an
        // assertion failure instead of a NullPointerException.
        val nestedCause = Option(e.getCause).flatMap(c => Option(c.getCause))
        assert(
          nestedCause.exists(_.isInstanceOf[XMLStreamException]),
          s"expected an XMLStreamException as the nested cause, found: $nestedCause")
        assert(e.getMessage.contains(errorMsg))
      } else {
        df.write
          .option("rowTag", "ROW")
          .option("validateName", false)
          .option("declaration", "")
          .option("indent", "")
          .mode(SaveMode.Overwrite)
          .xml(path)

        // Read the file back and check its content. `listFiles()` may return null, and the
        // directory may contain no XML part file; both are reported as explicit failures.
        val xmlFile = Option(new File(path).listFiles())
          .flatMap(_.find(f => f.isFile && f.getName.endsWith("xml")))
          .getOrElse(fail(s"No XML output file found under $path"))
        val actualContent = Files.readString(xmlFile.toPath).replaceAll("\\n", "")
        val expectedContent =
          s"<${XmlOptions.DEFAULT_ROOT_TAG}><ROW>" +
            s"<$fieldName>${data.head.getString(0)}</$fieldName>" +
            s"</ROW></${XmlOptions.DEFAULT_ROOT_TAG}>"
        assert(actualContent === expectedContent)
      }
    }
  }

  checkValidation("", "Illegal to pass empty name")
  checkValidation(" ", "Illegal first name character ' '")
  checkValidation("1field", "Illegal first name character '1'")
  checkValidation("field name with space", "Illegal name character ' '")
  checkValidation("field", "", validateName = false)
}
}