-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-23724][SPARK-23765][SQL] Line separator for the json datasource #20885
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a794988
dccdaa2
d0abab7
6741796
e4faae1
01f4ef5
24cedb9
d40dda2
ad6496c
358863d
7e5be5e
d138d2d
c26ef5d
5f0b069
ef8248f
2efac08
b2020fa
f99c1e1
6d13d00
77112ef
d632706
bbff402
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -176,7 +176,7 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, | |
| allowComments=None, allowUnquotedFieldNames=None, allowSingleQuotes=None, | ||
| allowNumericLeadingZero=None, allowBackslashEscapingAnyCharacter=None, | ||
| mode=None, columnNameOfCorruptRecord=None, dateFormat=None, timestampFormat=None, | ||
| multiLine=None, allowUnquotedControlChars=None): | ||
| multiLine=None, allowUnquotedControlChars=None, lineSep=None): | ||
| """ | ||
| Loads JSON files and returns the results as a :class:`DataFrame`. | ||
|
|
||
|
|
@@ -237,6 +237,8 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, | |
| :param allowUnquotedControlChars: allows JSON Strings to contain unquoted control | ||
| characters (ASCII characters with value less than 32, | ||
| including tab and line feed characters) or not. | ||
| :param lineSep: defines the line separator that should be used for parsing. If None is | ||
| set, it covers all ``\\r``, ``\\r\\n`` and ``\\n``. | ||
|
|
||
| >>> df1 = spark.read.json('python/test_support/sql/people.json') | ||
| >>> df1.dtypes | ||
|
|
@@ -254,7 +256,7 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None, | |
| allowBackslashEscapingAnyCharacter=allowBackslashEscapingAnyCharacter, | ||
| mode=mode, columnNameOfCorruptRecord=columnNameOfCorruptRecord, dateFormat=dateFormat, | ||
| timestampFormat=timestampFormat, multiLine=multiLine, | ||
| allowUnquotedControlChars=allowUnquotedControlChars) | ||
| allowUnquotedControlChars=allowUnquotedControlChars, lineSep=lineSep) | ||
| if isinstance(path, basestring): | ||
| path = [path] | ||
| if type(path) == list: | ||
|
|
@@ -746,7 +748,8 @@ def saveAsTable(self, name, format=None, mode=None, partitionBy=None, **options) | |
| self._jwrite.saveAsTable(name) | ||
|
|
||
| @since(1.4) | ||
| def json(self, path, mode=None, compression=None, dateFormat=None, timestampFormat=None): | ||
| def json(self, path, mode=None, compression=None, dateFormat=None, timestampFormat=None, | ||
| lineSep=None): | ||
| """Saves the content of the :class:`DataFrame` in JSON format | ||
| (`JSON Lines text format or newline-delimited JSON <http://jsonlines.org/>`_) at the | ||
| specified path. | ||
|
|
@@ -770,12 +773,15 @@ def json(self, path, mode=None, compression=None, dateFormat=None, timestampForm | |
| formats follow the formats at ``java.text.SimpleDateFormat``. | ||
| This applies to timestamp type. If None is set, it uses the | ||
| default value, ``yyyy-MM-dd'T'HH:mm:ss.SSSXXX``. | ||
| :param lineSep: defines the line separator that should be used for writing. If None is | ||
| set, it uses the default value, ``\\n``. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is a method of DataFrameWriter. It writes exactly |
||
|
|
||
| >>> df.write.json(os.path.join(tempfile.mkdtemp(), 'data')) | ||
| """ | ||
| self.mode(mode) | ||
| self._set_opts( | ||
| compression=compression, dateFormat=dateFormat, timestampFormat=timestampFormat) | ||
| compression=compression, dateFormat=dateFormat, timestampFormat=timestampFormat, | ||
| lineSep=lineSep) | ||
| self._jwrite.json(path) | ||
|
|
||
| @since(1.4) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -85,6 +85,38 @@ private[sql] class JSONOptions( | |
|
|
||
| val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false) | ||
|
|
||
| val charset: Option[String] = Some("UTF-8") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It sounds like we need to review #20849 first |
||
|
|
||
| /** | ||
| * A sequence of bytes between two consecutive json records. Format of the option is: | ||
| * selector (1 char) + delimiter body (any length) | sequence of chars | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm afraid of defining our own rule here, is there any standard we can follow? |
||
| * The following selectors are supported: | ||
| * - 'x' + sequence of bytes in hexadecimal format. For example: "x0a 0d". | ||
| * Hex pairs can be separated by any chars different from 0-9,A-F,a-f | ||
| * - '\' - reserved for a sequence of control chars like "\r\n" | ||
| * and unicode escape like "\u000D\u000A" | ||
| * - 'r' and '/' - reserved for future use | ||
| * | ||
| * Note: the option defines the delimiter for both the json reader and the json writer. | ||
| * If the option is not set, the json writer uses '\n' as the delimiter of output records | ||
| * (it is converted to sequence of bytes according to charset, see lineSeparatorInWrite) | ||
| */ | ||
| val lineSeparator: Option[Array[Byte]] = parameters.get("lineSep").collect { | ||
| case hexs if hexs.startsWith("x") => | ||
| hexs.replaceAll("[^0-9A-Fa-f]", "").sliding(2, 2).toArray | ||
| .map(Integer.parseInt(_, 16).toByte) | ||
| case reserved if reserved.startsWith("r") || reserved.startsWith("/") => | ||
| throw new NotImplementedError(s"the $reserved selector has not supported yet") | ||
| case delim => delim.getBytes(charset.getOrElse( | ||
| throw new IllegalArgumentException("Please, set the charset option for the delimiter"))) | ||
| } | ||
| val lineSeparatorInRead: Option[Array[Byte]] = lineSeparator | ||
|
|
||
| // Note that JSON uses writer with UTF-8 charset. This string will be written out as UTF-8. | ||
| val lineSeparatorInWrite: String = { | ||
| lineSeparator.map(new String(_, charset.getOrElse("UTF-8"))).getOrElse("\n") | ||
| } | ||
|
|
||
| /** Sets config options on a Jackson [[JsonFactory]]. */ | ||
| def setJacksonOptions(factory: JsonFactory): Unit = { | ||
| factory.configure(JsonParser.Feature.ALLOW_COMMENTS, allowComments) | ||
|
|
@@ -96,4 +128,10 @@ private[sql] class JSONOptions( | |
| allowBackslashEscapingAnyCharacter) | ||
| factory.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, allowUnquotedControlChars) | ||
| } | ||
|
|
||
| def getTextOptions: Map[String, String] = { | ||
| lineSeparatorInRead.map{ bytes => | ||
| "lineSep" -> bytes.map("x%02x".format(_)).mkString | ||
| }.toMap | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
rename it to
recordDelimiter