-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-26108][SQL] Support custom lineSep in CSV datasource #23080
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
a790bb3
7a47990
be2870f
a058a6f
7e3c026
486b090
a0fedbb
5f013f5
65786df
49b91ea
12022ad
bb8a13b
0869b81
1f5399f
918d163
c06899f
a4c4b67
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -192,6 +192,20 @@ class CSVOptions( | |
| */ | ||
| val emptyValueInWrite = emptyValue.getOrElse("\"\"") | ||
|
|
||
| /** | ||
| * A string between two consecutive CSV records. | ||
| */ | ||
| val lineSeparator: Option[String] = parameters.get("lineSep").map { sep => | ||
| require(sep.nonEmpty, "'lineSep' cannot be an empty string.") | ||
| require(sep.length <= 2, "'lineSep' can contain 1 or 2 characters.") | ||
| sep | ||
| } | ||
|
|
||
| val lineSeparatorInRead: Option[Array[Byte]] = lineSeparator.map { lineSep => | ||
| lineSep.getBytes(charset) | ||
| } | ||
| val lineSeparatorInWrite: Option[String] = lineSeparator | ||
|
|
||
| def asWriterSettings: CsvWriterSettings = { | ||
| val writerSettings = new CsvWriterSettings() | ||
| val format = writerSettings.getFormat | ||
|
|
@@ -200,6 +214,8 @@ class CSVOptions( | |
| format.setQuoteEscape(escape) | ||
| charToEscapeQuoteEscaping.foreach(format.setCharToEscapeQuoteEscaping) | ||
| format.setComment(comment) | ||
| lineSeparatorInWrite.foreach(format.setLineSeparator) | ||
|
|
||
| writerSettings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceFlagInWrite) | ||
| writerSettings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceFlagInWrite) | ||
| writerSettings.setNullValue(nullValue) | ||
|
|
@@ -216,8 +232,13 @@ class CSVOptions( | |
| format.setDelimiter(delimiter) | ||
| format.setQuote(quote) | ||
| format.setQuoteEscape(escape) | ||
| lineSeparator.foreach {sep => | ||
| format.setLineSeparator(sep) | ||
| format.setNormalizedNewline(0x00.toChar) | ||
|
||
| } | ||
| charToEscapeQuoteEscaping.foreach(format.setCharToEscapeQuoteEscaping) | ||
| format.setComment(comment) | ||
|
|
||
| settings.setIgnoreLeadingWhitespaces(ignoreLeadingWhiteSpaceInRead) | ||
| settings.setIgnoreTrailingWhitespaces(ignoreTrailingWhiteSpaceInRead) | ||
| settings.setReadInputOnSeparateThread(false) | ||
|
|
@@ -227,7 +248,10 @@ class CSVOptions( | |
| settings.setEmptyValue(emptyValueInRead) | ||
| settings.setMaxCharsPerColumn(maxCharsPerColumn) | ||
| settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER) | ||
| settings.setLineSeparatorDetectionEnabled(multiLine == true) | ||
| settings.setLineSeparatorDetectionEnabled(lineSeparatorInRead.isEmpty && multiLine) | ||
| lineSeparatorInRead.foreach { _ => | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. nice! |
||
| settings.setNormalizeLineEndingsWithinQuotes(!multiLine) | ||
| } | ||
|
|
||
| settings | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -377,6 +377,8 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo | |
| * <li>`multiLine` (default `false`): parse one record, which may span multiple lines.</li> | ||
| * <li>`locale` (default is `en-US`): sets a locale as language tag in IETF BCP 47 format. | ||
| * For instance, this is used while parsing dates and timestamps.</li> | ||
| * <li>`lineSep` (default covers all `\r`, `\r\n` and `\n`): defines the line separator | ||
| * that should be used for parsing. Maximum length is 2 characters.</li> | ||
|
||
| * </ul> | ||
| * | ||
| * @since 2.0.0 | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@MaxGekk, this might not be a super big deal, but I believe the length should be counted after converting the separator into UTF-8.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We could say the line separator should be 1 or 2 bytes (UTF-8) in the read path, specifically when `multiLine` is enabled.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The uniVocity parser checks the number of chars (see https://github.com/uniVocity/univocity-parsers/blob/f616d151b48150bc9cb98943f9b6f8353b704359/src/main/java/com/univocity/parsers/common/Format.java#L120-L122), and those chars are in UTF-16, I guess.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hm, I see.