-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Support newlines_in_values
CSV option
#11533
Changes from 1 commit
5321e25
e05ca0e
9ca9065
34dcdb0
8c2d98d
ed0075d
356f46b
b9cc96b
4d06432
35198b6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -184,6 +184,10 @@ config_namespace! { | |||||
/// Default value for `format.has_header` for `CREATE EXTERNAL TABLE` | ||||||
/// if not specified explicitly in the statement. | ||||||
pub has_header: bool, default = false | ||||||
|
||||||
/// Default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` | ||||||
/// if not specified explicitly in the statement. | ||||||
pub newlines_in_values: bool, default = false | ||||||
} | ||||||
} | ||||||
|
||||||
|
@@ -1593,6 +1597,7 @@ config_namespace! { | |||||
pub quote: u8, default = b'"' | ||||||
pub escape: Option<u8>, default = None | ||||||
pub double_quote: Option<bool>, default = None | ||||||
pub newlines_in_values: Option<bool>, default = None | ||||||
pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED | ||||||
pub schema_infer_max_rec: usize, default = 100 | ||||||
pub date_format: Option<String>, default = None | ||||||
|
@@ -1665,6 +1670,14 @@ impl CsvOptions { | |||||
self | ||||||
} | ||||||
|
||||||
/// Set true to ensure that newlines in (quoted) values are supported. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The phrasing was a bit awkward initially as I was thinking of the flag as "ensuring support", rather than directly "supporting", since newlines in values are already supported if the file is below the minimum size for repartitioning (`repartition_file_min_size`). I'm not sure if it's worth conveying that detail through these docs, or else to document this as providing support and treat the fact that newlines in values will "just work" for smaller files as an implementation detail that might change. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've expanded and normalised the documentation. I've gone for documenting the flag as enabling support for newlines in values, with the fact that newlines in values might work without it left as an implementation-defined detail. |
||||||
/// Note that setting this may reduce performance as large file scans will not be repartitioned. | ||||||
/// - default is None | ||||||
pub fn with_newlines_in_values(mut self, newlines_in_values: bool) -> Self { | ||||||
self.newlines_in_values = Some(newlines_in_values); | ||||||
self | ||||||
} | ||||||
|
||||||
/// Set a `CompressionTypeVariant` of CSV | ||||||
/// - defaults to `CompressionTypeVariant::UNCOMPRESSED` | ||||||
pub fn with_file_compression_type( | ||||||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -233,6 +233,14 @@ impl CsvFormat { | |||||
self | ||||||
} | ||||||
|
||||||
/// Set true to ensure that newlines in (quoted) values are supported. | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
/// Note that setting this may reduce performance as large file scans will not be repartitioned. | ||||||
/// - default is None | ||||||
pub fn with_newlines_in_values(mut self, newlines_in_values: bool) -> Self { | ||||||
self.options.newlines_in_values = Some(newlines_in_values); | ||||||
self | ||||||
} | ||||||
|
||||||
/// Set a `FileCompressionType` of CSV | ||||||
/// - defaults to `FileCompressionType::UNCOMPRESSED` | ||||||
pub fn with_file_compression_type( | ||||||
|
@@ -330,6 +338,9 @@ impl FileFormat for CsvFormat { | |||||
self.options.quote, | ||||||
self.options.escape, | ||||||
self.options.comment, | ||||||
self.options | ||||||
.newlines_in_values | ||||||
.unwrap_or(state.config_options().catalog.newlines_in_values), | ||||||
self.options.compression.into(), | ||||||
); | ||||||
Ok(Arc::new(exec)) | ||||||
|
@@ -1052,6 +1063,41 @@ mod tests { | |||||
Ok(()) | ||||||
} | ||||||
|
||||||
#[rstest(n_partitions, case(1), case(2), case(3), case(4))] | ||||||
#[tokio::test] | ||||||
async fn test_csv_parallel_newlines_in_values(n_partitions: usize) -> Result<()> { | ||||||
let config = SessionConfig::new() | ||||||
.with_repartition_file_scans(true) | ||||||
.with_repartition_file_min_size(0) | ||||||
.with_target_partitions(n_partitions); | ||||||
let csv_options = CsvReadOptions::default() | ||||||
.has_header(true) | ||||||
.newlines_in_values(true); | ||||||
let ctx = SessionContext::new_with_config(config); | ||||||
let testdata = arrow_test_data(); | ||||||
ctx.register_csv( | ||||||
"aggr", | ||||||
&format!("{testdata}/csv/aggregate_test_100.csv"), | ||||||
csv_options, | ||||||
) | ||||||
.await?; | ||||||
|
||||||
let query = "select sum(c3) from aggr;"; | ||||||
let query_result = ctx.sql(query).await?.collect().await?; | ||||||
let actual_partitions = count_query_csv_partitions(&ctx, query).await?; | ||||||
|
||||||
#[rustfmt::skip] | ||||||
let expected = ["+--------------+", | ||||||
"| sum(aggr.c3) |", | ||||||
"+--------------+", | ||||||
"| 781 |", | ||||||
"+--------------+"]; | ||||||
assert_batches_eq!(expected, &query_result); | ||||||
assert_eq!(1, actual_partitions); // csv won't be scanned in parallel when newlines_in_values is set | ||||||
|
||||||
Ok(()) | ||||||
} | ||||||
|
||||||
/// Read a single empty csv file in parallel | ||||||
/// | ||||||
/// empty_0_byte.csv: | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1503,6 +1503,7 @@ mod tests { | |
b'"', | ||
None, | ||
None, | ||
false, | ||
FileCompressionType::UNCOMPRESSED, | ||
)) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We need a more descriptive comment to help users without context understand this config option. It should state that: 1. It's CSV-specific. 2. It can be overridden by the same config field in `CsvOptions`. 3. Its behavior is as stated in `CsvOptions`'s comment.
Please also update https://github.com/apache/datafusion/blob/main/docs/source/user-guide/configs.md
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've expanded the documentation. I forgot about the user guide so I'll do that now as well.