From b06e8b06b16825cb88050c76cc1536b40ffd466f Mon Sep 17 00:00:00 2001 From: Eduard Karacharov Date: Sat, 17 Aug 2024 10:40:24 +0300 Subject: [PATCH] catalog.has_header true by default (#11919) --- datafusion/common/src/config.rs | 2 +- .../common/src/file_options/csv_writer.rs | 2 +- .../core/src/datasource/file_format/csv.rs | 23 +++++++++++++++++-- .../tests/user_defined/user_defined_plan.rs | 13 ++++++++--- datafusion/sqllogictest/test_files/copy.slt | 6 ++--- .../sqllogictest/test_files/csv_files.slt | 11 +++++---- datafusion/sqllogictest/test_files/ddl.slt | 7 ++++-- .../sqllogictest/test_files/group_by.slt | 3 ++- .../test_files/information_schema.slt | 4 ++-- datafusion/sqllogictest/test_files/limit.slt | 2 +- datafusion/sqllogictest/test_files/order.slt | 6 +++-- .../sqllogictest/test_files/projection.slt | 6 +++-- .../test_files/tpch/create_tables.slt.part | 16 ++++++------- datafusion/sqllogictest/test_files/window.slt | 3 ++- docs/source/user-guide/configs.md | 2 +- 15 files changed, 71 insertions(+), 35 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index c48845c061e7..37d26c6f00c4 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -183,7 +183,7 @@ config_namespace! { /// Default value for `format.has_header` for `CREATE EXTERNAL TABLE` /// if not specified explicitly in the statement. - pub has_header: bool, default = false + pub has_header: bool, default = true /// Specifies whether newlines in (quoted) CSV values are supported. 
/// diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs index ae069079a68f..943288af9164 100644 --- a/datafusion/common/src/file_options/csv_writer.rs +++ b/datafusion/common/src/file_options/csv_writer.rs @@ -50,7 +50,7 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions { fn try_from(value: &CsvOptions) -> Result { let mut builder = WriterBuilder::default() - .with_header(value.has_header.unwrap_or(false)) + .with_header(value.has_header.unwrap_or(true)) .with_quote(value.quote) .with_delimiter(value.delimiter); diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index c55f678aef0f..24d55ea54068 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -369,7 +369,7 @@ impl FileFormat for CsvFormat { async fn create_writer_physical_plan( &self, input: Arc, - _state: &SessionState, + state: &SessionState, conf: FileSinkConfig, order_requirements: Option>, ) -> Result> { @@ -377,7 +377,26 @@ impl FileFormat for CsvFormat { return not_impl_err!("Overwrites are not implemented yet for CSV"); } - let writer_options = CsvWriterOptions::try_from(&self.options)?; + // `has_header` and `newlines_in_values` fields of CsvOptions may inherit + // their values from the session's configuration settings. To support + // this logic, writer options are built from a copy of `self.options` + // with updated values of these special fields. 
+ let has_header = self + .options() + .has_header + .unwrap_or(state.config_options().catalog.has_header); + let newlines_in_values = self + .options() + .newlines_in_values + .unwrap_or(state.config_options().catalog.newlines_in_values); + + let options = self + .options() + .clone() + .with_has_header(has_header) + .with_newlines_in_values(newlines_in_values); + + let writer_options = CsvWriterOptions::try_from(&options)?; let sink_schema = conf.output_schema().clone(); let sink = Arc::new(CsvSink::new(conf, writer_options)); diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index 1aa33fc75e5d..62ba113da0d3 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -113,7 +113,11 @@ async fn exec_sql(ctx: &SessionContext, sql: &str) -> Result { /// Create a test table. async fn setup_table(ctx: SessionContext) -> Result { - let sql = "CREATE EXTERNAL TABLE sales(customer_id VARCHAR, revenue BIGINT) STORED AS CSV location 'tests/data/customer.csv'"; + let sql = " + CREATE EXTERNAL TABLE sales(customer_id VARCHAR, revenue BIGINT) + STORED AS CSV location 'tests/data/customer.csv' + OPTIONS('format.has_header' 'false') + "; let expected = vec!["++", "++"]; @@ -125,8 +129,11 @@ async fn setup_table(ctx: SessionContext) -> Result { } async fn setup_table_without_schemas(ctx: SessionContext) -> Result { - let sql = - "CREATE EXTERNAL TABLE sales STORED AS CSV location 'tests/data/customer.csv'"; + let sql = " + CREATE EXTERNAL TABLE sales + STORED AS CSV location 'tests/data/customer.csv' + OPTIONS('format.has_header' 'false') + "; let expected = vec!["++", "++"]; diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index ebb3ca2173b8..d2a3a214d71e 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -417,7 +417,7 
@@ COPY source_table to 'test_files/scratch/copy/table_csv' STORED AS CSV OPTIONS # validate folder of csv files statement ok -CREATE EXTERNAL TABLE validate_csv STORED AS csv LOCATION 'test_files/scratch/copy/table_csv' OPTIONS ('format.compression' 'gzip'); +CREATE EXTERNAL TABLE validate_csv STORED AS csv LOCATION 'test_files/scratch/copy/table_csv' OPTIONS ('format.has_header' false, 'format.compression' gzip); query IT select * from validate_csv; @@ -427,7 +427,7 @@ select * from validate_csv; # Copy from table to single csv query I -COPY source_table to 'test_files/scratch/copy/table.csv'; +COPY source_table to 'test_files/scratch/copy/table.csv' OPTIONS ('format.has_header' false); ---- 2 @@ -478,7 +478,7 @@ query I COPY source_table to 'test_files/scratch/copy/table_csv_with_options' STORED AS CSV OPTIONS ( -'format.has_header' false, +'format.has_header' true, 'format.compression' uncompressed, 'format.datetime_format' '%FT%H:%M:%S.%9f', 'format.delimiter' ';', diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index 3fb9a6f20c24..7cb21abdba10 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -117,14 +117,14 @@ CREATE TABLE src_table_2 ( query I COPY src_table_1 TO 'test_files/scratch/csv_files/csv_partitions/1.csv' -STORED AS CSV; +STORED AS CSV OPTIONS ('format.has_header' 'false'); ---- 4 query I COPY src_table_2 TO 'test_files/scratch/csv_files/csv_partitions/2.csv' -STORED AS CSV; +STORED AS CSV OPTIONS ('format.has_header' 'false'); ---- 4 @@ -210,7 +210,7 @@ COPY (VALUES ('#second line is a comment'), ('2,3')) TO 'test_files/scratch/csv_files/file_with_comments.csv' -OPTIONS ('format.delimiter' '|'); +OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE stored_table_with_comments ( @@ -219,7 +219,8 @@ CREATE EXTERNAL TABLE stored_table_with_comments ( ) STORED AS CSV LOCATION 
'test_files/scratch/csv_files/file_with_comments.csv' OPTIONS ('format.comment' '#', - 'format.delimiter' ','); + 'format.delimiter' ',', + 'format.has_header' 'false'); query TT SELECT * from stored_table_with_comments; @@ -315,7 +316,7 @@ col1 TEXT, col2 TEXT ) STORED AS CSV LOCATION '../core/tests/data/newlines_in_values.csv' -OPTIONS ('format.newlines_in_values' 'true'); +OPTIONS ('format.newlines_in_values' 'true', 'format.has_header' 'false'); query TT select * from stored_table_with_newlines_in_values_safe; diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index a35e688479e7..7164425fc0f5 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -470,7 +470,9 @@ statement ok CREATE EXTERNAL TABLE csv_with_timestamps ( name VARCHAR, ts TIMESTAMP -) STORED AS CSV LOCATION '../core/tests/data/timestamps.csv'; +) STORED AS CSV +LOCATION '../core/tests/data/timestamps.csv' +OPTIONS('format.has_header' 'false'); query TP SELECT * from csv_with_timestamps @@ -496,7 +498,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps ( ) STORED AS CSV PARTITIONED BY (c_date) -LOCATION '../core/tests/data/partitioned_table'; +LOCATION '../core/tests/data/partitioned_table' +OPTIONS('format.has_header' 'false'); query TPD SELECT * from csv_with_timestamps where c_date='2018-11-13' diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 5571315e2acc..3d78bd06c30b 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4264,7 +4264,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps ( ) STORED AS CSV WITH ORDER (ts DESC) -LOCATION '../core/tests/data/timestamps.csv'; +LOCATION '../core/tests/data/timestamps.csv' +OPTIONS('format.has_header' 'false'); # below query should run since it operates on a bounded source and have a sort # at the top of its plan. 
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index ff793a72fd8a..efd017a90bc4 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -165,7 +165,7 @@ datafusion.catalog.create_default_catalog_and_schema true datafusion.catalog.default_catalog datafusion datafusion.catalog.default_schema public datafusion.catalog.format NULL -datafusion.catalog.has_header false +datafusion.catalog.has_header true datafusion.catalog.information_schema true datafusion.catalog.location NULL datafusion.catalog.newlines_in_values false @@ -255,7 +255,7 @@ datafusion.catalog.create_default_catalog_and_schema true Whether the default ca datafusion.catalog.default_catalog datafusion The default catalog name - this impacts what SQL queries use if not specified datafusion.catalog.default_schema public The default schema name - this impacts what SQL queries use if not specified datafusion.catalog.format NULL Type of `TableProvider` to use when loading `default` schema -datafusion.catalog.has_header false Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. +datafusion.catalog.has_header true Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. datafusion.catalog.information_schema true Should DataFusion provide access to `information_schema` virtual tables for displaying schema information datafusion.catalog.location NULL Location scanned to load tables for `default` schema datafusion.catalog.newlines_in_values false Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. 
Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index 439df7fede51..7341a9d43bac 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -521,7 +521,7 @@ drop table aggregate_test_100; query I COPY (select * from (values (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e') -)) TO 'test_files/scratch/limit/data.csv' STORED AS CSV; +)) TO 'test_files/scratch/limit/data.csv' STORED AS CSV OPTIONS ('format.has_header' 'false'); ---- 5 diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index 569602166b38..f0151417e555 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -98,7 +98,8 @@ NULL three statement ok CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean) -STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'; +STORED AS CSV LOCATION '../core/tests/data/partitioned_csv' +OPTIONS('format.has_header' 'false'); # Demonstrate types query TTT @@ -463,7 +464,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps ( ) STORED AS CSV WITH ORDER (ts ASC NULLS LAST) -LOCATION '../core/tests/data/timestamps.csv'; +LOCATION '../core/tests/data/timestamps.csv' +OPTIONS('format.has_header' 'false'); query TT EXPLAIN SELECT DATE_BIN(INTERVAL '15 minutes', ts, TIMESTAMP '2022-08-03 14:40:00Z') as db15 diff --git a/datafusion/sqllogictest/test_files/projection.slt b/datafusion/sqllogictest/test_files/projection.slt index 3c8855e34712..b5bcb5b4c6f7 100644 --- a/datafusion/sqllogictest/test_files/projection.slt +++ b/datafusion/sqllogictest/test_files/projection.slt @@ -64,11 +64,13 @@ CREATE TABLE cpu_load_short(host STRING NOT NULL) AS VALUES 
statement ok CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean) -STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'; +STORED AS CSV LOCATION '../core/tests/data/partitioned_csv' +OPTIONS('format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE test_simple (c1 int, c2 bigint, c3 boolean) -STORED AS CSV LOCATION '../core/tests/data/partitioned_csv/partition-0.csv'; +STORED AS CSV LOCATION '../core/tests/data/partitioned_csv/partition-0.csv' +OPTIONS('format.has_header' 'false'); # projection same fields query I rowsort diff --git a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part index 75bcbc198bef..d6249cb57990 100644 --- a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part @@ -31,7 +31,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS supplier ( s_acctbal DECIMAL(15, 2), s_comment VARCHAR, s_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/supplier.tbl' OPTIONS ('format.delimiter' '|'); +) STORED AS CSV LOCATION 'test_files/tpch/data/supplier.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS part ( @@ -45,7 +45,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS part ( p_retailprice DECIMAL(15, 2), p_comment VARCHAR, p_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/part.tbl' OPTIONS ('format.delimiter' '|'); +) STORED AS CSV LOCATION 'test_files/tpch/data/part.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok @@ -56,7 +56,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS partsupp ( ps_supplycost DECIMAL(15, 2), ps_comment VARCHAR, ps_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/partsupp.tbl' OPTIONS ('format.delimiter' '|'); +) STORED AS CSV LOCATION 'test_files/tpch/data/partsupp.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE 
EXTERNAL TABLE IF NOT EXISTS customer ( @@ -69,7 +69,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS customer ( c_mktsegment VARCHAR, c_comment VARCHAR, c_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/customer.tbl' OPTIONS ('format.delimiter' '|'); +) STORED AS CSV LOCATION 'test_files/tpch/data/customer.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS orders ( @@ -83,7 +83,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS orders ( o_shippriority INTEGER, o_comment VARCHAR, o_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/orders.tbl' OPTIONS ('format.delimiter' '|'); +) STORED AS CSV LOCATION 'test_files/tpch/data/orders.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( @@ -104,7 +104,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( l_shipmode VARCHAR, l_comment VARCHAR, l_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/lineitem.tbl' OPTIONS ('format.delimiter' '|'); +) STORED AS CSV LOCATION 'test_files/tpch/data/lineitem.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS nation ( @@ -113,7 +113,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS nation ( n_regionkey BIGINT, n_comment VARCHAR, n_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/nation.tbl' OPTIONS ('format.delimiter' '|'); +) STORED AS CSV LOCATION 'test_files/tpch/data/nation.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS region ( @@ -121,4 +121,4 @@ CREATE EXTERNAL TABLE IF NOT EXISTS region ( r_name VARCHAR, r_comment VARCHAR, r_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|'); +) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false'); diff --git 
a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index ddf6a7aabffc..f56ac414a302 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -49,7 +49,8 @@ OPTIONS ('format.has_header' 'true'); ### execute_with_partition with 4 partitions statement ok CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean) -STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'; +STORED AS CSV LOCATION '../core/tests/data/partitioned_csv' +OPTIONS('format.has_header' 'false'); # for window functions without order by the first, last, and nth function call does not make sense diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 6f315f539b11..4255307781b6 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -43,7 +43,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information | | datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema | | datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema | -| datafusion.catalog.has_header | false | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. | +| datafusion.catalog.has_header | true | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. | | datafusion.catalog.newlines_in_values | false | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. 
Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. | | datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption | | datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting |