Skip to content

Commit

Permalink
catalog.has_header true by default (#11919)
Browse files Browse the repository at this point in the history
  • Loading branch information
korowa committed Aug 17, 2024
1 parent 2a16704 commit b06e8b0
Show file tree
Hide file tree
Showing 15 changed files with 71 additions and 35 deletions.
2 changes: 1 addition & 1 deletion datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ config_namespace! {

/// Default value for `format.has_header` for `CREATE EXTERNAL TABLE`
/// if not specified explicitly in the statement.
pub has_header: bool, default = false
pub has_header: bool, default = true

/// Specifies whether newlines in (quoted) CSV values are supported.
///
Expand Down
2 changes: 1 addition & 1 deletion datafusion/common/src/file_options/csv_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions {

fn try_from(value: &CsvOptions) -> Result<Self> {
let mut builder = WriterBuilder::default()
.with_header(value.has_header.unwrap_or(false))
.with_header(value.has_header.unwrap_or(true))
.with_quote(value.quote)
.with_delimiter(value.delimiter);

Expand Down
23 changes: 21 additions & 2 deletions datafusion/core/src/datasource/file_format/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -369,15 +369,34 @@ impl FileFormat for CsvFormat {
async fn create_writer_physical_plan(
&self,
input: Arc<dyn ExecutionPlan>,
_state: &SessionState,
state: &SessionState,
conf: FileSinkConfig,
order_requirements: Option<Vec<PhysicalSortRequirement>>,
) -> Result<Arc<dyn ExecutionPlan>> {
if conf.overwrite {
return not_impl_err!("Overwrites are not implemented yet for CSV");
}

let writer_options = CsvWriterOptions::try_from(&self.options)?;
// The `has_header` and `newlines_in_values` fields of `CsvOptions` may inherit
// their values from the session configuration settings. To support
// this logic, writer options are built from a copy of `self.options`
// with updated values of these special fields.
let has_header = self
.options()
.has_header
.unwrap_or(state.config_options().catalog.has_header);
let newlines_in_values = self
.options()
.newlines_in_values
.unwrap_or(state.config_options().catalog.newlines_in_values);

let options = self
.options()
.clone()
.with_has_header(has_header)
.with_newlines_in_values(newlines_in_values);

let writer_options = CsvWriterOptions::try_from(&options)?;

let sink_schema = conf.output_schema().clone();
let sink = Arc::new(CsvSink::new(conf, writer_options));
Expand Down
13 changes: 10 additions & 3 deletions datafusion/core/tests/user_defined/user_defined_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,11 @@ async fn exec_sql(ctx: &SessionContext, sql: &str) -> Result<String> {

/// Create a test table.
async fn setup_table(ctx: SessionContext) -> Result<SessionContext> {
let sql = "CREATE EXTERNAL TABLE sales(customer_id VARCHAR, revenue BIGINT) STORED AS CSV location 'tests/data/customer.csv'";
let sql = "
CREATE EXTERNAL TABLE sales(customer_id VARCHAR, revenue BIGINT)
STORED AS CSV location 'tests/data/customer.csv'
OPTIONS('format.has_header' 'false')
";

let expected = vec!["++", "++"];

Expand All @@ -125,8 +129,11 @@ async fn setup_table(ctx: SessionContext) -> Result<SessionContext> {
}

async fn setup_table_without_schemas(ctx: SessionContext) -> Result<SessionContext> {
let sql =
"CREATE EXTERNAL TABLE sales STORED AS CSV location 'tests/data/customer.csv'";
let sql = "
CREATE EXTERNAL TABLE sales
STORED AS CSV location 'tests/data/customer.csv'
OPTIONS('format.has_header' 'false')
";

let expected = vec!["++", "++"];

Expand Down
6 changes: 3 additions & 3 deletions datafusion/sqllogictest/test_files/copy.slt
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ COPY source_table to 'test_files/scratch/copy/table_csv' STORED AS CSV OPTIONS

# validate folder of csv files
statement ok
CREATE EXTERNAL TABLE validate_csv STORED AS csv LOCATION 'test_files/scratch/copy/table_csv' OPTIONS ('format.compression' 'gzip');
CREATE EXTERNAL TABLE validate_csv STORED AS csv LOCATION 'test_files/scratch/copy/table_csv' OPTIONS ('format.has_header' false, 'format.compression' gzip);

query IT
select * from validate_csv;
Expand All @@ -427,7 +427,7 @@ select * from validate_csv;

# Copy from table to single csv
query I
COPY source_table to 'test_files/scratch/copy/table.csv';
COPY source_table to 'test_files/scratch/copy/table.csv' OPTIONS ('format.has_header' false);
----
2

Expand Down Expand Up @@ -478,7 +478,7 @@ query I
COPY source_table
to 'test_files/scratch/copy/table_csv_with_options'
STORED AS CSV OPTIONS (
'format.has_header' false,
'format.has_header' true,
'format.compression' uncompressed,
'format.datetime_format' '%FT%H:%M:%S.%9f',
'format.delimiter' ';',
Expand Down
11 changes: 6 additions & 5 deletions datafusion/sqllogictest/test_files/csv_files.slt
Original file line number Diff line number Diff line change
Expand Up @@ -117,14 +117,14 @@ CREATE TABLE src_table_2 (

query I
COPY src_table_1 TO 'test_files/scratch/csv_files/csv_partitions/1.csv'
STORED AS CSV;
STORED AS CSV OPTIONS ('format.has_header' 'false');
----
4


query I
COPY src_table_2 TO 'test_files/scratch/csv_files/csv_partitions/2.csv'
STORED AS CSV;
STORED AS CSV OPTIONS ('format.has_header' 'false');
----
4

Expand Down Expand Up @@ -210,7 +210,7 @@ COPY (VALUES
('#second line is a comment'),
('2,3'))
TO 'test_files/scratch/csv_files/file_with_comments.csv'
OPTIONS ('format.delimiter' '|');
OPTIONS ('format.delimiter' '|', 'format.has_header' 'false');

statement ok
CREATE EXTERNAL TABLE stored_table_with_comments (
Expand All @@ -219,7 +219,8 @@ CREATE EXTERNAL TABLE stored_table_with_comments (
) STORED AS CSV
LOCATION 'test_files/scratch/csv_files/file_with_comments.csv'
OPTIONS ('format.comment' '#',
'format.delimiter' ',');
'format.delimiter' ',',
'format.has_header' 'false');

query TT
SELECT * from stored_table_with_comments;
Expand Down Expand Up @@ -315,7 +316,7 @@ col1 TEXT,
col2 TEXT
) STORED AS CSV
LOCATION '../core/tests/data/newlines_in_values.csv'
OPTIONS ('format.newlines_in_values' 'true');
OPTIONS ('format.newlines_in_values' 'true', 'format.has_header' 'false');

query TT
select * from stored_table_with_newlines_in_values_safe;
Expand Down
7 changes: 5 additions & 2 deletions datafusion/sqllogictest/test_files/ddl.slt
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,9 @@ statement ok
CREATE EXTERNAL TABLE csv_with_timestamps (
name VARCHAR,
ts TIMESTAMP
) STORED AS CSV LOCATION '../core/tests/data/timestamps.csv';
) STORED AS CSV
LOCATION '../core/tests/data/timestamps.csv'
OPTIONS('format.has_header' 'false');

query TP
SELECT * from csv_with_timestamps
Expand All @@ -496,7 +498,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps (
)
STORED AS CSV
PARTITIONED BY (c_date)
LOCATION '../core/tests/data/partitioned_table';
LOCATION '../core/tests/data/partitioned_table'
OPTIONS('format.has_header' 'false');

query TPD
SELECT * from csv_with_timestamps where c_date='2018-11-13'
Expand Down
3 changes: 2 additions & 1 deletion datafusion/sqllogictest/test_files/group_by.slt
Original file line number Diff line number Diff line change
Expand Up @@ -4264,7 +4264,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps (
)
STORED AS CSV
WITH ORDER (ts DESC)
LOCATION '../core/tests/data/timestamps.csv';
LOCATION '../core/tests/data/timestamps.csv'
OPTIONS('format.has_header' 'false');

# The query below should run, since it operates on a bounded source and has a sort
# at the top of its plan.
Expand Down
4 changes: 2 additions & 2 deletions datafusion/sqllogictest/test_files/information_schema.slt
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ datafusion.catalog.create_default_catalog_and_schema true
datafusion.catalog.default_catalog datafusion
datafusion.catalog.default_schema public
datafusion.catalog.format NULL
datafusion.catalog.has_header false
datafusion.catalog.has_header true
datafusion.catalog.information_schema true
datafusion.catalog.location NULL
datafusion.catalog.newlines_in_values false
Expand Down Expand Up @@ -255,7 +255,7 @@ datafusion.catalog.create_default_catalog_and_schema true Whether the default ca
datafusion.catalog.default_catalog datafusion The default catalog name - this impacts what SQL queries use if not specified
datafusion.catalog.default_schema public The default schema name - this impacts what SQL queries use if not specified
datafusion.catalog.format NULL Type of `TableProvider` to use when loading `default` schema
datafusion.catalog.has_header false Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement.
datafusion.catalog.has_header true Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement.
datafusion.catalog.information_schema true Should DataFusion provide access to `information_schema` virtual tables for displaying schema information
datafusion.catalog.location NULL Location scanned to load tables for `default` schema
datafusion.catalog.newlines_in_values false Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance.
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/limit.slt
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ drop table aggregate_test_100;
query I
COPY (select * from (values
(1, 'a'), (2, 'b'), (3, 'c'), (4, 'd'), (5, 'e')
)) TO 'test_files/scratch/limit/data.csv' STORED AS CSV;
)) TO 'test_files/scratch/limit/data.csv' STORED AS CSV OPTIONS ('format.has_header' 'false');
----
5

Expand Down
6 changes: 4 additions & 2 deletions datafusion/sqllogictest/test_files/order.slt
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,8 @@ NULL three

statement ok
CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean)
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv';
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'
OPTIONS('format.has_header' 'false');

# Demonstrate types
query TTT
Expand Down Expand Up @@ -463,7 +464,8 @@ CREATE EXTERNAL TABLE csv_with_timestamps (
)
STORED AS CSV
WITH ORDER (ts ASC NULLS LAST)
LOCATION '../core/tests/data/timestamps.csv';
LOCATION '../core/tests/data/timestamps.csv'
OPTIONS('format.has_header' 'false');

query TT
EXPLAIN SELECT DATE_BIN(INTERVAL '15 minutes', ts, TIMESTAMP '2022-08-03 14:40:00Z') as db15
Expand Down
6 changes: 4 additions & 2 deletions datafusion/sqllogictest/test_files/projection.slt
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,13 @@ CREATE TABLE cpu_load_short(host STRING NOT NULL) AS VALUES

statement ok
CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean)
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv';
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'
OPTIONS('format.has_header' 'false');

statement ok
CREATE EXTERNAL TABLE test_simple (c1 int, c2 bigint, c3 boolean)
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv/partition-0.csv';
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv/partition-0.csv'
OPTIONS('format.has_header' 'false');

# projection same fields
query I rowsort
Expand Down
16 changes: 8 additions & 8 deletions datafusion/sqllogictest/test_files/tpch/create_tables.slt.part
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS supplier (
s_acctbal DECIMAL(15, 2),
s_comment VARCHAR,
s_rev VARCHAR,
) STORED AS CSV LOCATION 'test_files/tpch/data/supplier.tbl' OPTIONS ('format.delimiter' '|');
) STORED AS CSV LOCATION 'test_files/tpch/data/supplier.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false');

statement ok
CREATE EXTERNAL TABLE IF NOT EXISTS part (
Expand All @@ -45,7 +45,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS part (
p_retailprice DECIMAL(15, 2),
p_comment VARCHAR,
p_rev VARCHAR,
) STORED AS CSV LOCATION 'test_files/tpch/data/part.tbl' OPTIONS ('format.delimiter' '|');
) STORED AS CSV LOCATION 'test_files/tpch/data/part.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false');


statement ok
Expand All @@ -56,7 +56,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS partsupp (
ps_supplycost DECIMAL(15, 2),
ps_comment VARCHAR,
ps_rev VARCHAR,
) STORED AS CSV LOCATION 'test_files/tpch/data/partsupp.tbl' OPTIONS ('format.delimiter' '|');
) STORED AS CSV LOCATION 'test_files/tpch/data/partsupp.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false');

statement ok
CREATE EXTERNAL TABLE IF NOT EXISTS customer (
Expand All @@ -69,7 +69,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS customer (
c_mktsegment VARCHAR,
c_comment VARCHAR,
c_rev VARCHAR,
) STORED AS CSV LOCATION 'test_files/tpch/data/customer.tbl' OPTIONS ('format.delimiter' '|');
) STORED AS CSV LOCATION 'test_files/tpch/data/customer.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false');

statement ok
CREATE EXTERNAL TABLE IF NOT EXISTS orders (
Expand All @@ -83,7 +83,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS orders (
o_shippriority INTEGER,
o_comment VARCHAR,
o_rev VARCHAR,
) STORED AS CSV LOCATION 'test_files/tpch/data/orders.tbl' OPTIONS ('format.delimiter' '|');
) STORED AS CSV LOCATION 'test_files/tpch/data/orders.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false');

statement ok
CREATE EXTERNAL TABLE IF NOT EXISTS lineitem (
Expand All @@ -104,7 +104,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem (
l_shipmode VARCHAR,
l_comment VARCHAR,
l_rev VARCHAR,
) STORED AS CSV LOCATION 'test_files/tpch/data/lineitem.tbl' OPTIONS ('format.delimiter' '|');
) STORED AS CSV LOCATION 'test_files/tpch/data/lineitem.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false');

statement ok
CREATE EXTERNAL TABLE IF NOT EXISTS nation (
Expand All @@ -113,12 +113,12 @@ CREATE EXTERNAL TABLE IF NOT EXISTS nation (
n_regionkey BIGINT,
n_comment VARCHAR,
n_rev VARCHAR,
) STORED AS CSV LOCATION 'test_files/tpch/data/nation.tbl' OPTIONS ('format.delimiter' '|');
) STORED AS CSV LOCATION 'test_files/tpch/data/nation.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false');

statement ok
CREATE EXTERNAL TABLE IF NOT EXISTS region (
r_regionkey BIGINT,
r_name VARCHAR,
r_comment VARCHAR,
r_rev VARCHAR,
) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|');
) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|', 'format.has_header' 'false');
3 changes: 2 additions & 1 deletion datafusion/sqllogictest/test_files/window.slt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ OPTIONS ('format.has_header' 'true');
### execute_with_partition with 4 partitions
statement ok
CREATE EXTERNAL TABLE test (c1 int, c2 bigint, c3 boolean)
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv';
STORED AS CSV LOCATION '../core/tests/data/partitioned_csv'
OPTIONS('format.has_header' 'false');


# for window functions without ORDER BY, the first, last, and nth function calls do not make sense
Expand Down
2 changes: 1 addition & 1 deletion docs/source/user-guide/configs.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
| datafusion.catalog.information_schema | false | Should DataFusion provide access to `information_schema` virtual tables for displaying schema information |
| datafusion.catalog.location | NULL | Location scanned to load tables for `default` schema |
| datafusion.catalog.format | NULL | Type of `TableProvider` to use when loading `default` schema |
| datafusion.catalog.has_header | false | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. |
| datafusion.catalog.has_header | true | Default value for `format.has_header` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. |
| datafusion.catalog.newlines_in_values | false | Specifies whether newlines in (quoted) CSV values are supported. This is the default value for `format.newlines_in_values` for `CREATE EXTERNAL TABLE` if not specified explicitly in the statement. Parsing newlines in quoted values may be affected by execution behaviour such as parallel file scanning. Setting this to `true` ensures that newlines in values are parsed successfully, which may reduce performance. |
| datafusion.execution.batch_size | 8192 | Default batch size while creating new batches, it's especially useful for buffer-in-memory batches since creating tiny batches would result in too much metadata memory consumption |
| datafusion.execution.coalesce_batches | true | When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. The target batch size is determined by the configuration setting |
Expand Down

0 comments on commit b06e8b0

Please sign in to comment.