diff --git a/Cargo.toml b/Cargo.toml index 877cead93673..c155e475a026 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,22 +69,22 @@ version = "41.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -arrow = { version = "52.2.0", features = [ +arrow = { version = "53.0.0", features = [ "prettyprint", ] } -arrow-array = { version = "52.2.0", default-features = false, features = [ +arrow-array = { version = "53.0.0", default-features = false, features = [ "chrono-tz", ] } -arrow-buffer = { version = "52.2.0", default-features = false } -arrow-flight = { version = "52.2.0", features = [ +arrow-buffer = { version = "53.0.0", default-features = false } +arrow-flight = { version = "53.0.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "52.2.0", default-features = false, features = [ +arrow-ipc = { version = "53.0.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "52.2.0", default-features = false } -arrow-schema = { version = "52.2.0", default-features = false } -arrow-string = { version = "52.2.0", default-features = false } +arrow-ord = { version = "53.0.0", default-features = false } +arrow-schema = { version = "53.0.0", default-features = false } +arrow-string = { version = "53.0.0", default-features = false } async-trait = "0.1.73" bigdecimal = "=0.4.1" bytes = "1.4" @@ -122,15 +122,17 @@ indexmap = "2.0.0" itertools = "0.13" log = "^0.4" num_cpus = "1.13.0" -object_store = { version = "0.10.2", default-features = false } +object_store = { version = "0.11.0", default-features = false } parking_lot = "0.12" -parquet = { version = "52.2.0", default-features = false, features = [ +parquet = { version = "53.0.0", default-features = false, features = [ "arrow", "async", "object_store", ] } +pbjson = { version = "0.7.0" } # Should match arrow-flight's version of prost. 
-prost = "0.12.3" +prost = "0.13.1" +prost-derive = "0.13.1" rand = "0.8" regex = "1.8" rstest = "0.22.0" diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 8cdee8c96f95..ddc33b43d134 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -179,9 +179,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05048a8932648b63f21c37d88b552ccc8a65afb6dfe9fc9f30ce79174c2e7a85" +checksum = "45aef0d9cf9a039bf6cd1acc451b137aca819977b0928dece52bd92811b640ba" dependencies = [ "arrow-arith", "arrow-array", @@ -200,9 +200,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d8a57966e43bfe9a3277984a14c24ec617ad874e4c0e1d2a1b083a39cfbf22c" +checksum = "03675e42d1560790f3524800e41403b40d0da1c793fe9528929fde06d8c7649a" dependencies = [ "arrow-array", "arrow-buffer", @@ -215,9 +215,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" +checksum = "cd2bf348cf9f02a5975c5962c7fa6dee107a2009a7b41ac5fb1a027e12dc033f" dependencies = [ "ahash", "arrow-buffer", @@ -232,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c975484888fc95ec4a632cdc98be39c085b1bb518531b0c80c5d462063e5daa1" +checksum = "3092e37715f168976012ce52273c3989b5793b0db5f06cbaa246be25e5f0924d" dependencies = [ "bytes", "half", @@ -243,9 +243,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da26719e76b81d8bc3faad1d4dbdc1bcc10d14704e63dc17fc9f3e7e1e567c8e" +checksum = "7ce1018bb710d502f9db06af026ed3561552e493e989a79d0d0f5d9cf267a785" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c13c36dc5ddf8c128df19bab27898eea64bf9da2b555ec1cd17a8ff57fba9ec2" +checksum = "fd178575f45624d045e4ebee714e246a05d9652e41363ee3f57ec18cca97f740" dependencies = [ "arrow-array", "arrow-buffer", @@ -283,9 +283,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" +checksum = "4e4ac0c4ee79150afe067dc4857154b3ee9c1cd52b5f40d59a77306d0ed18d65" dependencies = [ "arrow-buffer", "arrow-schema", @@ -295,9 +295,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f" +checksum = "bb307482348a1267f91b0912e962cd53440e5de0f7fb24c5f7b10da70b38c94a" dependencies = [ "arrow-array", "arrow-buffer", @@ -310,9 +310,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"fb22284c5a2a01d73cebfd88a33511a3234ab45d66086b2ca2d1228c3498e445" +checksum = "d24805ba326758effdd6f2cbdd482fcfab749544f21b134701add25b33f474e6" dependencies = [ "arrow-array", "arrow-buffer", @@ -330,9 +330,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42745f86b1ab99ef96d1c0bcf49180848a64fe2c7a7a0d945bc64fa2b21ba9bc" +checksum = "644046c479d80ae8ed02a7f1e1399072ea344ca6a7b0e293ab2d5d9ed924aa3b" dependencies = [ "arrow-array", "arrow-buffer", @@ -345,9 +345,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd09a518c602a55bd406bcc291a967b284cfa7a63edfbf8b897ea4748aad23c" +checksum = "a29791f8eb13b340ce35525b723f5f0df17ecb955599e11f65c2a94ab34e2efb" dependencies = [ "ahash", "arrow-array", @@ -359,15 +359,15 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" +checksum = "c85320a3a2facf2b2822b57aa9d6d9d55edb8aee0b6b5d3b8df158e503d10858" [[package]] name = "arrow-select" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3" +checksum = "9cc7e6b582e23855fd1625ce46e51647aa440c20ea2e71b1d748e0839dd73cba" dependencies = [ "ahash", "arrow-array", @@ -379,9 +379,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dc1985b67cb45f6606a248ac2b4a288849f196bab8c657ea5589f47cdd55e6" +checksum = "0775b6567c66e56ded19b87a954b6b1beffbdd784ef95a3a2b03f59570c1d230" dependencies = [ "arrow-array", "arrow-buffer", @@ -540,9 +540,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.41.0" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af0a3f676cba2c079c9563acc9233998c8951cdbe38629a0bef3c8c1b02f3658" +checksum = "11822090cf501c316c6f75711d77b96fba30658e3867a7762e5e2f5d32d31e81" dependencies = [ "aws-credential-types", "aws-runtime", @@ -562,9 +562,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.42.0" +version = "1.40.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91b6a04495547162cf52b075e3c15a17ab6608bf9c5785d3e5a5509b3f09f5c" +checksum = "78a2a06ff89176123945d1bbe865603c4d7101bea216a550bb4d2e4e9ba74d74" dependencies = [ "aws-credential-types", "aws-runtime", @@ -584,9 +584,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.41.0" +version = "1.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99c56bcd6a56cab7933980a54148b476a5a69a7694e3874d9aa2a566f150447d" +checksum = "a20a91795850826a6f456f4a48eff1dfa59a0e69bdbf5b8c50518fd372106574" dependencies = [ "aws-credential-types", "aws-runtime", @@ -731,6 +731,7 @@ dependencies = [ "base64-simd", "bytes", "bytes-utils", + "futures-core", "http 0.2.12", "http 1.1.0", "http-body 0.4.6", @@ -743,6 +744,8 @@ dependencies = [ "ryu", "serde", "time", + "tokio", + "tokio-util", ] [[package]] @@ -1179,9 +1182,9 @@ checksum = "7762d17f1241643615821a8455a0b2c3e803784b058693d990b11f2dce25a0ca" [[package]] name = "dashmap" -version 
= "6.0.1" +version = "6.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "804c8821570c3f8b70230c2ba75ffa5c0f9a4189b9a432b6656c536712acae28" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" dependencies = [ "cfg-if", "crossbeam-utils", @@ -1269,6 +1272,9 @@ dependencies = [ "async-trait", "aws-config", "aws-credential-types", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", "clap", "ctor", "datafusion", @@ -2627,9 +2633,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.10.2" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6da452820c715ce78221e8202ccc599b4a52f3e1eb3eedb487b680c81a8e3f3" +checksum = "25a0c4b3a0e31f8b66f71ad8064521efa773910196e2cde791436f13409f3b45" dependencies = [ "async-trait", "base64 0.22.1", @@ -2708,9 +2714,9 @@ dependencies = [ [[package]] name = "parquet" -version = "52.2.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e977b9066b4d3b03555c22bdc442f3fadebd96a39111249113087d0edb2691cd" +checksum = "f0fbf928021131daaa57d334ca8e3904fe9ae22f73c56244fc7db9b04eedc3d8" dependencies = [ "ahash", "arrow-array", @@ -3186,9 +3192,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.35" +version = "0.38.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a85d50532239da68e9addb745ba38ff4612a242c1c7ceea689c4bc7c2f43c36f" +checksum = "3f55e80d50763938498dd5ebb18647174e0c76dc38c5505294bb224624f30f36" dependencies = [ "bitflags 2.6.0", "errno", @@ -3504,24 +3510,23 @@ checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "snafu" -version = "0.7.5" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4de37ad025c587a29e8f3f5605c00f70b98715ef90b9061a815b9e59e9042d6" +checksum = "2b835cb902660db3415a672d862905e791e54d306c6e8189168c7f3d9ae1c79d" dependencies = [ - "doc-comment", "snafu-derive", ] [[package]] name = "snafu-derive" -version = "0.7.5" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990079665f075b699031e9c08fd3ab99be5029b96f3b78dc0709e8f77e4efebf" +checksum = "38d1e02fca405f6280643174a50c942219f0bbf4dbf7d480f1dd864d6f211ae5" dependencies = [ - "heck 0.4.1", + "heck 0.5.0", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.77", ] [[package]] @@ -4446,9 +4451,9 @@ dependencies = [ [[package]] name = "zstd-sys" -version = "2.0.12+zstd.1.5.6" +version = "2.0.13+zstd.1.5.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a4e40c320c3cb459d9a9ff6de98cff88f4751ee9275d140e2be94a2b74e4c13" +checksum = "38ff0f21cfee8f97d94cef41359e0c89aa6113028ab0291aa8ca0038995a95aa" dependencies = [ "cc", "pkg-config", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index a22265825623..f2f52846ab54 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -26,13 +26,20 @@ license = "Apache-2.0" homepage = "https://datafusion.apache.org" repository = "https://github.com/apache/datafusion" # Specify MSRV here as `cargo msrv` doesn't support workspace version -rust-version = "1.78" +rust-version = "1.76" readme = "README.md" [dependencies] -arrow = { version = "52.2.0" } +arrow = { version = "53.0.0" } async-trait = "0.1.73" aws-config = "1.5.5" +# begin pin aws-sdk crates otherwise CI MSRV check fails +# We can't update these libraries yet as it requires Rust 1.78, which 
we cannot adopt until Nov 2024 under our MSRV policy +# per https://github.com/apache/datafusion?tab=readme-ov-file#rust-version-compatibility-policy +aws-sdk-sso = "=1.39.0" +aws-sdk-ssooidc = "=1.40.0" +aws-sdk-sts = "=1.39.0" +# end pin aws-sdk crates aws-credential-types = "1.2.0" clap = { version = "4.5.16", features = ["derive", "cargo"] } datafusion = { path = "../datafusion/core", version = "41.0.0", features = [ @@ -49,9 +56,9 @@ dirs = "4.0.0" env_logger = "0.9" futures = "0.3" mimalloc = { version = "0.1", default-features = false } -object_store = { version = "0.10.1", features = ["aws", "gcp", "http"] } +object_store = { version = "0.11.0", features = ["aws", "gcp", "http"] } parking_lot = { version = "0.12" } -parquet = { version = "52.2.0", default-features = false } +parquet = { version = "53.0.0", default-features = false } regex = "1.8" rustyline = "11.0" tokio = { version = "1.24", features = ["macros", "rt", "rt-multi-thread", "sync", "parking_lot", "signal"] } diff --git a/datafusion-cli/src/functions.rs b/datafusion-cli/src/functions.rs index a85c43f3576f..99511e969386 100644 --- a/datafusion-cli/src/functions.rs +++ b/datafusion-cli/src/functions.rs @@ -32,6 +32,7 @@ use datafusion::physical_plan::memory::MemoryExec; use datafusion::physical_plan::ExecutionPlan; use datafusion::scalar::ScalarValue; use parquet::basic::ConvertedType; +use parquet::data_type::{ByteArray, FixedLenByteArray}; use parquet::file::reader::FileReader; use parquet::file::serialized_reader::SerializedFileReader; use parquet::file::statistics::Statistics; @@ -250,49 +251,69 @@ impl TableProvider for ParquetMetadataTable { fn convert_parquet_statistics( value: &Statistics, converted_type: ConvertedType, -) -> (String, String) { +) -> (Option<String>, Option<String>) { match (value, converted_type) { - (Statistics::Boolean(val), _) => (val.min().to_string(), val.max().to_string()), - (Statistics::Int32(val), _) => (val.min().to_string(), val.max().to_string()), - (Statistics::Int64(val), _) => (val.min().to_string(), val.max().to_string()), - (Statistics::Int96(val), _) => (val.min().to_string(), val.max().to_string()), - (Statistics::Float(val), _) => (val.min().to_string(), val.max().to_string()), - (Statistics::Double(val), _) => (val.min().to_string(), val.max().to_string()), - (Statistics::ByteArray(val), ConvertedType::UTF8) => { - let min_bytes = val.min(); - let max_bytes = val.max(); - let min = min_bytes - .as_utf8() - .map(|v| v.to_string()) - .unwrap_or_else(|_| min_bytes.to_string()); - - let max = max_bytes - .as_utf8() - .map(|v| v.to_string()) - .unwrap_or_else(|_| max_bytes.to_string()); - (min, max) - } - (Statistics::ByteArray(val), _) => (val.min().to_string(), val.max().to_string()), - (Statistics::FixedLenByteArray(val), ConvertedType::UTF8) => { - let min_bytes = val.min(); - let max_bytes = val.max(); - let min = min_bytes - .as_utf8() - .map(|v| v.to_string()) - .unwrap_or_else(|_| min_bytes.to_string()); - - let max = max_bytes - .as_utf8() - .map(|v| v.to_string()) - .unwrap_or_else(|_| max_bytes.to_string()); - (min, max) - } - (Statistics::FixedLenByteArray(val), _) => { - (val.min().to_string(), val.max().to_string()) - } + (Statistics::Boolean(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::Int32(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::Int64(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::Int96(val),
_) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::Float(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::Double(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::ByteArray(val), ConvertedType::UTF8) => ( + byte_array_to_string(val.min_opt()), + byte_array_to_string(val.max_opt()), + ), + (Statistics::ByteArray(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), + (Statistics::FixedLenByteArray(val), ConvertedType::UTF8) => ( + fixed_len_byte_array_to_string(val.min_opt()), + fixed_len_byte_array_to_string(val.max_opt()), + ), + (Statistics::FixedLenByteArray(val), _) => ( + val.min_opt().map(|v| v.to_string()), + val.max_opt().map(|v| v.to_string()), + ), } } +/// Convert to a string if it has utf8 encoding, otherwise print bytes directly +fn byte_array_to_string(val: Option<&ByteArray>) -> Option<String> { + val.map(|v| { + v.as_utf8() + .map(|s| s.to_string()) + .unwrap_or_else(|_e| v.to_string()) + }) +} + +/// Convert to a string if it has utf8 encoding, otherwise print bytes directly +fn fixed_len_byte_array_to_string(val: Option<&FixedLenByteArray>) -> Option<String> { + val.map(|v| { + v.as_utf8() + .map(|s| s.to_string()) + .unwrap_or_else(|_e| v.to_string()) + }) +} + pub struct ParquetMetadataFunc {} impl TableFunctionImpl for ParquetMetadataFunc { @@ -376,17 +397,13 @@ impl TableFunctionImpl for ParquetMetadataFunc { let converted_type = column.column_descr().converted_type(); if let Some(s) = column.statistics() { - let (min_val, max_val) = if s.has_min_max_set() { - let (min_val, max_val) = - convert_parquet_statistics(s, converted_type); - (Some(min_val), Some(max_val)) - } else { - (None, None) - }; + let (min_val, max_val) = + convert_parquet_statistics(s, converted_type); stats_min_arr.push(min_val.clone()); stats_max_arr.push(max_val.clone()); - stats_null_count_arr.push(Some(s.null_count() as i64)); - stats_distinct_count_arr.push(s.distinct_count().map(|c| c as i64)); + stats_null_count_arr.push(s.null_count_opt().map(|c| c as i64)); + stats_distinct_count_arr + .push(s.distinct_count_opt().map(|c| c as i64)); stats_min_value_arr.push(min_val); stats_max_value_arr.push(max_val); } else { diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 57b9930177d4..f430a87e190d 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -73,13 +73,13 @@ mimalloc = { version = "0.1", default-features = false } num_cpus = { workspace = true } object_store = { workspace = true, features = ["aws", "http"] } prost = { workspace = true } -prost-derive = { version = "0.13", default-features = false } +prost-derive = { workspace = true } serde = { version = "1.0.136", features = ["derive"] } serde_json = { workspace = true } tempfile = { workspace = true } test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tonic = "0.11" +tonic = "0.12.1" url = { workspace = true } uuid = "1.7" diff --git a/datafusion-examples/examples/csv_opener.rs b/datafusion-examples/examples/csv_opener.rs index 1f45026a214d..e7b7ead109bc 100644 --- a/datafusion-examples/examples/csv_opener.rs +++ b/datafusion-examples/examples/csv_opener.rs @@ -47,6 +47,7 @@ async fn main() -> Result<()> { true, b',', b'"', + None, object_store, Some(b'#'), );
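For reference, the parquet 53 statistics API consumed above returns `Option`s from its typed accessors rather than requiring a `has_min_max_set()` pre-check. A minimal sketch of the pattern (the helper name is illustrative, not part of this patch):

```rust
use parquet::file::statistics::Statistics;

/// Illustrative helper mirroring convert_parquet_statistics above:
/// None means the writer did not record the statistic.
fn int32_min_max(stats: &Statistics) -> (Option<String>, Option<String>) {
    match stats {
        // min_opt()/max_opt() return Option<&i32> for this variant
        Statistics::Int32(s) => (
            s.min_opt().map(|v| v.to_string()),
            s.max_opt().map(|v| v.to_string()),
        ),
        // the remaining variants follow the same min_opt()/max_opt() pattern
        _ => (None, None),
    }
}
```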
diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 79e20ba1215c..1ac27b40c219 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -61,7 +61,7 @@ num_cpus = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.21.0", optional = true } +pyo3 = { version = "0.22.0", optional = true } sqlparser = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 7c247103f6e7..19978e102cc8 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1604,6 +1604,7 @@ config_namespace! { pub has_header: Option<bool>, default = None pub delimiter: u8, default = b',' pub quote: u8, default = b'"' + pub terminator: Option<u8>, default = None pub escape: Option<u8>, default = None pub double_quote: Option<bool>, default = None /// Specifies whether newlines in (quoted) values are supported. @@ -1672,6 +1673,13 @@ impl CsvOptions { self } + /// The character that terminates a row. + /// - defaults to None (CRLF) + pub fn with_terminator(mut self, terminator: Option<u8>) -> Self { + self.terminator = terminator; + self + } + /// The escape character in a row. /// - default is None pub fn with_escape(mut self, escape: Option<u8>) -> Self { @@ -1718,6 +1726,11 @@ impl CsvOptions { self.quote } + /// The terminator character. + pub fn terminator(&self) -> Option<u8> { + self.terminator + } + /// The escape character. pub fn escape(&self) -> Option<u8> { self.escape
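The `terminator` option added to `CsvOptions` above is also reachable from the DataFrame API. A short usage sketch (the file path is illustrative; compare the `test_terminator` test later in this diff):

```rust
use datafusion::error::Result;
use datafusion::prelude::{CsvReadOptions, SessionContext};

async fn read_cr_terminated_csv() -> Result<()> {
    let ctx = SessionContext::new();
    // Rows in this (hypothetical) file end with a bare CR instead of CRLF/LF.
    let df = ctx
        .read_csv("data.csv", CsvReadOptions::new().terminator(Some(b'\r')))
        .await?;
    df.show().await?;
    Ok(())
}
```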
diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 88802af23a4b..0cb325e0b02b 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -2800,6 +2800,13 @@ impl ScalarValue { let a = array.slice(index, 1); Self::Map(Arc::new(a.as_map().to_owned())) } + DataType::Union(fields, mode) => { + let array = as_union_array(array); + let ti = array.type_id(index); + let index = array.value_offset(index); + let value = ScalarValue::try_from_array(array.child(ti), index)?; + ScalarValue::Union(Some((ti, Box::new(value))), fields.clone(), *mode) + } other => { return _not_impl_err!( "Can't create a scalar from array of type \"{other:?}\"" ) @@ -3035,8 +3042,15 @@ impl ScalarValue { ScalarValue::DurationNanosecond(val) => { eq_array_primitive!(array, index, DurationNanosecondArray, val)? } - ScalarValue::Union(_, _, _) => { - return _not_impl_err!("Union is not supported yet") + ScalarValue::Union(value, _, _) => { + let array = as_union_array(array); + let ti = array.type_id(index); + let index = array.value_offset(index); + if let Some((ti_v, value)) = value { + ti_v == &ti && value.eq_array(array.child(ti), index)? + } else { + array.child(ti).is_null(index) + } } ScalarValue::Dictionary(key_type, v) => { let (values_array, values_index) = match key_type.as_ref() { @@ -4356,7 +4370,7 @@ mod tests { .strip_backtrace(); assert_eq!( err, - "Arrow error: Compute error: Overflow happened on: 2147483647 - -2147483648" + "Arrow error: Arithmetic overflow: Overflow happened on: 2147483647 - -2147483648" ) } @@ -4377,7 +4391,7 @@ mod tests { .sub_checked(&int_value_2) .unwrap_err() .strip_backtrace(); - assert_eq!(err, "Arrow error: Compute error: Overflow happened on: 9223372036854775807 - -9223372036854775808") + assert_eq!(err, "Arrow error: Arithmetic overflow: Overflow happened on: 9223372036854775807 - -9223372036854775808") } #[test] @@ -5536,6 +5550,112 @@ mod tests { assert_eq!(&array, &expected); } + #[test] + fn test_scalar_union_sparse() { + let field_a = Arc::new(Field::new("A", DataType::Int32, true)); + let field_b = Arc::new(Field::new("B", DataType::Boolean, true)); + let field_c = Arc::new(Field::new("C", DataType::Utf8, true)); + let fields = UnionFields::from_iter([(0, field_a), (1, field_b), (2, field_c)]); + + let mut values_a = vec![None; 6]; + values_a[0] = Some(42); + let mut values_b = vec![None; 6]; + values_b[1] = Some(true); + let mut values_c = vec![None; 6]; + values_c[2] = Some("foo"); + let children: Vec<ArrayRef> = vec![ + Arc::new(Int32Array::from(values_a)), + Arc::new(BooleanArray::from(values_b)), + Arc::new(StringArray::from(values_c)), + ]; + + let type_ids = ScalarBuffer::from(vec![0, 1, 2, 0, 1, 2]); + let array: ArrayRef = Arc::new( + UnionArray::try_new(fields.clone(), type_ids, None, children) + .expect("UnionArray"), + ); + + let expected = [ + (0, ScalarValue::from(42)), + (1, ScalarValue::from(true)), + (2, ScalarValue::from("foo")), + (0, ScalarValue::Int32(None)), + (1, ScalarValue::Boolean(None)), + (2, ScalarValue::Utf8(None)), + ]; + + for (i, (ti, value)) in expected.into_iter().enumerate() { + let is_null = value.is_null(); + let value = Some((ti, Box::new(value))); + let expected = ScalarValue::Union(value, fields.clone(), UnionMode::Sparse); + let actual = ScalarValue::try_from_array(&array, i).expect("try_from_array"); + + assert_eq!( + actual, expected, + "[{i}] {actual} was not equal to {expected}" + ); + + assert!( + expected.eq_array(&array, i).expect("eq_array"), + "[{i}] {expected}.eq_array was false" + ); + + if is_null { + assert!(actual.is_null(), "[{i}] {actual} was not null") + } + } + }
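To make the type_id/value_offset lookups above concrete: for a dense union, `type_id(i)` selects the child array and `value_offset(i)` is the row inside that child (for a sparse union it is simply `i`). A self-contained sketch with a single-field dense union (illustrative, not part of this patch):

```rust
use std::sync::Arc;

use arrow::array::{ArrayRef, Int32Array, UnionArray};
use arrow::buffer::ScalarBuffer;
use arrow::datatypes::{DataType, Field, UnionFields};
use datafusion_common::{Result, ScalarValue};

/// Illustrative only: both union rows point at child 0, offset 0,
/// so each resolves to the Int32 value 7.
fn union_row_to_scalar() -> Result<ScalarValue> {
    let fields = UnionFields::new(vec![0], vec![Field::new("A", DataType::Int32, true)]);
    let children: Vec<ArrayRef> = vec![Arc::new(Int32Array::from(vec![Some(7)]))];
    let type_ids = ScalarBuffer::from(vec![0_i8, 0]);
    let offsets = ScalarBuffer::from(vec![0_i32, 0]);
    let array = UnionArray::try_new(fields, type_ids, Some(offsets), children)?;
    // Yields ScalarValue::Union(Some((0, Int32(Some(7)))), ..., UnionMode::Dense)
    ScalarValue::try_from_array(&array, 1)
}
```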
+ + #[test] + fn test_scalar_union_dense() { + let field_a = Arc::new(Field::new("A", DataType::Int32, true)); + let field_b = Arc::new(Field::new("B", DataType::Boolean, true)); + let field_c = Arc::new(Field::new("C", DataType::Utf8, true)); + let fields = UnionFields::from_iter([(0, field_a), (1, field_b), (2, field_c)]); + let children: Vec<ArrayRef> = vec![ + Arc::new(Int32Array::from(vec![Some(42), None])), + Arc::new(BooleanArray::from(vec![Some(true), None])), + Arc::new(StringArray::from(vec![Some("foo"), None])), + ]; + + let type_ids = ScalarBuffer::from(vec![0, 1, 2, 0, 1, 2]); + let offsets = ScalarBuffer::from(vec![0, 0, 0, 1, 1, 1]); + let array: ArrayRef = Arc::new( + UnionArray::try_new(fields.clone(), type_ids, Some(offsets), children) + .expect("UnionArray"), + ); + + let expected = [ + (0, ScalarValue::from(42)), + (1, ScalarValue::from(true)), + (2, ScalarValue::from("foo")), + (0, ScalarValue::Int32(None)), + (1, ScalarValue::Boolean(None)), + (2, ScalarValue::Utf8(None)), + ]; + + for (i, (ti, value)) in expected.into_iter().enumerate() { + let is_null = value.is_null(); + let value = Some((ti, Box::new(value))); + let expected = ScalarValue::Union(value, fields.clone(), UnionMode::Dense); + let actual = ScalarValue::try_from_array(&array, i).expect("try_from_array"); + + assert_eq!( + actual, expected, + "[{i}] {actual} was not equal to {expected}" + ); + + assert!( + expected.eq_array(&array, i).expect("eq_array"), + "[{i}] {expected}.eq_array was false" + ); + + if is_null { + assert!(actual.is_null(), "[{i}] {actual} was not null") + } + } + } + #[test] fn test_lists_in_struct() { let field_a = Arc::new(Field::new("A", DataType::Utf8, false)); @@ -5893,7 +6013,7 @@ mod tests { let root_err = err.find_root(); match root_err{ DataFusionError::ArrowError( - ArrowError::ComputeError(_), + ArrowError::ArithmeticOverflow(_), _, ) => {} _ => return Err(err), diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index e43f6ab29abc..99e8f13776fc 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -247,6 +247,13 @@ impl CsvFormat { self } + /// The character used to indicate the end of a row. + /// - defaults to None (CRLF) + pub fn with_terminator(mut self, terminator: Option<u8>) -> Self { + self.options.terminator = terminator; + self + } + /// Specifies whether newlines in (quoted) values are supported. /// /// Parsing newlines in quoted values may be affected by execution behaviour such as @@ -359,6 +366,7 @@ impl FileFormat for CsvFormat { .with_has_header(has_header) .with_delimeter(self.options.delimiter) .with_quote(self.options.quote) + .with_terminator(self.options.terminator) .with_escape(self.options.escape) .with_comment(self.options.comment) .with_newlines_in_values(newlines_in_values) diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs index db90262edbf8..7ef5a2182d1c 100644 --- a/datafusion/core/src/datasource/file_format/options.rs +++ b/datafusion/core/src/datasource/file_format/options.rs @@ -59,6 +59,8 @@ pub struct CsvReadOptions<'a> { pub delimiter: u8, /// An optional quote character. Defaults to `b'"'`. pub quote: u8, + /// An optional terminator character. Defaults to None (CRLF). + pub terminator: Option<u8>, /// An optional escape character. Defaults to None. pub escape: Option<u8>, /// If enabled, lines beginning with this byte are ignored.
@@ -102,6 +104,7 @@ impl<'a> CsvReadOptions<'a> { schema_infer_max_records: DEFAULT_SCHEMA_INFER_MAX_RECORD, delimiter: b',', quote: b'"', + terminator: None, escape: None, newlines_in_values: false, file_extension: DEFAULT_CSV_EXTENSION, @@ -136,6 +139,12 @@ impl<'a> CsvReadOptions<'a> { self } + /// Specify terminator to use for CSV read + pub fn terminator(mut self, terminator: Option<u8>) -> Self { + self.terminator = terminator; + self + } + /// Specify delimiter to use for CSV read pub fn escape(mut self, escape: u8) -> Self { self.escape = Some(escape); @@ -511,6 +520,7 @@ impl ReadOptions<'_> for CsvReadOptions<'_> { .with_delimiter(self.delimiter) .with_quote(self.quote) .with_escape(self.escape) + .with_terminator(self.terminator) .with_newlines_in_values(self.newlines_in_values) .with_schema_infer_max_rec(self.schema_infer_max_records) .with_file_compression_type(self.file_compression_type.to_owned()); diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 23e765f0f2cd..76e8ad9da559 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -2010,7 +2010,7 @@ mod tests { // test result in int_col let int_col_index = page_index.get(4).unwrap(); - let int_col_offset = offset_index.get(4).unwrap(); + let int_col_offset = offset_index.get(4).unwrap().page_locations(); // 325 pages in int_col assert_eq!(int_col_offset.len(), 325); diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 5ab32ed36e53..6cd1864deb1d 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -77,6 +77,7 @@ pub struct CsvExec { has_header: bool, delimiter: u8, quote: u8, + terminator: Option<u8>, escape: Option<u8>, comment: Option<u8>, newlines_in_values: bool, @@ -98,6 +99,7 @@ pub struct CsvExecBuilder { has_header: bool, delimiter: u8, quote: u8, + terminator: Option<u8>, escape: Option<u8>, comment: Option<u8>, newlines_in_values: bool, @@ -112,6 +114,7 @@ impl CsvExecBuilder { has_header: false, delimiter: b',', quote: b'"', + terminator: None, escape: None, comment: None, newlines_in_values: false, @@ -143,6 +146,14 @@ impl CsvExecBuilder { self } + /// Set the line terminator. + /// + /// The default is `None`, i.e. rows are terminated by CRLF. + pub fn with_terminator(mut self, terminator: Option<u8>) -> Self { + self.terminator = terminator; + self + } + /// Set the escape character. /// /// The default is `None` (i.e. quotes cannot be escaped). @@ -191,6 +202,7 @@ impl CsvExecBuilder { has_header, delimiter, quote, + terminator, escape, comment, newlines_in_values, @@ -210,6 +222,7 @@ impl CsvExecBuilder { has_header, delimiter, quote, + terminator, escape, newlines_in_values, metrics: ExecutionPlanMetricsSet::new(), @@ -229,6 +242,7 @@ impl CsvExec { has_header: bool, delimiter: u8, quote: u8, + terminator: Option<u8>, escape: Option<u8>, comment: Option<u8>, newlines_in_values: bool, @@ -238,6 +252,7 @@ impl CsvExec { .with_has_header(has_header) .with_delimeter(delimiter) .with_quote(quote) + .with_terminator(terminator) .with_escape(escape) .with_comment(comment) .with_newlines_in_values(newlines_in_values) @@ -270,6 +285,11 @@ impl CsvExec { self.quote } + /// The line terminator + pub fn terminator(&self) -> Option<u8> { + self.terminator + } +
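A sketch of how a physical CSV scan picks the terminator up through the new builder method (construction of the `FileScanConfig` is elided and assumed to be in hand; compare the updated tests further down):

```rust
use datafusion::datasource::file_format::file_compression_type::FileCompressionType;
use datafusion::datasource::physical_plan::{CsvExec, FileScanConfig};

/// Illustrative: build a CSV scan over CR-terminated rows.
fn build_csv_exec(scan_config: FileScanConfig) -> CsvExec {
    CsvExec::builder(scan_config)
        .with_has_header(true)
        .with_delimeter(b',') // note: the existing API spells it "delimeter"
        .with_quote(b'"')
        .with_terminator(Some(b'\r'))
        .with_newlines_in_values(false)
        .with_file_compression_type(FileCompressionType::UNCOMPRESSED)
        .build()
}
```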
/// Lines beginning with this byte are ignored. pub fn comment(&self) -> Option<u8> { self.comment @@ -406,10 +426,10 @@ impl ExecutionPlan for CsvExec { delimiter: self.delimiter, quote: self.quote, escape: self.escape, + terminator: self.terminator, object_store, comment: self.comment, }); - let opener = CsvOpener { config, file_compression_type: self.file_compression_type.to_owned(), @@ -441,6 +461,7 @@ impl ExecutionPlan for CsvExec { delimiter: self.delimiter, quote: self.quote, escape: self.escape, + terminator: self.terminator, comment: self.comment, newlines_in_values: self.newlines_in_values, metrics: self.metrics.clone(), @@ -459,6 +480,7 @@ pub struct CsvConfig { has_header: bool, delimiter: u8, quote: u8, + terminator: Option<u8>, escape: Option<u8>, object_store: Arc<dyn ObjectStore>, comment: Option<u8>, @@ -474,6 +496,7 @@ impl CsvConfig { has_header: bool, delimiter: u8, quote: u8, + terminator: Option<u8>, object_store: Arc<dyn ObjectStore>, comment: Option<u8>, ) -> Self { @@ -484,6 +507,7 @@ impl CsvConfig { has_header, delimiter, quote, + terminator, escape: None, object_store, comment, @@ -502,7 +526,9 @@ impl CsvConfig { .with_batch_size(self.batch_size) .with_header(self.has_header) .with_quote(self.quote); - + if let Some(terminator) = self.terminator { + builder = builder.with_terminator(terminator); + } if let Some(proj) = &self.file_projection { builder = builder.with_projection(proj.clone()); } @@ -775,6 +801,7 @@ mod tests { .with_has_header(true) .with_delimeter(b',') .with_quote(b'"') + .with_terminator(None) .with_escape(None) .with_comment(None) .with_newlines_in_values(false) @@ -844,6 +871,7 @@ mod tests { .with_has_header(true) .with_delimeter(b',') .with_quote(b'"') + .with_terminator(None) .with_escape(None) .with_comment(None) .with_newlines_in_values(false) @@ -913,6 +941,7 @@ mod tests { .with_has_header(true) .with_delimeter(b',') .with_quote(b'"') + .with_terminator(None) .with_escape(None) .with_comment(None) .with_newlines_in_values(false) @@ -979,6 +1008,7 @@ mod tests { .with_has_header(true) .with_delimeter(b',') .with_quote(b'"') + .with_terminator(None) .with_escape(None) .with_comment(None) .with_newlines_in_values(false) @@ -1044,6 +1074,7 @@ mod tests { .with_has_header(true) .with_delimeter(b',') .with_quote(b'"') + .with_terminator(None) .with_escape(None) .with_comment(None) .with_newlines_in_values(false) @@ -1139,6 +1170,7 @@ mod tests { .with_has_header(true) .with_delimeter(b',') .with_quote(b'"') + .with_terminator(None) .with_escape(None) .with_comment(None) .with_newlines_in_values(false) @@ -1210,6 +1242,107 @@ mod tests { crate::assert_batches_eq!(expected, &result); } + #[tokio::test] + async fn test_terminator() { + let session_ctx = SessionContext::new(); + let store = object_store::memory::InMemory::new(); + + let data = bytes::Bytes::from("a,b\r1,2\r3,4"); + let path = object_store::path::Path::from("a.csv"); + store.put(&path, data.into()).await.unwrap(); + + let url = Url::parse("memory://").unwrap(); + session_ctx.register_object_store(&url, Arc::new(store)); + + let df = session_ctx + .read_csv("memory:///", CsvReadOptions::new().terminator(Some(b'\r'))) + .await + .unwrap(); + + let result = df.collect().await.unwrap(); + + let expected = [ + "+---+---+", + "| a | b |", + "+---+---+", + "| 1 | 2 |", + "| 3 | 4 |", + "+---+---+", + ]; + + crate::assert_batches_eq!(expected, &result); + + let e = session_ctx + .read_csv("memory:///", CsvReadOptions::new().terminator(Some(b'\n'))) + .await + .unwrap() + .collect() + .await + .unwrap_err(); + assert_eq!(e.strip_backtrace(), "Arrow error: Csv error: incorrect number of
fields for line 1, expected 2 got more than 2") + } + + #[tokio::test] + async fn test_create_external_table_with_terminator() -> Result<()> { + let ctx = SessionContext::new(); + ctx.sql( + r#" + CREATE EXTERNAL TABLE t1 ( + col1 TEXT, + col2 TEXT + ) STORED AS CSV + LOCATION 'tests/data/cr_terminator.csv' + OPTIONS ('format.terminator' E'\r', 'format.has_header' 'true'); + "#, + ) + .await? + .collect() + .await?; + + let df = ctx.sql(r#"select * from t1"#).await?.collect().await?; + let expected = [ + "+------+--------+", + "| col1 | col2 |", + "+------+--------+", + "| id0 | value0 |", + "| id1 | value1 |", + "| id2 | value2 |", + "| id3 | value3 |", + "+------+--------+", + ]; + crate::assert_batches_eq!(expected, &df); + Ok(()) + } + + #[tokio::test] + async fn test_create_external_table_with_terminator_with_newlines_in_values( + ) -> Result<()> { + let ctx = SessionContext::new(); + ctx.sql(r#" + CREATE EXTERNAL TABLE t1 ( + col1 TEXT, + col2 TEXT + ) STORED AS CSV + LOCATION 'tests/data/newlines_in_values_cr_terminator.csv' + OPTIONS ('format.terminator' E'\r', 'format.has_header' 'true', 'format.newlines_in_values' 'true'); + "#).await?.collect().await?; + + let df = ctx.sql(r#"select * from t1"#).await?.collect().await?; + let expected = [ + "+-------+-----------------------------+", + "| col1 | col2 |", + "+-------+-----------------------------+", + "| 1 | hello\rworld |", + "| 2 | something\relse |", + "| 3 | \rmany\rlines\rmake\rgood test\r |", + "| 4 | unquoted |", + "| value | end |", + "+-------+-----------------------------+", + ]; + crate::assert_batches_eq!(expected, &df); + Ok(()) + } + #[tokio::test] async fn write_csv_results_error_handling() -> Result<()> { let ctx = SessionContext::new(); @@ -1365,6 +1498,7 @@ mod tests { has_header, delimiter, quote, + terminator, escape, comment, newlines_in_values, @@ -1374,6 +1508,7 @@ mod tests { assert_eq!(has_header, default_options.has_header.unwrap_or(false)); assert_eq!(delimiter, default_options.delimiter); assert_eq!(quote, default_options.quote); + assert_eq!(terminator, default_options.terminator); assert_eq!(escape, default_options.escape); assert_eq!(comment, default_options.comment); assert_eq!( diff --git a/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs index e4d26a460ecd..4e71993b5153 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs @@ -392,13 +392,16 @@ impl<'a> PagesPruningStatistics<'a> { trace!("No page offsets for row group {row_group_index}, skipping"); return None; }; - let Some(page_offsets) = row_group_page_offsets.get(parquet_column_index) else { + let Some(offset_index_metadata) = + row_group_page_offsets.get(parquet_column_index) + else { trace!( "No page offsets for column {:?} in row group {row_group_index}, skipping", converter.arrow_field() ); return None; }; + let page_offsets = offset_index_metadata.page_locations(); Some(Self { row_group_index, diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs index 6a6910748fc8..ccd77d90be57 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs @@ -487,11 +487,23 @@ mod tests { let schema_descr = get_test_schema_descr(vec![field]); let rgm1 = 
get_row_group_meta_data( &schema_descr, - vec![ParquetStatistics::int32(Some(1), Some(10), None, 0, false)], + vec![ParquetStatistics::int32( + Some(1), + Some(10), + None, + Some(0), + false, + )], ); let rgm2 = get_row_group_meta_data( &schema_descr, - vec![ParquetStatistics::int32(Some(11), Some(20), None, 0, false)], + vec![ParquetStatistics::int32( + Some(11), + Some(20), + None, + Some(0), + false, + )], ); let metrics = parquet_file_metrics(); @@ -520,11 +532,17 @@ mod tests { let schema_descr = get_test_schema_descr(vec![field]); let rgm1 = get_row_group_meta_data( &schema_descr, - vec![ParquetStatistics::int32(None, None, None, 0, false)], + vec![ParquetStatistics::int32(None, None, None, Some(0), false)], ); let rgm2 = get_row_group_meta_data( &schema_descr, - vec![ParquetStatistics::int32(Some(11), Some(20), None, 0, false)], + vec![ParquetStatistics::int32( + Some(11), + Some(20), + None, + Some(0), + false, + )], ); let metrics = parquet_file_metrics(); // missing statistics for first row group mean that the result from the predicate expression @@ -560,15 +578,15 @@ mod tests { let rgm1 = get_row_group_meta_data( &schema_descr, vec![ - ParquetStatistics::int32(Some(1), Some(10), None, 0, false), - ParquetStatistics::int32(Some(1), Some(10), None, 0, false), + ParquetStatistics::int32(Some(1), Some(10), None, Some(0), false), + ParquetStatistics::int32(Some(1), Some(10), None, Some(0), false), ], ); let rgm2 = get_row_group_meta_data( &schema_descr, vec![ - ParquetStatistics::int32(Some(11), Some(20), None, 0, false), - ParquetStatistics::int32(Some(11), Some(20), None, 0, false), + ParquetStatistics::int32(Some(11), Some(20), None, Some(0), false), + ParquetStatistics::int32(Some(11), Some(20), None, Some(0), false), ], ); @@ -633,16 +651,16 @@ mod tests { let rgm1 = get_row_group_meta_data( &schema_descr, vec![ - ParquetStatistics::int32(Some(-10), Some(-1), None, 0, false), // c2 - ParquetStatistics::int32(Some(1), Some(10), None, 0, false), + ParquetStatistics::int32(Some(-10), Some(-1), None, Some(0), false), // c2 + ParquetStatistics::int32(Some(1), Some(10), None, Some(0), false), ], ); // rg1 has c2 greater than zero, c1 less than zero let rgm2 = get_row_group_meta_data( &schema_descr, vec![ - ParquetStatistics::int32(Some(1), Some(10), None, 0, false), - ParquetStatistics::int32(Some(-10), Some(-1), None, 0, false), + ParquetStatistics::int32(Some(1), Some(10), None, Some(0), false), + ParquetStatistics::int32(Some(-10), Some(-1), None, Some(0), false), ], ); @@ -669,15 +687,15 @@ mod tests { let rgm1 = get_row_group_meta_data( &schema_descr, vec![ - ParquetStatistics::int32(Some(1), Some(10), None, 0, false), - ParquetStatistics::boolean(Some(false), Some(true), None, 0, false), + ParquetStatistics::int32(Some(1), Some(10), None, Some(0), false), + ParquetStatistics::boolean(Some(false), Some(true), None, Some(0), false), ], ); let rgm2 = get_row_group_meta_data( &schema_descr, vec![ - ParquetStatistics::int32(Some(11), Some(20), None, 0, false), - ParquetStatistics::boolean(Some(false), Some(true), None, 1, false), + ParquetStatistics::int32(Some(11), Some(20), None, Some(0), false), + ParquetStatistics::boolean(Some(false), Some(true), None, Some(1), false), ], ); vec![rgm1, rgm2] @@ -775,7 +793,7 @@ mod tests { Some(100), Some(600), None, - 0, + Some(0), false, )], ); @@ -783,13 +801,25 @@ mod tests { &schema_descr, // [0.1, 0.2] // c1 > 5, this row group will not be included in the results. 
- vec![ParquetStatistics::int32(Some(10), Some(20), None, 0, false)], + vec![ParquetStatistics::int32( + Some(10), + Some(20), + None, + Some(0), + false, + )], ); let rgm3 = get_row_group_meta_data( &schema_descr, // [1, None] // c1 > 5, this row group can not be filtered out, so will be included in the results. - vec![ParquetStatistics::int32(Some(100), None, None, 0, false)], + vec![ParquetStatistics::int32( + Some(100), + None, + None, + Some(0), + false, + )], ); let metrics = parquet_file_metrics(); let mut row_groups = RowGroupAccessPlanFilter::new(ParquetAccessPlan::new_all(3)); @@ -837,7 +867,7 @@ mod tests { Some(100), Some(600), None, - 0, + Some(0), false, )], ); @@ -845,30 +875,62 @@ mod tests { &schema_descr, // [10, 20] // c1 > 5, this row group will be included in the results. - vec![ParquetStatistics::int32(Some(10), Some(20), None, 0, false)], + vec![ParquetStatistics::int32( + Some(10), + Some(20), + None, + Some(0), + false, + )], ); let rgm3 = get_row_group_meta_data( &schema_descr, // [0, 2] // c1 > 5, this row group will not be included in the results. - vec![ParquetStatistics::int32(Some(0), Some(2), None, 0, false)], + vec![ParquetStatistics::int32( + Some(0), + Some(2), + None, + Some(0), + false, + )], ); let rgm4 = get_row_group_meta_data( &schema_descr, // [None, 2] - // c1 > 5, this row group can not be filtered out, so will be included in the results. - vec![ParquetStatistics::int32(None, Some(2), None, 0, false)], + // c1 > 5, this row group will also not be included in the results + // (the min value is unknown, but the max value is 2, so no values can be greater than 5) + vec![ParquetStatistics::int32( + None, + Some(2), + None, + Some(0), + false, + )], + ); + let rgm5 = get_row_group_meta_data( + &schema_descr, + // [2, None] + // c1 > 5, this row group must be included + // (the min value is 2, but the max value is unknown, so it may have values greater than 5) + vec![ParquetStatistics::int32( + Some(2), + None, + None, + Some(0), + false, + )], ); let metrics = parquet_file_metrics(); - let mut row_groups = RowGroupAccessPlanFilter::new(ParquetAccessPlan::new_all(4)); + let mut row_groups = RowGroupAccessPlanFilter::new(ParquetAccessPlan::new_all(5)); row_groups.prune_by_statistics( &schema, &schema_descr, - &[rgm1, rgm2, rgm3, rgm4], + &[rgm1, rgm2, rgm3, rgm4, rgm5], &pruning_predicate, &metrics, ); - assert_pruned(row_groups, ExpectedPruning::Some(vec![0, 1, 3])); + assert_pruned(row_groups, ExpectedPruning::Some(vec![0, 1, 4])); } #[test] fn row_group_pruning_predicate_decimal_type3() { @@ -896,19 +958,25 @@ mod tests { Some(600), Some(800), None, - 0, + Some(0), false, )], ); let rgm2 = get_row_group_meta_data( &schema_descr, // [0.1, 0.2] - vec![ParquetStatistics::int64(Some(10), Some(20), None, 0, false)], + vec![ParquetStatistics::int64( + Some(10), + Some(20), + None, + Some(0), + false, + )], ); let rgm3 = get_row_group_meta_data( &schema_descr, // [0.1, 0.2] - vec![ParquetStatistics::int64(None, None, None, 0, false)], + vec![ParquetStatistics::int64(None, None, None, Some(0), false)], ); let metrics = parquet_file_metrics(); let mut row_groups = RowGroupAccessPlanFilter::new(ParquetAccessPlan::new_all(3)); @@ -957,7 +1025,7 @@ mod tests { 8000i128.to_be_bytes().to_vec(), ))), None, - 0, + Some(0), false, )], ); @@ -973,7 +1041,7 @@ mod tests { 20000i128.to_be_bytes().to_vec(), ))), None, - 0, + Some(0), false, )], ); @@ -981,7 +1049,11 @@ mod tests { let rgm3 = get_row_group_meta_data( &schema_descr, 
vec![ParquetStatistics::fixed_len_byte_array( - None, None, None, 0, false, + None, + None, + None, + Some(0), + false, )], ); let metrics = parquet_file_metrics(); let mut row_groups = RowGroupAccessPlanFilter::new(ParquetAccessPlan::new_all(3)); @@ -1027,7 +1099,7 @@ mod tests { // 80.00 Some(ByteArray::from(8000i128.to_be_bytes().to_vec())), None, - 0, + Some(0), false, )], ); @@ -1039,13 +1111,19 @@ mod tests { // 200.00 Some(ByteArray::from(20000i128.to_be_bytes().to_vec())), None, - 0, + Some(0), false, )], ); let rgm3 = get_row_group_meta_data( &schema_descr, - vec![ParquetStatistics::byte_array(None, None, None, 0, false)], + vec![ParquetStatistics::byte_array( + None, + None, + None, + Some(0), + false, + )], ); let metrics = parquet_file_metrics(); let mut row_groups = RowGroupAccessPlanFilter::new(ParquetAccessPlan::new_all(3)); diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 90f175b59385..675ac798bf4e 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -1035,17 +1035,15 @@ impl SessionStateBuilder { } } - /// Set defaults for table_factories, file formats, expr_planners and builtin - /// scalar and aggregate functions. - pub fn with_default_features(mut self) -> Self { - self.table_factories = Some(SessionStateDefaults::default_table_factories()); - self.file_formats = Some(SessionStateDefaults::default_file_formats()); - self.expr_planners = Some(SessionStateDefaults::default_expr_planners()); - self.scalar_functions = Some(SessionStateDefaults::default_scalar_functions()); - self.aggregate_functions = - Some(SessionStateDefaults::default_aggregate_functions()); - self.window_functions = Some(SessionStateDefaults::default_window_functions()); - self + /// Set defaults for table_factories, file formats, expr_planners and builtin + /// scalar, aggregate and window functions. + pub fn with_default_features(self) -> Self { + self.with_table_factories(SessionStateDefaults::default_table_factories()) + .with_file_formats(SessionStateDefaults::default_file_formats()) + .with_expr_planners(SessionStateDefaults::default_expr_planners()) + .with_scalar_functions(SessionStateDefaults::default_scalar_functions()) + .with_aggregate_functions(SessionStateDefaults::default_aggregate_functions()) + .with_window_functions(SessionStateDefaults::default_window_functions()) } /// Set the session id.
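Callers of `with_default_features` are unaffected by the rewrite above; it still registers the stock factories and functions, just via the public `with_*` setters. A minimal usage sketch:

```rust
use datafusion::execution::session_state::SessionStateBuilder;
use datafusion::prelude::SessionContext;

/// Builds a context whose state has the default table factories, file
/// formats, expr planners, and scalar/aggregate/window functions.
fn default_context() -> SessionContext {
    let state = SessionStateBuilder::new().with_default_features().build();
    SessionContext::new_with_state(state)
}
```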
diff --git a/datafusion/core/src/physical_optimizer/sanity_checker.rs b/datafusion/core/src/physical_optimizer/sanity_checker.rs index bd80d31224ef..e392105fbcb7 100644 --- a/datafusion/core/src/physical_optimizer/sanity_checker.rs +++ b/datafusion/core/src/physical_optimizer/sanity_checker.rs @@ -120,32 +120,36 @@ pub fn check_plan_sanity( ) -> Result<Transformed<Arc<dyn ExecutionPlan>>> { check_finiteness_requirements(plan.clone(), optimizer_options)?; - for (child, child_sort_req, child_dist_req) in izip!( - plan.children().iter(), + for ((idx, child), sort_req, dist_req) in izip!( + plan.children().iter().enumerate(), plan.required_input_ordering().iter(), plan.required_input_distribution().iter() ) { let child_eq_props = child.equivalence_properties(); - if let Some(child_sort_req) = child_sort_req { - if !child_eq_props.ordering_satisfy_requirement(child_sort_req) { - let child_plan_str = get_plan_string(child); + if let Some(sort_req) = sort_req { + if !child_eq_props.ordering_satisfy_requirement(sort_req) { + let plan_str = get_plan_string(&plan); return plan_err!( - "Child: {:?} does not satisfy parent order requirements: {:?}", - child_plan_str, - child_sort_req + "Plan: {:?} does not satisfy order requirements: {:?}. Child-{} order: {:?}", + plan_str, + sort_req, + idx, + child_eq_props.oeq_class ); } } if !child .output_partitioning() - .satisfy(child_dist_req, child_eq_props) + .satisfy(dist_req, child_eq_props) { - let child_plan_str = get_plan_string(child); + let plan_str = get_plan_string(&plan); return plan_err!( - "Child: {:?} does not satisfy parent distribution requirements: {:?}", - child_plan_str, - child_dist_req + "Plan: {:?} does not satisfy distribution requirements: {:?}. Child-{} output partitioning: {:?}", + plan_str, + dist_req, + idx, + child.output_partitioning() ); } } diff --git a/datafusion/core/tests/data/cr_terminator.csv b/datafusion/core/tests/data/cr_terminator.csv new file mode 100644 index 000000000000..f2a5d09a4c19 --- /dev/null +++ b/datafusion/core/tests/data/cr_terminator.csv @@ -0,0 +1 @@ +c1,c2 id0,value0 id1,value1 id2,value2 id3,value3 \ No newline at end of file diff --git a/datafusion/core/tests/data/newlines_in_values_cr_terminator.csv b/datafusion/core/tests/data/newlines_in_values_cr_terminator.csv new file mode 100644 index 000000000000..2f6557d60ec5 --- /dev/null +++ b/datafusion/core/tests/data/newlines_in_values_cr_terminator.csv @@ -0,0 +1 @@ +id,message 1,"hello world" 2,"something else" 3," many lines make good test " 4,unquoted value,end \ No newline at end of file diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index d0c1c3b2b3d6..18a624dd9cb2 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -799,40 +799,49 @@ impl LogicalPlan { pub fn with_new_exprs( &self, mut expr: Vec<Expr>, - mut inputs: Vec<LogicalPlan>, + inputs: Vec<LogicalPlan>, ) -> Result<LogicalPlan> { match self { // Since expr may be different than the previous expr, schema of the projection // may change. We need to use try_new method instead of try_new_with_schema method. LogicalPlan::Projection(Projection { .. }) => { - Projection::try_new(expr, Arc::new(inputs.swap_remove(0))) - .map(LogicalPlan::Projection) + let input = self.only_input(inputs)?; + Projection::try_new(expr, Arc::new(input)).map(LogicalPlan::Projection) } LogicalPlan::Dml(DmlStatement { table_name, table_schema, op, ..
- }) => Ok(LogicalPlan::Dml(DmlStatement::new( - table_name.clone(), - Arc::clone(table_schema), - op.clone(), - Arc::new(inputs.swap_remove(0)), - ))), + }) => { + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; + Ok(LogicalPlan::Dml(DmlStatement::new( + table_name.clone(), + Arc::clone(table_schema), + op.clone(), + Arc::new(input), + ))) + } LogicalPlan::Copy(CopyTo { input: _, output_url, file_type, options, partition_by, - }) => Ok(LogicalPlan::Copy(CopyTo { - input: Arc::new(inputs.swap_remove(0)), - output_url: output_url.clone(), - file_type: Arc::clone(file_type), - options: options.clone(), - partition_by: partition_by.clone(), - })), + }) => { + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; + Ok(LogicalPlan::Copy(CopyTo { + input: Arc::new(input), + output_url: output_url.clone(), + file_type: Arc::clone(file_type), + options: options.clone(), + partition_by: partition_by.clone(), + })) + } LogicalPlan::Values(Values { schema, .. }) => { + self.assert_no_inputs(inputs)?; Ok(LogicalPlan::Values(Values { schema: Arc::clone(schema), values: expr @@ -842,54 +851,63 @@ impl LogicalPlan { })) } LogicalPlan::Filter { .. } => { - assert_eq!(1, expr.len()); - let predicate = expr.pop().unwrap(); + let predicate = self.only_expr(expr)?; + let input = self.only_input(inputs)?; - Filter::try_new(predicate, Arc::new(inputs.swap_remove(0))) - .map(LogicalPlan::Filter) + Filter::try_new(predicate, Arc::new(input)).map(LogicalPlan::Filter) } LogicalPlan::Repartition(Repartition { partitioning_scheme, .. }) => match partitioning_scheme { Partitioning::RoundRobinBatch(n) => { + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; Ok(LogicalPlan::Repartition(Repartition { partitioning_scheme: Partitioning::RoundRobinBatch(*n), - input: Arc::new(inputs.swap_remove(0)), + input: Arc::new(input), + })) + } + Partitioning::Hash(_, n) => { + let input = self.only_input(inputs)?; + Ok(LogicalPlan::Repartition(Repartition { + partitioning_scheme: Partitioning::Hash(expr, *n), + input: Arc::new(input), })) } - Partitioning::Hash(_, n) => Ok(LogicalPlan::Repartition(Repartition { - partitioning_scheme: Partitioning::Hash(expr, *n), - input: Arc::new(inputs.swap_remove(0)), - })), Partitioning::DistributeBy(_) => { + let input = self.only_input(inputs)?; Ok(LogicalPlan::Repartition(Repartition { partitioning_scheme: Partitioning::DistributeBy(expr), - input: Arc::new(inputs.swap_remove(0)), + input: Arc::new(input), })) } }, LogicalPlan::Window(Window { window_expr, .. }) => { assert_eq!(window_expr.len(), expr.len()); - Window::try_new(expr, Arc::new(inputs.swap_remove(0))) - .map(LogicalPlan::Window) + let input = self.only_input(inputs)?; + Window::try_new(expr, Arc::new(input)).map(LogicalPlan::Window) } LogicalPlan::Aggregate(Aggregate { group_expr, .. }) => { + let input = self.only_input(inputs)?; // group exprs are the first expressions let agg_expr = expr.split_off(group_expr.len()); - Aggregate::try_new(Arc::new(inputs.swap_remove(0)), expr, agg_expr) + Aggregate::try_new(Arc::new(input), expr, agg_expr) .map(LogicalPlan::Aggregate) } LogicalPlan::Sort(Sort { expr: sort_expr, fetch, .. 
- }) => Ok(LogicalPlan::Sort(Sort { - expr: replace_sort_expressions(sort_expr.clone(), expr), - input: Arc::new(inputs.swap_remove(0)), - fetch: *fetch, - })), + }) => { + let input = self.only_input(inputs)?; + Ok(LogicalPlan::Sort(Sort { + expr: replace_sort_expressions(sort_expr.clone(), expr), + input: Arc::new(input), + fetch: *fetch, + })) + } LogicalPlan::Join(Join { join_type, join_constraint, on, null_equals_null, .. }) => { - let schema = - build_join_schema(inputs[0].schema(), inputs[1].schema(), join_type)?; + let (left, right) = self.only_two_inputs(inputs)?; + let schema = build_join_schema(left.schema(), right.schema(), join_type)?; let equi_expr_count = on.len(); assert!(expr.len() >= equi_expr_count); @@ -927,8 +945,8 @@ impl LogicalPlan { }).collect::<Result<Vec<_>>>()?; Ok(LogicalPlan::Join(Join { - left: Arc::new(inputs.swap_remove(0)), - right: Arc::new(inputs.swap_remove(0)), + left: Arc::new(left), + right: Arc::new(right), join_type: *join_type, join_constraint: *join_constraint, on: new_on, @@ -938,28 +956,34 @@ impl LogicalPlan { })) } LogicalPlan::CrossJoin(_) => { - let left = inputs.swap_remove(0); - let right = inputs.swap_remove(0); + self.assert_no_expressions(expr)?; + let (left, right) = self.only_two_inputs(inputs)?; LogicalPlanBuilder::from(left).cross_join(right)?.build() } LogicalPlan::Subquery(Subquery { outer_ref_columns, .. }) => { - let subquery = LogicalPlanBuilder::from(inputs.swap_remove(0)).build()?; + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; + let subquery = LogicalPlanBuilder::from(input).build()?; Ok(LogicalPlan::Subquery(Subquery { subquery: Arc::new(subquery), outer_ref_columns: outer_ref_columns.clone(), })) } LogicalPlan::SubqueryAlias(SubqueryAlias { alias, .. }) => { - SubqueryAlias::try_new(Arc::new(inputs.swap_remove(0)), alias.clone()) + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; + SubqueryAlias::try_new(Arc::new(input), alias.clone()) .map(LogicalPlan::SubqueryAlias) } LogicalPlan::Limit(Limit { skip, fetch, .. }) => { + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; Ok(LogicalPlan::Limit(Limit { skip: *skip, fetch: *fetch, - input: Arc::new(inputs.swap_remove(0)), + input: Arc::new(input), })) } LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(CreateMemoryTable { name, if_not_exists, or_replace, column_defaults, .. - })) => Ok(LogicalPlan::Ddl(DdlStatement::CreateMemoryTable( - CreateMemoryTable { - input: Arc::new(inputs.swap_remove(0)), - constraints: Constraints::empty(), - name: name.clone(), - if_not_exists: *if_not_exists, - or_replace: *or_replace, - column_defaults: column_defaults.clone(), - }, - ))), + })) => { + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; + Ok(LogicalPlan::Ddl(DdlStatement::CreateMemoryTable( + CreateMemoryTable { + input: Arc::new(input), + constraints: Constraints::empty(), + name: name.clone(), + if_not_exists: *if_not_exists, + or_replace: *or_replace, + column_defaults: column_defaults.clone(), + }, + ))) + } LogicalPlan::Ddl(DdlStatement::CreateView(CreateView { name, or_replace, definition, ..
- })) => Ok(LogicalPlan::Ddl(DdlStatement::CreateView(CreateView { - input: Arc::new(inputs.swap_remove(0)), - name: name.clone(), - or_replace: *or_replace, - definition: definition.clone(), - }))), + })) => { + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; + Ok(LogicalPlan::Ddl(DdlStatement::CreateView(CreateView { + input: Arc::new(input), + name: name.clone(), + or_replace: *or_replace, + definition: definition.clone(), + }))) + } LogicalPlan::Extension(e) => Ok(LogicalPlan::Extension(Extension { node: e.node.with_exprs_and_inputs(expr, inputs)?, })), LogicalPlan::Union(Union { schema, .. }) => { + self.assert_no_expressions(expr)?; let input_schema = inputs[0].schema(); // If inputs are not pruned do not change schema. let schema = if schema.fields().len() == input_schema.fields().len() { @@ -1007,12 +1040,17 @@ impl LogicalPlan { } LogicalPlan::Distinct(distinct) => { let distinct = match distinct { - Distinct::All(_) => Distinct::All(Arc::new(inputs.swap_remove(0))), + Distinct::All(_) => { + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; + Distinct::All(Arc::new(input)) + } Distinct::On(DistinctOn { on_expr, select_expr, .. }) => { + let input = self.only_input(inputs)?; let sort_expr = expr.split_off(on_expr.len() + select_expr.len()); let select_expr = expr.split_off(on_expr.len()); assert!(sort_expr.is_empty(), "with_new_exprs for Distinct does not support sort expressions"); @@ -1020,7 +1058,7 @@ impl LogicalPlan { expr, select_expr, None, // no sort expressions accepted - Arc::new(inputs.swap_remove(0)), + Arc::new(input), )?) } }; @@ -1028,30 +1066,31 @@ impl LogicalPlan { } LogicalPlan::RecursiveQuery(RecursiveQuery { name, is_distinct, .. - }) => Ok(LogicalPlan::RecursiveQuery(RecursiveQuery { - name: name.clone(), - static_term: Arc::new(inputs.swap_remove(0)), - recursive_term: Arc::new(inputs.swap_remove(0)), - is_distinct: *is_distinct, - })), + }) => { + self.assert_no_expressions(expr)?; + let (static_term, recursive_term) = self.only_two_inputs(inputs)?; + Ok(LogicalPlan::RecursiveQuery(RecursiveQuery { + name: name.clone(), + static_term: Arc::new(static_term), + recursive_term: Arc::new(recursive_term), + is_distinct: *is_distinct, + })) + } LogicalPlan::Analyze(a) => { - assert!(expr.is_empty()); - assert_eq!(inputs.len(), 1); + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; Ok(LogicalPlan::Analyze(Analyze { verbose: a.verbose, schema: Arc::clone(&a.schema), - input: Arc::new(inputs.swap_remove(0)), + input: Arc::new(input), })) } LogicalPlan::Explain(e) => { - assert!( - expr.is_empty(), - "Invalid EXPLAIN command. Expression should empty" - ); - assert_eq!(inputs.len(), 1, "Invalid EXPLAIN command. Inputs are empty"); + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; Ok(LogicalPlan::Explain(Explain { verbose: e.verbose, - plan: Arc::new(inputs.swap_remove(0)), + plan: Arc::new(input), stringified_plans: e.stringified_plans.clone(), schema: Arc::clone(&e.schema), logical_optimization_succeeded: e.logical_optimization_succeeded, @@ -1059,13 +1098,17 @@ impl LogicalPlan { } LogicalPlan::Prepare(Prepare { name, data_types, .. 
- }) => Ok(LogicalPlan::Prepare(Prepare { - name: name.clone(), - data_types: data_types.clone(), - input: Arc::new(inputs.swap_remove(0)), - })), + }) => { + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; + Ok(LogicalPlan::Prepare(Prepare { + name: name.clone(), + data_types: data_types.clone(), + input: Arc::new(input), + })) + } LogicalPlan::TableScan(ts) => { - assert!(inputs.is_empty(), "{self:?} should have no inputs"); + self.assert_no_inputs(inputs)?; Ok(LogicalPlan::TableScan(TableScan { filters: expr, ..ts.clone() @@ -1073,26 +1116,89 @@ impl LogicalPlan { } LogicalPlan::EmptyRelation(_) | LogicalPlan::Ddl(_) - | LogicalPlan::Statement(_) => { + | LogicalPlan::Statement(_) + | LogicalPlan::DescribeTable(_) => { // All of these plan types have no inputs / exprs so should not be called - assert!(expr.is_empty(), "{self:?} should have no exprs"); - assert!(inputs.is_empty(), "{self:?} should have no inputs"); + self.assert_no_expressions(expr)?; + self.assert_no_inputs(inputs)?; Ok(self.clone()) } - LogicalPlan::DescribeTable(_) => Ok(self.clone()), LogicalPlan::Unnest(Unnest { exec_columns: columns, options, .. }) => { + self.assert_no_expressions(expr)?; + let input = self.only_input(inputs)?; // Update schema with unnested column type. - let input = inputs.swap_remove(0); let new_plan = unnest_with_options(input, columns.clone(), options.clone())?; Ok(new_plan) } } } + + /// Helper for [Self::with_new_exprs] to use when no expressions are expected. + #[inline] + #[allow(clippy::needless_pass_by_value)] // expr is moved intentionally to ensure it's not used again + fn assert_no_expressions(&self, expr: Vec<Expr>) -> Result<()> { + if !expr.is_empty() { + return internal_err!("{self:?} should have no exprs, got {:?}", expr); + } + Ok(()) + } + + /// Helper for [Self::with_new_exprs] to use when no inputs are expected. + #[inline] + #[allow(clippy::needless_pass_by_value)] // inputs is moved intentionally to ensure it's not used again + fn assert_no_inputs(&self, inputs: Vec<LogicalPlan>) -> Result<()> { + if !inputs.is_empty() { + return internal_err!("{self:?} should have no inputs, got: {:?}", inputs); + } + Ok(()) + } + + /// Helper for [Self::with_new_exprs] to use when exactly one expression is expected. + #[inline] + fn only_expr(&self, mut expr: Vec<Expr>) -> Result<Expr> { + if expr.len() != 1 { + return internal_err!( + "{self:?} should have exactly one expr, got {:?}", + expr + ); + } + Ok(expr.remove(0)) + } + + /// Helper for [Self::with_new_exprs] to use when exactly one input is expected. + #[inline] + fn only_input(&self, mut inputs: Vec<LogicalPlan>) -> Result<LogicalPlan> { + if inputs.len() != 1 { + return internal_err!( + "{self:?} should have exactly one input, got {:?}", + inputs + ); + } + Ok(inputs.remove(0)) + } + + /// Helper for [Self::with_new_exprs] to use when exactly two inputs are expected. + #[inline] + fn only_two_inputs( + &self, + mut inputs: Vec<LogicalPlan>, + ) -> Result<(LogicalPlan, LogicalPlan)> { + if inputs.len() != 2 { + return internal_err!( + "{self:?} should have exactly two inputs, got {:?}", + inputs + ); + } + let right = inputs.remove(1); + let left = inputs.remove(0); + Ok((left, right)) + } + /// Replaces placeholder param values (like `$1`, `$2`) in [`LogicalPlan`] /// with the specified `param_values`.
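The helpers above turn what used to be `assert!` panics into recoverable planner errors that `with_new_exprs` can propagate with `?`. A minimal standalone sketch of the pattern, using stand-in types rather than DataFusion's `LogicalPlan` and `internal_err!`:

```rust
// Stand-ins for illustration only; DataFusion uses LogicalPlan, Expr, and
// the internal_err! macro instead of these hypothetical types.
#[derive(Debug, Clone)]
enum Plan {
    Limit,
    TableScan,
}

#[derive(Debug)]
struct InternalError(String);

type Result<T> = std::result::Result<T, InternalError>;

impl Plan {
    /// Like `only_input` above: returns the single input, or an error
    /// (instead of panicking) when the caller passes the wrong arity.
    fn only_input(&self, mut inputs: Vec<Plan>) -> Result<Plan> {
        if inputs.len() != 1 {
            return Err(InternalError(format!(
                "{self:?} should have exactly one input, got {inputs:?}"
            )));
        }
        Ok(inputs.remove(0))
    }
}

fn main() {
    let plan = Plan::Limit;
    // A wrong arity now surfaces as Err rather than aborting the process:
    assert!(plan.only_input(vec![]).is_err());
    assert!(plan.only_input(vec![Plan::TableScan]).is_ok());
}
```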
/// diff --git a/datafusion/functions/benches/repeat.rs b/datafusion/functions/benches/repeat.rs index 916c8374e5fb..e45313660ea2 100644 --- a/datafusion/functions/benches/repeat.rs +++ b/datafusion/functions/benches/repeat.rs @@ -67,7 +67,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args::<i32>(size, 32, repeat_times, true); group.bench_function( - &format!( + format!( "repeat_string_view [size={}, repeat_times={}]", size, repeat_times ), @@ -76,7 +76,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args::<i32>(size, 32, repeat_times, false); group.bench_function( - &format!( + format!( "repeat_string [size={}, repeat_times={}]", size, repeat_times ), @@ -85,7 +85,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args::<i64>(size, 32, repeat_times, false); group.bench_function( - &format!( + format!( "repeat_large_string [size={}, repeat_times={}]", size, repeat_times ), @@ -103,7 +103,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args::<i32>(size, 32, repeat_times, true); group.bench_function( - &format!( + format!( "repeat_string_view [size={}, repeat_times={}]", size, repeat_times ), @@ -112,7 +112,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args::<i32>(size, 32, repeat_times, false); group.bench_function( - &format!( + format!( "repeat_string [size={}, repeat_times={}]", size, repeat_times ), @@ -121,7 +121,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args::<i64>(size, 32, repeat_times, false); group.bench_function( - &format!( + format!( "repeat_large_string [size={}, repeat_times={}]", size, repeat_times ), diff --git a/datafusion/functions/benches/substr.rs b/datafusion/functions/benches/substr.rs index 14a3389da380..1a696520c3ad 100644 --- a/datafusion/functions/benches/substr.rs +++ b/datafusion/functions/benches/substr.rs @@ -106,19 +106,19 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args_without_count::<i32>(size, len, true, true); group.bench_function( - &format!("substr_string_view [size={}, strlen={}]", size, len), + format!("substr_string_view [size={}, strlen={}]", size, len), |b| b.iter(|| black_box(substr.invoke(&args))), ); let args = create_args_without_count::<i32>(size, len, false, false); group.bench_function( - &format!("substr_string [size={}, strlen={}]", size, len), + format!("substr_string [size={}, strlen={}]", size, len), |b| b.iter(|| black_box(substr.invoke(&args))), ); let args = create_args_without_count::<i64>(size, len, true, false); group.bench_function( - &format!("substr_large_string [size={}, strlen={}]", size, len), + format!("substr_large_string [size={}, strlen={}]", size, len), |b| b.iter(|| black_box(substr.invoke(&args))), ); @@ -133,7 +133,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args_with_count::<i32>(size, len, count, true); group.bench_function( - &format!( + format!( "substr_string_view [size={}, count={}, strlen={}]", size, count, len, ), @@ -142,7 +142,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args_with_count::<i32>(size, len, count, false); group.bench_function( - &format!( + format!( "substr_string [size={}, count={}, strlen={}]", size, count, len, ), @@ -151,7 +151,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args_with_count::<i64>(size, len, count, false); group.bench_function( - &format!( + format!( "substr_large_string [size={}, count={}, strlen={}]", size, count, len, ), @@ -169,7 +169,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args_with_count::<i32>(size,
len, count, true); group.bench_function( - &format!( + format!( "substr_string_view [size={}, count={}, strlen={}]", size, count, len, ), @@ -178,7 +178,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args_with_count::<i32>(size, len, count, false); group.bench_function( - &format!( + format!( "substr_string [size={}, count={}, strlen={}]", size, count, len, ), @@ -187,7 +187,7 @@ fn criterion_benchmark(c: &mut Criterion) { let args = create_args_with_count::<i64>(size, len, count, false); group.bench_function( - &format!( + format!( "substr_large_string [size={}, count={}, strlen={}]", size, count, len, ), diff --git a/datafusion/functions/src/regex/regexpreplace.rs b/datafusion/functions/src/regex/regexpreplace.rs index d28c6cd36d65..0b0f7287e1ec 100644 --- a/datafusion/functions/src/regex/regexpreplace.rs +++ b/datafusion/functions/src/regex/regexpreplace.rs @@ -401,8 +401,7 @@ fn _regexp_replace_static_pattern_replace( DataType::Utf8View => { let string_view_array = as_string_view_array(&args[0])?; - let mut builder = StringViewBuilder::with_capacity(string_view_array.len()) - .with_block_size(1024 * 1024 * 2); + let mut builder = StringViewBuilder::with_capacity(string_view_array.len()); for val in string_view_array.iter() { if let Some(val) = val { diff --git a/datafusion/physical-expr-common/src/binary_view_map.rs b/datafusion/physical-expr-common/src/binary_view_map.rs index 18bc6801aa60..bdcf7bbacc69 100644 --- a/datafusion/physical-expr-common/src/binary_view_map.rs +++ b/datafusion/physical-expr-common/src/binary_view_map.rs @@ -149,7 +149,7 @@ where output_type, map: hashbrown::raw::RawTable::with_capacity(INITIAL_MAP_CAPACITY), map_size: 0, - builder: GenericByteViewBuilder::new().with_block_size(2 * 1024 * 1024), + builder: GenericByteViewBuilder::new(), random_state: RandomState::new(), hashes_buffer: vec![], null: None, diff --git a/datafusion/physical-plan/src/coalesce/mod.rs b/datafusion/physical-plan/src/coalesce/mod.rs index ce5a1e53abfd..46875fae94fc 100644 --- a/datafusion/physical-plan/src/coalesce/mod.rs +++ b/datafusion/physical-plan/src/coalesce/mod.rs @@ -248,7 +248,7 @@ fn gc_string_view_batch(batch: &RecordBatch) -> RecordBatch { // See https://github.com/apache/arrow-rs/issues/6094 for more details.
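The removed `with_block_size` calls above reflect an arrow 53 builder change: the view builders now size their buffers automatically, and the old knob survives as the renamed `with_fixed_block_size` (used just below, where `gc_string_view_batch` has computed an ideal size). A hedged sketch of the two modes, assuming arrow 53:

```rust
use arrow::array::StringViewBuilder;

fn main() {
    // Default since arrow 53: the builder manages block sizes itself, so
    // the explicit 2 MiB `with_block_size` calls are simply dropped.
    let mut builder = StringViewBuilder::with_capacity(100);
    builder.append_value("hello");
    let _auto_sized = builder.finish();

    // Opt in to fixed sizing via the renamed API when the caller knows
    // the ideal buffer size up front.
    let mut builder = StringViewBuilder::with_capacity(100).with_fixed_block_size(8192);
    builder.append_value("world");
    let _fixed_sized = builder.finish();
}
```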
let mut builder = StringViewBuilder::with_capacity(s.len()); if ideal_buffer_size > 0 { - builder = builder.with_block_size(ideal_buffer_size as u32); + builder = builder.with_fixed_block_size(ideal_buffer_size as u32); } for v in s.iter() { @@ -580,7 +580,8 @@ mod tests { impl StringViewTest { /// Create a `StringViewArray` with the parameters specified in this struct fn build(self) -> StringViewArray { - let mut builder = StringViewBuilder::with_capacity(100).with_block_size(8192); + let mut builder = + StringViewBuilder::with_capacity(100).with_fixed_block_size(8192); loop { for &v in self.strings.iter() { builder.append_option(v); diff --git a/datafusion/proto-common/Cargo.toml b/datafusion/proto-common/Cargo.toml index 33a7ecd2daab..7ba503171520 100644 --- a/datafusion/proto-common/Cargo.toml +++ b/datafusion/proto-common/Cargo.toml @@ -44,7 +44,7 @@ arrow = { workspace = true } chrono = { workspace = true } datafusion-common = { workspace = true } object_store = { workspace = true } -pbjson = { version = "0.6.0", optional = true } +pbjson = { workspace = true, optional = true } prost = { workspace = true } serde = { version = "1.0", optional = true } serde_json = { workspace = true, optional = true } diff --git a/datafusion/proto-common/gen/Cargo.toml b/datafusion/proto-common/gen/Cargo.toml index 54ec0e44694b..cca49dba7ed3 100644 --- a/datafusion/proto-common/gen/Cargo.toml +++ b/datafusion/proto-common/gen/Cargo.toml @@ -34,5 +34,5 @@ workspace = true [dependencies] # Pin these dependencies so that the generated output is deterministic -pbjson-build = "=0.6.2" -prost-build = "=0.12.6" +pbjson-build = "=0.7.0" +prost-build = "=0.13.1" diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index 9268ccca0b70..51e94d2caaf4 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -423,6 +423,7 @@ message CsvOptions { bytes comment = 13; // Optional comment character as a byte bytes double_quote = 14; // Indicates if quotes are doubled bytes newlines_in_values = 15; // Indicates if newlines are supported in values + bytes terminator = 16; // Optional terminator character as a byte } // Options controlling CSV format diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index feb4c11aa809..45d275fb488e 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -863,6 +863,7 @@ impl TryFrom<&protobuf::CsvOptions> for CsvOptions { has_header: proto_opts.has_header.first().map(|h| *h != 0), delimiter: proto_opts.delimiter[0], quote: proto_opts.quote[0], + terminator: proto_opts.terminator.first().copied(), escape: proto_opts.escape.first().copied(), double_quote: proto_opts.has_header.first().map(|h| *h != 0), newlines_in_values: proto_opts.newlines_in_values.first().map(|h| *h != 0), diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index 05e57f5585a6..78ba829f8c50 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -1542,6 +1542,9 @@ impl serde::Serialize for CsvOptions { if !self.newlines_in_values.is_empty() { len += 1; } + if !self.terminator.is_empty() { + len += 1; + } let mut struct_ser = serializer.serialize_struct("datafusion_common.CsvOptions", len)?; if !self.has_header.is_empty() { 
#[allow(clippy::needless_borrow)] @@ -1598,6 +1601,10 @@ impl serde::Serialize for CsvOptions { #[allow(clippy::needless_borrow)] struct_ser.serialize_field("newlinesInValues", pbjson::private::base64::encode(&self.newlines_in_values).as_str())?; } + if !self.terminator.is_empty() { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("terminator", pbjson::private::base64::encode(&self.terminator).as_str())?; + } struct_ser.end() } } @@ -1633,6 +1640,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { "doubleQuote", "newlines_in_values", "newlinesInValues", + "terminator", ]; #[allow(clippy::enum_variant_names)] @@ -1652,6 +1660,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { Comment, DoubleQuote, NewlinesInValues, + Terminator, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -1688,6 +1697,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { "comment" => Ok(GeneratedField::Comment), "doubleQuote" | "double_quote" => Ok(GeneratedField::DoubleQuote), "newlinesInValues" | "newlines_in_values" => Ok(GeneratedField::NewlinesInValues), + "terminator" => Ok(GeneratedField::Terminator), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -1722,6 +1732,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { let mut comment__ = None; let mut double_quote__ = None; let mut newlines_in_values__ = None; + let mut terminator__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::HasHeader => { @@ -1830,6 +1841,14 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) ; } + GeneratedField::Terminator => { + if terminator__.is_some() { + return Err(serde::de::Error::duplicate_field("terminator")); + } + terminator__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; + } } } Ok(CsvOptions { @@ -1848,6 +1867,7 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { comment: comment__.unwrap_or_default(), double_quote: double_quote__.unwrap_or_default(), newlines_in_values: newlines_in_values__.unwrap_or_default(), + terminator: terminator__.unwrap_or_default(), }) } } diff --git a/datafusion/proto-common/src/generated/prost.rs b/datafusion/proto-common/src/generated/prost.rs index ebc05718a458..cb8f86a022a6 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -652,6 +652,9 @@ pub struct CsvOptions { /// Indicates if newlines are supported in values #[prost(bytes = "vec", tag = "15")] pub newlines_in_values: ::prost::alloc::vec::Vec, + /// Optional terminator character as a byte + #[prost(bytes = "vec", tag = "16")] + pub terminator: ::prost::alloc::vec::Vec, } /// Options controlling CSV format #[allow(clippy::derive_partial_eq_without_eq)] diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 4cf7e73ac912..3718ccbb0f85 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -910,6 +910,7 @@ impl TryFrom<&CsvOptions> for protobuf::CsvOptions { has_header: opts.has_header.map_or_else(Vec::new, |h| vec![h as u8]), delimiter: vec![opts.delimiter], quote: vec![opts.quote], + terminator: opts.terminator.map_or_else(Vec::new, |e| vec![e]), escape: opts.escape.map_or_else(Vec::new, |e| vec![e]), double_quote: opts.double_quote.map_or_else(Vec::new, |h| vec![h as u8]), newlines_in_values: opts diff --git 
a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index 2804ed019b61..32678246c005 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -52,7 +52,7 @@ datafusion-common = { workspace = true, default-features = true } datafusion-expr = { workspace = true } datafusion-proto-common = { workspace = true } object_store = { workspace = true } -pbjson = { version = "0.6.0", optional = true } +pbjson = { workspace = true, optional = true } prost = { workspace = true } serde = { version = "1.0", optional = true } serde_json = { workspace = true, optional = true } diff --git a/datafusion/proto/gen/Cargo.toml b/datafusion/proto/gen/Cargo.toml index 401c51c94563..1dc5f7e0dddc 100644 --- a/datafusion/proto/gen/Cargo.toml +++ b/datafusion/proto/gen/Cargo.toml @@ -34,5 +34,5 @@ workspace = true [dependencies] # Pin these dependencies so that the generated output is deterministic -pbjson-build = "=0.6.2" -prost-build = "=0.12.6" +pbjson-build = "=0.7.0" +prost-build = "=0.13.1" diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index ebc05718a458..dc8d0017d3fd 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -45,10 +45,10 @@ pub struct ParquetFormat { pub options: ::core::option::Option, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct AvroFormat {} #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct NdJsonFormat { #[prost(message, optional, tag = "1")] pub options: ::core::option::Option, @@ -89,10 +89,10 @@ pub struct Constraints { pub constraints: ::prost::alloc::vec::Vec, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct AvroOptions {} #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct ArrowOptions {} #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] @@ -137,7 +137,7 @@ pub struct Timestamp { pub timezone: ::prost::alloc::string::String, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct Decimal { #[prost(uint32, tag = "3")] pub precision: u32, @@ -145,7 +145,7 @@ pub struct Decimal { pub scale: i32, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct Decimal256Type { #[prost(uint32, tag = "3")] pub precision: u32, @@ -223,7 +223,7 @@ pub mod scalar_nested_value { } } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct ScalarTime32Value { #[prost(oneof = "scalar_time32_value::Value", tags = "1, 2")] pub value: ::core::option::Option, @@ -231,7 +231,7 @@ pub struct ScalarTime32Value { /// Nested message and enum types in `ScalarTime32Value`. 
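The repeated derive churn in this generated file comes from prost-build 0.13, which now adds `Copy` to messages whose fields are all `Copy`. A stand-in illustration (hypothetical struct, not the generated code itself) of what that buys callers:

```rust
// Mirrors the shape of the generated `Decimal` message above; with Copy
// derived, values duplicate implicitly instead of requiring .clone().
#[derive(Clone, Copy, PartialEq, Debug)]
struct Decimal {
    precision: u32,
    scale: i32,
}

fn main() {
    let d = Decimal { precision: 38, scale: 10 };
    let widened = Decimal { scale: d.scale + 2, ..d }; // copies `d`
    assert_eq!(d.precision, widened.precision); // `d` is still usable
}
```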
pub mod scalar_time32_value { #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum Value { #[prost(int32, tag = "1")] Time32SecondValue(i32), @@ -240,7 +240,7 @@ pub mod scalar_time32_value { } } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct ScalarTime64Value { #[prost(oneof = "scalar_time64_value::Value", tags = "1, 2")] pub value: ::core::option::Option, @@ -248,7 +248,7 @@ pub struct ScalarTime64Value { /// Nested message and enum types in `ScalarTime64Value`. pub mod scalar_time64_value { #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum Value { #[prost(int64, tag = "1")] Time64MicrosecondValue(i64), @@ -267,7 +267,7 @@ pub struct ScalarTimestampValue { /// Nested message and enum types in `ScalarTimestampValue`. pub mod scalar_timestamp_value { #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum Value { #[prost(int64, tag = "1")] TimeMicrosecondValue(i64), @@ -288,7 +288,7 @@ pub struct ScalarDictionaryValue { pub value: ::core::option::Option<::prost::alloc::boxed::Box>, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct IntervalDayTimeValue { #[prost(int32, tag = "1")] pub days: i32, @@ -296,7 +296,7 @@ pub struct IntervalDayTimeValue { pub milliseconds: i32, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct IntervalMonthDayNanoValue { #[prost(int32, tag = "1")] pub months: i32, @@ -558,10 +558,10 @@ pub mod arrow_type { /// } /// } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct EmptyMessage {} #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct JsonWriterOptions { #[prost(enumeration = "CompressionTypeVariant", tag = "1")] pub compression: i32, @@ -652,10 +652,13 @@ pub struct CsvOptions { /// Indicates if newlines are supported in values #[prost(bytes = "vec", tag = "15")] pub newlines_in_values: ::prost::alloc::vec::Vec, + /// Optional terminator character as a byte + #[prost(bytes = "vec", tag = "16")] + pub terminator: ::prost::alloc::vec::Vec, } /// Options controlling CSV format #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct JsonOptions { /// Compression type #[prost(enumeration = "CompressionTypeVariant", tag = "1")] @@ -720,7 +723,7 @@ pub struct ParquetColumnOptions { /// Nested message and enum types in `ParquetColumnOptions`. 
pub mod parquet_column_options { #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum BloomFilterEnabledOpt { #[prost(bool, tag = "1")] BloomFilterEnabled(bool), @@ -732,7 +735,7 @@ pub mod parquet_column_options { Encoding(::prost::alloc::string::String), } #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum DictionaryEnabledOpt { #[prost(bool, tag = "3")] DictionaryEnabled(bool), @@ -750,19 +753,19 @@ pub mod parquet_column_options { StatisticsEnabled(::prost::alloc::string::String), } #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum BloomFilterFppOpt { #[prost(double, tag = "6")] BloomFilterFpp(f64), } #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum BloomFilterNdvOpt { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum MaxStatisticsSizeOpt { #[prost(uint32, tag = "8")] MaxStatisticsSize(u32), @@ -857,7 +860,7 @@ pub struct ParquetOptions { /// Nested message and enum types in `ParquetOptions`. pub mod parquet_options { #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum MetadataSizeHintOpt { #[prost(uint64, tag = "4")] MetadataSizeHint(u64), @@ -869,7 +872,7 @@ pub mod parquet_options { Compression(::prost::alloc::string::String), } #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum DictionaryEnabledOpt { #[prost(bool, tag = "11")] DictionaryEnabled(bool), @@ -881,13 +884,13 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum MaxStatisticsSizeOpt { #[prost(uint64, tag = "14")] MaxStatisticsSize(u64), } #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), @@ -899,13 +902,13 @@ pub mod parquet_options { Encoding(::prost::alloc::string::String), } #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum BloomFilterFppOpt { #[prost(double, tag = "21")] BloomFilterFpp(f64), } #[allow(clippy::derive_partial_eq_without_eq)] - #[derive(Clone, PartialEq, ::prost::Oneof)] + #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum BloomFilterNdvOpt { #[prost(uint64, tag = "22")] BloomFilterNdv(u64), diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index cff58d3ddc4a..1c433c2572c4 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -12,6 +12,7 @@ impl serde::Serialize for AggLimit { let mut struct_ser = serializer.serialize_struct("datafusion.AggLimit", len)?; if self.limit 
!= 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("limit", ToString::to_string(&self.limit).as_str())?; } struct_ser.end() @@ -613,6 +614,7 @@ impl serde::Serialize for AggregateUdfExprNode { } if let Some(v) = self.fun_definition.as_ref() { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?; } struct_ser.end() @@ -2348,6 +2350,7 @@ impl serde::Serialize for CopyToNode { } if !self.file_type.is_empty() { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("fileType", pbjson::private::base64::encode(&self.file_type).as_str())?; } if !self.partition_by.is_empty() { @@ -3953,6 +3956,7 @@ impl serde::Serialize for CustomTableScanNode { } if !self.custom_table_data.is_empty() { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("customTableData", pbjson::private::base64::encode(&self.custom_table_data).as_str())?; } struct_ser.end() @@ -5037,10 +5041,12 @@ impl serde::Serialize for FileRange { let mut struct_ser = serializer.serialize_struct("datafusion.FileRange", len)?; if self.start != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("start", ToString::to_string(&self.start).as_str())?; } if self.end != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("end", ToString::to_string(&self.end).as_str())?; } struct_ser.end() @@ -5922,6 +5928,7 @@ impl serde::Serialize for GlobalLimitExecNode { } if self.fetch != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("fetch", ToString::to_string(&self.fetch).as_str())?; } struct_ser.end() @@ -6357,6 +6364,7 @@ impl serde::Serialize for HashRepartition { } if self.partition_count != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("partitionCount", ToString::to_string(&self.partition_count).as_str())?; } struct_ser.end() @@ -8409,10 +8417,12 @@ impl serde::Serialize for LimitNode { } if self.skip != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("skip", ToString::to_string(&self.skip).as_str())?; } if self.fetch != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("fetch", ToString::to_string(&self.fetch).as_str())?; } struct_ser.end() @@ -9860,6 +9870,7 @@ impl serde::Serialize for LogicalExtensionNode { let mut struct_ser = serializer.serialize_struct("datafusion.LogicalExtensionNode", len)?; if !self.node.is_empty() { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("node", pbjson::private::base64::encode(&self.node).as_str())?; } if !self.inputs.is_empty() { @@ -11982,14 +11993,17 @@ impl serde::Serialize for PartitionStats { let mut struct_ser = serializer.serialize_struct("datafusion.PartitionStats", len)?; if self.num_rows != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("numRows", ToString::to_string(&self.num_rows).as_str())?; } if self.num_batches != 0 { 
#[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("numBatches", ToString::to_string(&self.num_batches).as_str())?; } if self.num_bytes != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("numBytes", ToString::to_string(&self.num_bytes).as_str())?; } if !self.column_stats.is_empty() { @@ -12146,10 +12160,12 @@ impl serde::Serialize for PartitionedFile { } if self.size != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("size", ToString::to_string(&self.size).as_str())?; } if self.last_modified_ns != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("lastModifiedNs", ToString::to_string(&self.last_modified_ns).as_str())?; } if !self.partition_values.is_empty() { @@ -12314,6 +12330,7 @@ impl serde::Serialize for Partitioning { match v { partitioning::PartitionMethod::RoundRobin(v) => { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("roundRobin", ToString::to_string(&v).as_str())?; } partitioning::PartitionMethod::Hash(v) => { @@ -12321,6 +12338,7 @@ impl serde::Serialize for Partitioning { } partitioning::PartitionMethod::Unknown(v) => { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("unknown", ToString::to_string(&v).as_str())?; } } @@ -12462,6 +12480,7 @@ impl serde::Serialize for PhysicalAggregateExprNode { } if let Some(v) = self.fun_definition.as_ref() { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?; } if let Some(v) = self.aggregate_function.as_ref() { @@ -13644,6 +13663,7 @@ impl serde::Serialize for PhysicalExtensionExprNode { let mut struct_ser = serializer.serialize_struct("datafusion.PhysicalExtensionExprNode", len)?; if !self.expr.is_empty() { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("expr", pbjson::private::base64::encode(&self.expr).as_str())?; } if !self.inputs.is_empty() { @@ -13755,6 +13775,7 @@ impl serde::Serialize for PhysicalExtensionNode { let mut struct_ser = serializer.serialize_struct("datafusion.PhysicalExtensionNode", len)?; if !self.node.is_empty() { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("node", pbjson::private::base64::encode(&self.node).as_str())?; } if !self.inputs.is_empty() { @@ -13869,6 +13890,7 @@ impl serde::Serialize for PhysicalHashRepartition { } if self.partition_count != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("partitionCount", ToString::to_string(&self.partition_count).as_str())?; } struct_ser.end() @@ -15085,6 +15107,7 @@ impl serde::Serialize for PhysicalScalarUdfNode { } if let Some(v) = self.fun_definition.as_ref() { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?; } if let Some(v) = self.return_type.as_ref() { @@ -15687,6 +15710,7 @@ impl serde::Serialize for PhysicalWindowExprNode { } if let Some(v) = self.fun_definition.as_ref() { 
#[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?; } if let Some(v) = self.window_function.as_ref() { @@ -16901,6 +16925,7 @@ impl serde::Serialize for RepartitionNode { match v { repartition_node::PartitionMethod::RoundRobin(v) => { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("roundRobin", ToString::to_string(&v).as_str())?; } repartition_node::PartitionMethod::Hash(v) => { @@ -17123,6 +17148,7 @@ impl serde::Serialize for ScalarUdfExprNode { } if let Some(v) = self.fun_definition.as_ref() { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?; } struct_ser.end() @@ -17691,6 +17717,7 @@ impl serde::Serialize for SortExecNode { } if self.fetch != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("fetch", ToString::to_string(&self.fetch).as_str())?; } if self.preserve_partitioning { @@ -18052,6 +18079,7 @@ impl serde::Serialize for SortNode { } if self.fetch != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("fetch", ToString::to_string(&self.fetch).as_str())?; } struct_ser.end() @@ -18180,6 +18208,7 @@ impl serde::Serialize for SortPreservingMergeExecNode { } if self.fetch != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("fetch", ToString::to_string(&self.fetch).as_str())?; } struct_ser.end() @@ -19625,6 +19654,7 @@ impl serde::Serialize for ValuesNode { let mut struct_ser = serializer.serialize_struct("datafusion.ValuesNode", len)?; if self.n_cols != 0 { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("nCols", ToString::to_string(&self.n_cols).as_str())?; } if !self.values_list.is_empty() { @@ -20299,6 +20329,7 @@ impl serde::Serialize for WindowExprNode { } if let Some(v) = self.fun_definition.as_ref() { #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] struct_ser.serialize_field("funDefinition", pbjson::private::base64::encode(&v).as_str())?; } if let Some(v) = self.window_function.as_ref() { diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 2ce8004e3248..dbcf7672a48c 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -239,7 +239,7 @@ pub struct HashRepartition { pub partition_count: u64, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct EmptyRelationNode { #[prost(bool, tag = "1")] pub produce_one_row: bool, @@ -443,7 +443,7 @@ pub struct UnnestNode { pub options: ::core::option::Option, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct UnnestOptions { #[prost(bool, tag = "1")] pub preserve_nulls: bool, @@ -912,7 +912,7 @@ pub struct WindowFrameBound { pub bound_value: ::core::option::Option, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] 
pub struct FixedSizeBinary { #[prost(int32, tag = "1")] pub length: i32, @@ -1474,7 +1474,7 @@ pub struct FileGroup { pub files: ::prost::alloc::vec::Vec, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct ScanLimit { /// wrap into a message to make it optional #[prost(uint32, tag = "1")] @@ -1721,7 +1721,7 @@ pub struct MaybePhysicalSortExprs { pub sort_expr: ::prost::alloc::vec::Vec, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct AggLimit { /// wrap into a message to make it optional #[prost(uint64, tag = "1")] @@ -1877,7 +1877,7 @@ pub struct JoinFilter { pub schema: ::core::option::Option, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct ColumnIndex { #[prost(uint32, tag = "1")] pub index: u32, @@ -1903,7 +1903,7 @@ pub struct PartitionedFile { pub statistics: ::core::option::Option, } #[allow(clippy::derive_partial_eq_without_eq)] -#[derive(Clone, PartialEq, ::prost::Message)] +#[derive(Clone, Copy, PartialEq, ::prost::Message)] pub struct FileRange { #[prost(int64, tag = "1")] pub start: i64, diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 607a3d8642fd..2e3476da6ac0 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -53,6 +53,7 @@ impl CsvOptionsProto { has_header: options.has_header.map_or(vec![], |v| vec![v as u8]), delimiter: vec![options.delimiter], quote: vec![options.quote], + terminator: options.terminator.map_or(vec![], |v| vec![v]), escape: options.escape.map_or(vec![], |v| vec![v]), double_quote: options.double_quote.map_or(vec![], |v| vec![v as u8]), compression: options.compression as i32, @@ -87,6 +88,11 @@ impl From<&CsvOptionsProto> for CsvOptions { }, delimiter: proto.delimiter.first().copied().unwrap_or(b','), quote: proto.quote.first().copied().unwrap_or(b'"'), + terminator: if !proto.terminator.is_empty() { + Some(proto.terminator[0]) + } else { + None + }, escape: if !proto.escape.is_empty() { Some(proto.escape[0]) } else { diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index fe34d87bfeae..516833a39f1e 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -2083,49 +2083,49 @@ mod tests { "1 YEAR 1 MONTH 1 DAY 3 HOUR 10 MINUTE 20 SECOND", ), IntervalStyle::PostgresVerbose, - r#"INTERVAL '0 YEARS 13 MONS 1 DAYS 3 HOURS 10 MINS 20.000000000 SECS'"#, + r#"INTERVAL '13 MONS 1 DAYS 3 HOURS 10 MINS 20.000000000 SECS'"#, ), ( interval_month_day_nano_lit("1.5 MONTH"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '0 YEARS 1 MONS 15 DAYS 0 HOURS 0 MINS 0.000000000 SECS'"#, + r#"INTERVAL '1 MONS 15 DAYS'"#, ), ( interval_month_day_nano_lit("-3 MONTH"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '0 YEARS -3 MONS 0 DAYS 0 HOURS 0 MINS 0.000000000 SECS'"#, + r#"INTERVAL '-3 MONS'"#, ), ( interval_month_day_nano_lit("1 MONTH") .add(interval_month_day_nano_lit("1 DAY")), IntervalStyle::PostgresVerbose, - r#"(INTERVAL '0 YEARS 1 MONS 0 DAYS 0 HOURS 0 MINS 0.000000000 SECS' + INTERVAL '0 YEARS 0 MONS 1 DAYS 0 HOURS 0 MINS 0.000000000 SECS')"#, + r#"(INTERVAL '1 MONS' + INTERVAL '1 DAYS')"#, ), ( interval_month_day_nano_lit("1 
MONTH") .sub(interval_month_day_nano_lit("1 DAY")), IntervalStyle::PostgresVerbose, - r#"(INTERVAL '0 YEARS 1 MONS 0 DAYS 0 HOURS 0 MINS 0.000000000 SECS' - INTERVAL '0 YEARS 0 MONS 1 DAYS 0 HOURS 0 MINS 0.000000000 SECS')"#, + r#"(INTERVAL '1 MONS' - INTERVAL '1 DAYS')"#, ), ( interval_datetime_lit("10 DAY 1 HOUR 10 MINUTE 20 SECOND"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '0 YEARS 0 MONS 10 DAYS 1 HOURS 10 MINS 20.000 SECS'"#, + r#"INTERVAL '10 DAYS 1 HOURS 10 MINS 20.000 SECS'"#, ), ( interval_datetime_lit("10 DAY 1.5 HOUR 10 MINUTE 20 SECOND"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '0 YEARS 0 MONS 10 DAYS 1 HOURS 40 MINS 20.000 SECS'"#, + r#"INTERVAL '10 DAYS 1 HOURS 40 MINS 20.000 SECS'"#, ), ( interval_year_month_lit("1 YEAR 1 MONTH"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '1 YEARS 1 MONS 0 DAYS 0 HOURS 0 MINS 0.00 SECS'"#, + r#"INTERVAL '1 YEARS 1 MONS'"#, ), ( interval_year_month_lit("1.5 YEAR 1 MONTH"), IntervalStyle::PostgresVerbose, - r#"INTERVAL '1 YEARS 7 MONS 0 DAYS 0 HOURS 0 MINS 0.00 SECS'"#, + r#"INTERVAL '1 YEARS 7 MONS'"#, ), ( interval_year_month_lit("1 YEAR 1 MONTH"), diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index c32acecaae5f..2531795a1630 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -268,8 +268,8 @@ pub(crate) fn value_to_string(value: &Value) -> Option { Value::DollarQuotedString(s) => Some(s.to_string()), Value::Number(_, _) | Value::Boolean(_) => Some(value.to_string()), Value::UnicodeStringLiteral(s) => Some(s.to_string()), + Value::EscapedStringLiteral(s) => Some(s.to_string()), Value::DoubleQuotedString(_) - | Value::EscapedStringLiteral(_) | Value::NationalStringLiteral(_) | Value::SingleQuotedByteStringLiteral(_) | Value::DoubleQuotedByteStringLiteral(_) diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs index cdc7bef06afd..d4e189f5f66a 100644 --- a/datafusion/sql/tests/cases/plan_to_sql.rs +++ b/datafusion/sql/tests/cases/plan_to_sql.rs @@ -611,7 +611,7 @@ fn sql_round_trip(query: &str, expect: &str) { fn test_interval_lhs_eq() { sql_round_trip( "select interval '2 seconds' = interval '2 seconds'", - "SELECT (INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS' = INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS')", + "SELECT (INTERVAL '2.000000000 SECS' = INTERVAL '2.000000000 SECS')", ); } @@ -619,6 +619,6 @@ fn test_interval_lhs_eq() { fn test_interval_lhs_lt() { sql_round_trip( "select interval '2 seconds' < interval '2 seconds'", - "SELECT (INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS' < INTERVAL '0 YEARS 0 MONS 0 DAYS 0 HOURS 0 MINS 2.000000000 SECS')", + "SELECT (INTERVAL '2.000000000 SECS' < INTERVAL '2.000000000 SECS')", ); } diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index c52445c561ee..83f4e4f03055 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -1916,7 +1916,7 @@ from values (interval '2 month 15 days'), (interval '-2 month') ---- -Interval(MonthDayNano) 0 years -2 mons 0 days 0 hours 0 mins 0.000000000 secs 0 years 2 mons 15 days 0 hours 0 mins 0.000000000 secs +Interval(MonthDayNano) -2 mons 2 mons 15 days # aggregate Interval(DayTime) min/max query T?? 
@@ -1927,7 +1927,7 @@ from values (arrow_cast('-3 minutes', 'Interval(DayTime)')), (arrow_cast('30 minutes', 'Interval(DayTime)')); ---- -Interval(DayTime) 0 years 0 mons 0 days 0 hours -3 mins 0.000 secs 0 years 0 mons 0 days 1 hours 0 mins 0.000 secs +Interval(DayTime) -3 mins 1 hours # aggregate Interval(YearMonth) min/max query T?? @@ -1938,7 +1938,7 @@ from values (arrow_cast('13 months', 'Interval(YearMonth)')), (arrow_cast('1 year', 'Interval(YearMonth)')); ---- -Interval(YearMonth) -1 years 0 mons 0 days 0 hours 0 mins 0.00 secs 1 years 1 mons 0 days 0 hours 0 mins 0.00 secs +Interval(YearMonth) -1 years 0 mons 1 years 1 mons # aggregate query II diff --git a/datafusion/sqllogictest/test_files/arrow_typeof.slt b/datafusion/sqllogictest/test_files/arrow_typeof.slt index bae6dc33c8cf..77b10b41ccb3 100644 --- a/datafusion/sqllogictest/test_files/arrow_typeof.slt +++ b/datafusion/sqllogictest/test_files/arrow_typeof.slt @@ -290,22 +290,22 @@ query ? --- select arrow_cast(interval '30 minutes', 'Interval(MonthDayNano)'); ---- -0 years 0 mons 0 days 0 hours 30 mins 0.000000000 secs +30 mins query ? select arrow_cast('30 minutes', 'Interval(DayTime)'); ---- -0 years 0 mons 0 days 0 hours 30 mins 0.000 secs +30 mins query ? select arrow_cast('1 year 5 months', 'Interval(YearMonth)'); ---- -1 years 5 mons 0 days 0 hours 0 mins 0.00 secs +1 years 5 mons query ? select arrow_cast('30 minutes', 'Interval(MonthDayNano)'); ---- -0 years 0 mons 0 days 0 hours 30 mins 0.000000000 secs +30 mins ## Duration @@ -432,5 +432,7 @@ MyAwesomeString Utf8View # Fails until we update to use the arrow-cast release with support for casting utf8 types to BinaryView # refer to merge commit https://github.com/apache/arrow-rs/commit/4bd737dab2aa17aca200259347909d48ed793ba1 -query error DataFusion error: This feature is not implemented: Unsupported CAST from Utf8 to BinaryView +query ?T select arrow_cast('MyAwesomeString', 'BinaryView'), arrow_typeof(arrow_cast('MyAwesomeString', 'BinaryView')) +---- +4d79417765736f6d65537472696e67 BinaryView diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index 7cb21abdba10..d6600e06dc1c 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -336,3 +336,24 @@ id message 05)good test 4 unquoted value end + +statement ok +CREATE EXTERNAL TABLE stored_table_with_cr_terminator ( +col1 TEXT, +col2 TEXT +) STORED AS CSV +LOCATION '../core/tests/data/cr_terminator.csv' +OPTIONS ('format.terminator' E'\r', 'format.has_header' 'true'); + +# TODO: this query should pass, but it currently fails with: External error: query failed: DataFusion error: Object Store error: Generic LocalFileSystem error: Requested range was invalid +# See the issue: https://github.com/apache/datafusion/issues/12328 +# query TT +# select * from stored_table_with_cr_terminator; +# ---- +# id0 value0 +# id1 value1 +# id2 value2 +# id3 value3 + +statement ok +drop table stored_table_with_cr_terminator; diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 7164425fc0f5..21edb458fe56 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -710,7 +710,7 @@ create table t (i interval, x int) as values (interval '5 days 3 nanoseconds', C query ?I select * from t; ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000003 secs 1 +5 days 0.000000003 secs 1 statement ok drop table t; diff --git
a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 81ae60f3ba93..002e8db2132d 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -122,203 +122,203 @@ SELECT query ? SELECT interval '1' ---- -0 years 0 mons 0 days 0 hours 0 mins 1.000000000 secs +1.000000000 secs query ? SELECT interval '1 second' ---- -0 years 0 mons 0 days 0 hours 0 mins 1.000000000 secs +1.000000000 secs query ? SELECT interval '500 milliseconds' ---- -0 years 0 mons 0 days 0 hours 0 mins 0.500000000 secs +0.500000000 secs query ? SELECT interval '5 second' ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs query ? SELECT interval '0.5 minute' ---- -0 years 0 mons 0 days 0 hours 0 mins 30.000000000 secs +30.000000000 secs query ? SELECT interval '.5 minute' ---- -0 years 0 mons 0 days 0 hours 0 mins 30.000000000 secs +30.000000000 secs query ? SELECT interval '5 minute' ---- -0 years 0 mons 0 days 0 hours 5 mins 0.000000000 secs +5 mins query ? SELECT interval '5 minute 1 second' ---- -0 years 0 mons 0 days 0 hours 5 mins 1.000000000 secs +5 mins 1.000000000 secs query ? SELECT interval '1 hour' ---- -0 years 0 mons 0 days 1 hours 0 mins 0.000000000 secs +1 hours query ? SELECT interval '5 hour' ---- -0 years 0 mons 0 days 5 hours 0 mins 0.000000000 secs +5 hours query ? SELECT interval '1 day' ---- -0 years 0 mons 1 days 0 hours 0 mins 0.000000000 secs +1 days query ? SELECT interval '1 week' ---- -0 years 0 mons 7 days 0 hours 0 mins 0.000000000 secs +7 days query ? SELECT interval '2 weeks' ---- -0 years 0 mons 14 days 0 hours 0 mins 0.000000000 secs +14 days query ? SELECT interval '1 day 1' ---- -0 years 0 mons 1 days 0 hours 0 mins 1.000000000 secs +1 days 1.000000000 secs query ? SELECT interval '0.5' ---- -0 years 0 mons 0 days 0 hours 0 mins 0.500000000 secs +0.500000000 secs query ? SELECT interval '0.5 day 1' ---- -0 years 0 mons 0 days 12 hours 0 mins 1.000000000 secs +12 hours 1.000000000 secs query ? SELECT interval '0.49 day' ---- -0 years 0 mons 0 days 11 hours 45 mins 36.000000000 secs +11 hours 45 mins 36.000000000 secs query ? SELECT interval '0.499 day' ---- -0 years 0 mons 0 days 11 hours 58 mins 33.600000000 secs +11 hours 58 mins 33.600000000 secs query ? SELECT interval '0.4999 day' ---- -0 years 0 mons 0 days 11 hours 59 mins 51.360000000 secs +11 hours 59 mins 51.360000000 secs query ? SELECT interval '0.49999 day' ---- -0 years 0 mons 0 days 11 hours 59 mins 59.136000000 secs +11 hours 59 mins 59.136000000 secs query ? SELECT interval '0.49999999999 day' ---- -0 years 0 mons 0 days 11 hours 59 mins 59.999999136 secs +11 hours 59 mins 59.999999136 secs query ? SELECT interval '5 day' ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000000 secs +5 days # Hour is ignored, this matches PostgreSQL query ? SELECT interval '5 day' hour ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000000 secs +5 days query ? SELECT interval '5 day 4 hours 3 minutes 2 seconds 100 milliseconds' ---- -0 years 0 mons 5 days 4 hours 3 mins 2.100000000 secs +5 days 4 hours 3 mins 2.100000000 secs query ? SELECT interval '0.5 month' ---- -0 years 0 mons 15 days 0 hours 0 mins 0.000000000 secs +15 days query ? SELECT interval '0.5' month ---- -0 years 0 mons 15 days 0 hours 0 mins 0.000000000 secs +15 days query ? SELECT interval '1 month' ---- -0 years 1 mons 0 days 0 hours 0 mins 0.000000000 secs +1 mons query ? 
SELECT interval '1' MONTH ---- -0 years 1 mons 0 days 0 hours 0 mins 0.000000000 secs +1 mons query ? SELECT interval '5 month' ---- -0 years 5 mons 0 days 0 hours 0 mins 0.000000000 secs +5 mons query ? SELECT interval '13 month' ---- -0 years 13 mons 0 days 0 hours 0 mins 0.000000000 secs +13 mons query ? SELECT interval '0.5 year' ---- -0 years 6 mons 0 days 0 hours 0 mins 0.000000000 secs +6 mons query ? SELECT interval '1 year' ---- -0 years 12 mons 0 days 0 hours 0 mins 0.000000000 secs +12 mons query ? SELECT interval '1 decade' ---- -0 years 120 mons 0 days 0 hours 0 mins 0.000000000 secs +120 mons query ? SELECT interval '2 decades' ---- -0 years 240 mons 0 days 0 hours 0 mins 0.000000000 secs +240 mons query ? SELECT interval '1 century' ---- -0 years 1200 mons 0 days 0 hours 0 mins 0.000000000 secs +1200 mons query ? SELECT interval '2 year' ---- -0 years 24 mons 0 days 0 hours 0 mins 0.000000000 secs +24 mons query ? SELECT interval '1 year 1 day' ---- -0 years 12 mons 1 days 0 hours 0 mins 0.000000000 secs +12 mons 1 days query ? SELECT interval '1 year 1 day 1 hour' ---- -0 years 12 mons 1 days 1 hours 0 mins 0.000000000 secs +12 mons 1 days 1 hours query ? SELECT interval '1 year 1 day 1 hour 1 minute' ---- -0 years 12 mons 1 days 1 hours 1 mins 0.000000000 secs +12 mons 1 days 1 hours 1 mins query ? SELECT interval '1 year 1 day 1 hour 1 minute 1 second' ---- -0 years 12 mons 1 days 1 hours 1 mins 1.000000000 secs +12 mons 1 days 1 hours 1 mins 1.000000000 secs query I SELECT ascii('') diff --git a/datafusion/sqllogictest/test_files/interval.slt b/datafusion/sqllogictest/test_files/interval.slt index afb262cf95a5..077f38d5d5bb 100644 --- a/datafusion/sqllogictest/test_files/interval.slt +++ b/datafusion/sqllogictest/test_files/interval.slt @@ -45,250 +45,250 @@ Interval(MonthDayNano) Interval(MonthDayNano) query ? select interval '5' years ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs # check all different kinds of intervals query ? select interval '5' year ---- -0 years 60 mons 0 days 0 hours 0 mins 0.000000000 secs +60 mons query ? select interval '5' month ---- -0 years 5 mons 0 days 0 hours 0 mins 0.000000000 secs +5 mons query ? select interval '5' months ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs query ? select interval '5' week ---- -0 years 0 mons 35 days 0 hours 0 mins 0.000000000 secs +35 days query ? select interval '5' day ---- -0 years 0 mons 5 days 0 hours 0 mins 0.000000000 secs +5 days query ? select interval '5' hour ---- -0 years 0 mons 0 days 5 hours 0 mins 0.000000000 secs +5 hours ## This seems wrong (5 mons) query ? select interval '5' hours ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs query ? select interval '5' minute ---- -0 years 0 mons 0 days 0 hours 5 mins 0.000000000 secs +5 mins query ? select interval '5' second ---- -0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs +5.000000000 secs query ? select interval '5' millisecond ---- -0 years 0 mons 0 days 0 hours 0 mins 0.005000000 secs +0.005000000 secs query ? select interval '5' milliseconds ---- -0 years 0 mons 0 days 0 hours 0 mins 0.005000000 secs +0.005000000 secs query ? select interval '5' microsecond ---- -0 years 0 mons 0 days 0 hours 0 mins 0.000005000 secs +0.000005000 secs query ? select interval '5' microseconds ---- -0 years 0 mons 0 days 0 hours 0 mins 0.000005000 secs +0.000005000 secs query ? 
diff --git a/datafusion/sqllogictest/test_files/interval.slt b/datafusion/sqllogictest/test_files/interval.slt
index afb262cf95a5..077f38d5d5bb 100644
--- a/datafusion/sqllogictest/test_files/interval.slt
+++ b/datafusion/sqllogictest/test_files/interval.slt
@@ -45,250 +45,250 @@ Interval(MonthDayNano)
 Interval(MonthDayNano)

 query ?
 select interval '5' years
 ----
-0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs
+5.000000000 secs

 # check all different kinds of intervals
 query ?
 select interval '5' year
 ----
-0 years 60 mons 0 days 0 hours 0 mins 0.000000000 secs
+60 mons

 query ?
 select interval '5' month
 ----
-0 years 5 mons 0 days 0 hours 0 mins 0.000000000 secs
+5 mons

 query ?
 select interval '5' months
 ----
-0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs
+5.000000000 secs

 query ?
 select interval '5' week
 ----
-0 years 0 mons 35 days 0 hours 0 mins 0.000000000 secs
+35 days

 query ?
 select interval '5' day
 ----
-0 years 0 mons 5 days 0 hours 0 mins 0.000000000 secs
+5 days

 query ?
 select interval '5' hour
 ----
-0 years 0 mons 0 days 5 hours 0 mins 0.000000000 secs
+5 hours

 ## This seems wrong (5 mons)
 query ?
 select interval '5' hours
 ----
-0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs
+5.000000000 secs

 query ?
 select interval '5' minute
 ----
-0 years 0 mons 0 days 0 hours 5 mins 0.000000000 secs
+5 mins

 query ?
 select interval '5' second
 ----
-0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs
+5.000000000 secs

 query ?
 select interval '5' millisecond
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.005000000 secs
+0.005000000 secs

 query ?
 select interval '5' milliseconds
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.005000000 secs
+0.005000000 secs

 query ?
 select interval '5' microsecond
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.000005000 secs
+0.000005000 secs

 query ?
 select interval '5' microseconds
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.000005000 secs
+0.000005000 secs

 query ?
 select interval '5' nanosecond
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.000000005 secs
+0.000000005 secs

 query ?
 select interval '5' nanoseconds
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.000000005 secs
+0.000000005 secs

 query ?
 select interval '5 YEAR'
 ----
-0 years 60 mons 0 days 0 hours 0 mins 0.000000000 secs
+60 mons

 query ?
 select interval '5 MONTH'
 ----
-0 years 5 mons 0 days 0 hours 0 mins 0.000000000 secs
+5 mons

 query ?
 select interval '5 WEEK'
 ----
-0 years 0 mons 35 days 0 hours 0 mins 0.000000000 secs
+35 days

 query ?
 select interval '5 DAY'
 ----
-0 years 0 mons 5 days 0 hours 0 mins 0.000000000 secs
+5 days

 query ?
 select interval '5 HOUR'
 ----
-0 years 0 mons 0 days 5 hours 0 mins 0.000000000 secs
+5 hours

 query ?
 select interval '5 HOURS'
 ----
-0 years 0 mons 0 days 5 hours 0 mins 0.000000000 secs
+5 hours

 query ?
 select interval '5 MINUTE'
 ----
-0 years 0 mons 0 days 0 hours 5 mins 0.000000000 secs
+5 mins

 query ?
 select interval '5 SECOND'
 ----
-0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs
+5.000000000 secs

 query ?
 select interval '5 SECONDS'
 ----
-0 years 0 mons 0 days 0 hours 0 mins 5.000000000 secs
+5.000000000 secs

 query ?
 select interval '5 MILLISECOND'
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.005000000 secs
+0.005000000 secs

 query ?
 select interval '5 MILLISECONDS'
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.005000000 secs
+0.005000000 secs

 query ?
 select interval '5 MICROSECOND'
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.000005000 secs
+0.000005000 secs

 query ?
 select interval '5 MICROSECONDS'
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.000005000 secs
+0.000005000 secs

 query ?
 select interval '5 NANOSECOND'
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.000000005 secs
+0.000000005 secs

 query ?
 select interval '5 NANOSECONDS'
 ----
-0 years 0 mons 0 days 0 hours 0 mins 0.000000005 secs
+0.000000005 secs

 query ?
 select interval '5 YEAR 5 MONTH 5 DAY 5 HOUR 5 MINUTE 5 SECOND 5 MILLISECOND 5 MICROSECOND 5 NANOSECOND'
 ----
-0 years 65 mons 5 days 5 hours 5 mins 5.005005005 secs
+65 mons 5 days 5 hours 5 mins 5.005005005 secs

 # Interval with string literal addition
 query ?
 select interval '1 month' + '1 month'
 ----
-0 years 2 mons 0 days 0 hours 0 mins 0.000000000 secs
+2 mons

 # Interval with string literal addition and leading field
 query ?
 select interval '1' + '1' month
 ----
-0 years 2 mons 0 days 0 hours 0 mins 0.000000000 secs
+2 mons

 # Interval with nested string literal addition
 query ?
 select interval '1 month' + '1 month' + '1 month'
 ----
-0 years 3 mons 0 days 0 hours 0 mins 0.000000000 secs
+3 mons

 # Interval with nested string literal addition and leading field
 query ?
 select interval '1' + '1' + '1' month
 ----
-0 years 3 mons 0 days 0 hours 0 mins 0.000000000 secs
+3 mons

 # Interval mega nested string literal addition
 query ?
 select interval '1 year' + '1 month' + '1 day' + '1 hour' + '1 minute' + '1 second' + '1 millisecond' + '1 microsecond' + '1 nanosecond'
 ----
-0 years 13 mons 1 days 1 hours 1 mins 1.001001001 secs
+13 mons 1 days 1 hours 1 mins 1.001001001 secs

 # Interval with string literal subtraction
 query ?
 select interval '1 month' - '1 day';
 ----
-0 years 1 mons -1 days 0 hours 0 mins 0.000000000 secs
+1 mons -1 days

 # Interval with string literal subtraction and leading field
 query ?
 select interval '5' - '1' - '2' year;
 ----
-0 years 24 mons 0 days 0 hours 0 mins 0.000000000 secs
+24 mons

 # Interval with nested string literal subtraction
 query ?
 select interval '1 month' - '1 day' - '1 hour';
 ----
-0 years 1 mons -1 days -1 hours 0 mins 0.000000000 secs
+1 mons -1 days -1 hours

 # Interval with nested string literal subtraction and leading field
 query ?
 select interval '10' - '1' - '1' month;
 ----
-0 years 8 mons 0 days 0 hours 0 mins 0.000000000 secs
+8 mons

 # Interval mega nested string literal subtraction
 query ?
 select interval '1 year' - '1 month' - '1 day' - '1 hour' - '1 minute' - '1 second' - '1 millisecond' - '1 microsecond' - '1 nanosecond'
 ----
-0 years 11 mons -1 days -1 hours -1 mins -1.001001001 secs
+11 mons -1 days -1 hours -1 mins -1.001001001 secs

 # Interval with string literal negation and leading field
 query ?
 select -interval '5' - '1' - '2' year;
 ----
-0 years -96 mons 0 days 0 hours 0 mins 0.000000000 secs
+-96 mons

 # Interval with nested string literal negation
 query ?
 select -interval '1 month' + '1 day' + '1 hour';
 ----
-0 years -1 mons 1 days 1 hours 0 mins 0.000000000 secs
+-1 mons 1 days 1 hours

 # Interval with nested string literal negation and leading field
 query ?
 select -interval '10' - '1' - '1' month;
 ----
-0 years -12 mons 0 days 0 hours 0 mins 0.000000000 secs
+-12 mons

 # Interval mega nested string literal negation
 query ?
 select -interval '1 year' - '1 month' - '1 day' - '1 hour' - '1 minute' - '1 second' - '1 millisecond' - '1 microsecond' - '1 nanosecond'
 ----
-0 years -13 mons -1 days -1 hours -1 mins -1.001001001 secs
+-13 mons -1 days -1 hours -1 mins -1.001001001 secs

 # Interval string literal + date
 query D
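The mixed-sign results above (for example `1 mons -1 days`) follow from `MonthDayNano` arithmetic being component-wise on the (months, days, nanoseconds) triple, with no normalization across units; arrow 53 changed only how the values are displayed, not how they are stored or computed. A small sketch of that representation, illustrative only and not DataFusion's actual code:

```rust
use arrow::datatypes::IntervalMonthDayNanoType;

fn main() {
    // interval '1 month' - '1 day' subtracts each component separately...
    let one_month = IntervalMonthDayNanoType::make_value(1, 0, 0);
    let one_day = IntervalMonthDayNanoType::make_value(0, 1, 0);
    let diff = IntervalMonthDayNanoType::make_value(
        one_month.months - one_day.months,
        one_month.days - one_day.days,
        one_month.nanoseconds - one_day.nanoseconds,
    );
    // ...so the result keeps both signs and now prints as "1 mons -1 days".
    assert_eq!((diff.months, diff.days, diff.nanoseconds), (1, -1, 0));
}
```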
@@ -343,7 +343,7 @@ select
   arrow_typeof(i)
 from t;
 ----
-0 years 0 mons 5 days 0 hours 0 mins 0.000000003 secs Interval(MonthDayNano)
+5 days 0.000000003 secs Interval(MonthDayNano)

 statement ok

@@ -359,8 +359,8 @@ insert into t values ('6 days 7 nanoseconds'::interval)
 query ? rowsort
 select -i from t order by 1;
 ----
-0 years 0 mons -5 days 0 hours 0 mins -0.000000003 secs
-0 years 0 mons -6 days 0 hours 0 mins -0.000000007 secs
+-5 days -0.000000003 secs
+-6 days -0.000000007 secs

 query ?T rowsort
 select
@@ -368,8 +368,8 @@ select
   arrow_typeof(i)
 from t;
 ----
-0 years 0 mons 5 days 0 hours 0 mins 0.000000003 secs Interval(MonthDayNano)
-0 years 0 mons 6 days 0 hours 0 mins 0.000000007 secs Interval(MonthDayNano)
+5 days 0.000000003 secs Interval(MonthDayNano)
+6 days 0.000000007 secs Interval(MonthDayNano)

 statement ok
 drop table t;
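The `'...'::interval` casts exercised above go through arrow's interval parser, which yields the (months, days, nanoseconds) triple directly. A sketch of that path, assuming the `parse_interval_month_day_nano` helper re-exported under `arrow::compute::kernels::cast_utils` (hypothetical usage, not part of this diff):

```rust
use arrow::compute::kernels::cast_utils::parse_interval_month_day_nano;
use arrow::error::ArrowError;

fn main() -> Result<(), ArrowError> {
    // '6 days 7 nanoseconds'::interval -> (months, days, nanoseconds)
    let v = parse_interval_month_day_nano("6 days 7 nanoseconds")?;
    assert_eq!((v.months, v.days, v.nanoseconds), (0, 6, 7));
    Ok(())
}
```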
diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt
index 6884d762612d..eece56942317 100644
--- a/datafusion/sqllogictest/test_files/math.slt
+++ b/datafusion/sqllogictest/test_files/math.slt
@@ -673,7 +673,7 @@ query error DataFusion error: Arrow error: Compute error: Signed integer overflo
 select lcm(2, 9223372036854775803);

-query error DataFusion error: Arrow error: Compute error: Overflow happened on: 2107754225 \^ 1221660777
+query error DataFusion error: Arrow error: Arithmetic overflow: Overflow happened on: 2107754225 \^ 1221660777
 select power(2107754225, 1221660777);

 # factorial overflow
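The expected error text changes because arrow 53 reports overflow through a dedicated error variant rather than the generic compute error, and DataFusion prefixes whatever arrow produces with `Arrow error: `. A sketch of the new message shape, assuming arrow 53's `ArrowError::ArithmeticOverflow` variant (illustrative only):

```rust
use arrow::error::ArrowError;

fn main() {
    // arrow 52 surfaced this as ArrowError::ComputeError, which printed
    // "Compute error: Overflow happened on: ..."; arrow 53 uses a
    // dedicated variant with a distinct Display prefix.
    let err =
        ArrowError::ArithmeticOverflow("Overflow happened on: 2107754225 ^ 1221660777".to_string());
    assert_eq!(
        err.to_string(),
        "Arithmetic overflow: Overflow happened on: 2107754225 ^ 1221660777"
    );
}
```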
diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt
index 6b9cb521f5f8..4c86312f9e51 100644
--- a/datafusion/sqllogictest/test_files/repartition_scan.slt
+++ b/datafusion/sqllogictest/test_files/repartition_scan.slt
@@ -61,7 +61,7 @@ logical_plan
 physical_plan
 01)CoalesceBatchesExec: target_batch_size=8192
 02)--FilterExec: column1@0 != 42
-03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..104], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:104..208], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:208..312], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:312..414]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)]
+03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..87], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:87..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:174..261], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:261..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)]

 # disable round robin repartitioning
 statement ok
@@ -77,7 +77,7 @@ logical_plan
 physical_plan
 01)CoalesceBatchesExec: target_batch_size=8192
 02)--FilterExec: column1@0 != 42
-03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..104], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:104..208], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:208..312], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:312..414]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)]
+03)----ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..87], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:87..174], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:174..261], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:261..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)]

 # enable round robin repartitioning again
 statement ok
@@ -102,7 +102,7 @@ physical_plan
 02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true]
 03)----CoalesceBatchesExec: target_batch_size=8192
 04)------FilterExec: column1@0 != 42
-05)--------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..205], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:205..405, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..5], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:5..210], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:210..414]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)]
+05)--------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..172], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:172..338, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..178], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:178..347]]}, projection=[column1], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)]

 ## Read the files as though they are ordered
@@ -138,7 +138,7 @@ physical_plan
 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST]
 02)--CoalesceBatchesExec: target_batch_size=8192
 03)----FilterExec: column1@0 != 42
-04)------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..202], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..207], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:207..414], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:202..405]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)]
+04)------ParquetExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..169], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..173], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:173..347], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:169..338]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], predicate=column1@0 != 42, pruning_predicate=CASE WHEN column1_null_count@2 = column1_row_count@3 THEN false ELSE column1_min@0 != 42 OR 42 != column1_max@1 END, required_guarantees=[column1 not in (42)]

 # Cleanup
 statement ok
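The only change in these plans is the byte ranges: parquet 53's writer defaults produce a 347-byte file where parquet 52 wrote 414 bytes, and the scan then splits whatever bytes exist into four near-equal chunks. The arithmetic below is not DataFusion's actual implementation, but it reproduces both the old and the new expected boundaries for the single-file hunks above:

```rust
/// Split `file_size` bytes into `partitions` contiguous ranges of
/// ceil(file_size / partitions) bytes, truncating the final range
/// to the end of the file.
fn split_ranges(file_size: u64, partitions: u64) -> Vec<(u64, u64)> {
    let chunk = file_size.div_ceil(partitions);
    (0..partitions)
        .map(|i| (i * chunk, ((i + 1) * chunk).min(file_size)))
        .collect()
}

fn main() {
    // parquet 52 wrote a 414-byte file -> the old expected ranges
    assert_eq!(
        split_ranges(414, 4),
        vec![(0, 104), (104, 208), (208, 312), (312, 414)]
    );
    // parquet 53 writes a 347-byte file -> the new expected ranges
    assert_eq!(
        split_ranges(347, 4),
        vec![(0, 87), (87, 174), (174, 261), (261, 347)]
    );
}
```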
diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt
index fb0fd8397f2d..4b11e338da70 100644
--- a/datafusion/sqllogictest/test_files/timestamps.slt
+++ b/datafusion/sqllogictest/test_files/timestamps.slt
@@ -1509,19 +1509,19 @@ SELECT val, ts1 - ts2 FROM foo ORDER BY ts2 - ts1;
 query ?
 SELECT i1 - i2 FROM bar;
 ----
-0 years 0 mons -1 days 0 hours 0 mins 0.000000000 secs
-0 years 2 mons -13 days 0 hours 0 mins 0.000000000 secs
-0 years 0 mons 1 days 2 hours 56 mins 0.000000000 secs
-0 years 0 mons 1 days 0 hours 0 mins -3.999999993 secs
+-1 days
+2 mons -13 days
+1 days 2 hours 56 mins
+1 days -3.999999993 secs

 # Interval + Interval
 query ?
 SELECT i1 + i2 FROM bar;
 ----
-0 years 0 mons 3 days 0 hours 0 mins 0.000000000 secs
-0 years 2 mons 13 days 0 hours 0 mins 0.000000000 secs
-0 years 0 mons 1 days 3 hours 4 mins 0.000000000 secs
-0 years 0 mons 1 days 0 hours 0 mins 4.000000007 secs
+3 days
+2 mons 13 days
+1 days 3 hours 4 mins
+1 days 4.000000007 secs

 # Timestamp - Interval
 query P