From 6637b842e53e2ada3fdc6d43d0c574a1d7680f96 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 14 Jun 2023 12:59:17 -0400 Subject: [PATCH 01/89] Bump ADP -> 26.0.0 --- dask_planner/Cargo.lock | 1483 ++++------------- dask_planner/Cargo.toml | 2 +- dask_planner/src/dialect.rs | 5 + dask_planner/src/expression.rs | 46 +- dask_planner/src/sql/logical.rs | 17 +- dask_planner/src/sql/logical/aggregate.rs | 6 +- .../src/sql/logical/create_memory_table.rs | 9 +- dask_planner/src/sql/logical/drop_table.rs | 7 +- .../src/sql/logical/subquery_alias.rs | 2 +- dask_planner/src/sql/logical/table_scan.rs | 6 +- dask_planner/src/sql/optimizer.rs | 4 - dask_planner/src/sql/table.rs | 4 +- dask_planner/src/sql/types.rs | 14 +- 13 files changed, 366 insertions(+), 1239 deletions(-) diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index 3f501e47d..615d90ab5 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -29,9 +29,9 @@ dependencies = [ [[package]] name = "aho-corasick" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67fc08ce920c31afb70f013dcce1bfc3a3195de6a228474e45e1f145b36f8d04" +checksum = "43f6cb1bf222025340178f382c426f13757b2960e89779dfcb319c32542a5a41" dependencies = [ "memchr", ] @@ -51,6 +51,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -107,15 +113,15 @@ checksum = "6b4930d2cb77ce62f89ee5d5289b4ac049559b1c45539271f5ed4fdc7db34545" [[package]] name = "arrayvec" -version = "0.7.2" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" +checksum = "8868f09ff8cea88b079da74ae569d9b8c62a23c68c746240b704ee6f7525c89c" [[package]] name = "arrow" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "990dfa1a9328504aa135820da1c95066537b69ad94c04881b785f64328e0fa6b" +checksum = "6619cab21a0cdd8c9b9f1d9e09bfaa9b1974e5ef809a6566aef0b998caf38ace" dependencies = [ "ahash", "arrow-arith", @@ -136,9 +142,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b2e52de0ab54173f9b08232b7184c26af82ee7ab4ac77c83396633c90199fa" +checksum = "e0dc95485623a76e00929bda8caa40c1f838190952365c4f43a7b9ae86d03e94" dependencies = [ "arrow-array", "arrow-buffer", @@ -151,9 +157,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e10849b60c17dbabb334be1f4ef7550701aa58082b71335ce1ed586601b2f423" +checksum = "3267847f53d3042473cfd2c769afd8d74a6d7d201fc3a34f5cb84c0282ef47a7" dependencies = [ "ahash", "arrow-buffer", @@ -168,9 +174,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0746ae991b186be39933147117f8339eb1c4bbbea1c8ad37e7bf5851a1a06ba" +checksum = "c5f66553e66e120ac4b21570368ee9ebf35ff3f5399f872b0667699e145678f5" dependencies = [ "half", "num", @@ -178,9 +184,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b88897802515d7b193e38b27ddd9d9e43923d410a9e46307582d756959ee9595" +checksum = "65e6f3579dbf0d97c683d451b2550062b0f0e62a3169bf74238b5f59f44ad6d8" dependencies = [ "arrow-array", "arrow-buffer", @@ -195,9 +201,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1c8220d9741fc37961262710ceebd8451a5b393de57c464f0267ffdda1775c0a" +checksum = "373579c4c1a8f5307d3125b7a89c700fcf8caf85821c77eb4baab3855ae0aba5" dependencies = [ "arrow-array", "arrow-buffer", @@ -214,9 +220,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "533f937efa1aaad9dc86f6a0e382c2fa736a4943e2090c946138079bdf060cef" +checksum = "61bc8df9912cca6642665fdf989d6fa0de2570f18a7f709bcf59d29de96d2097" dependencies = [ "arrow-buffer", "arrow-schema", @@ -226,9 +232,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18b75296ff01833f602552dff26a423fc213db8e5049b540ca4a00b1c957e41c" +checksum = "0105dcf5f91daa7182d87b713ee0b32b3bfc88e0c48e7dc3e9d6f1277a07d1ae" dependencies = [ "arrow-array", "arrow-buffer", @@ -240,9 +246,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e501d3de4d612c90677594896ca6c0fa075665a7ff980dc4189bb531c17e19f6" +checksum = "e73134fb5b5ec8770f8cbb214c2c487b2d350081e403ca4eeeb6f8f5e19846ac" dependencies = [ "arrow-array", "arrow-buffer", @@ -254,14 +260,15 @@ dependencies = [ "indexmap", "lexical-core", "num", + "serde", "serde_json", ] [[package]] name = "arrow-ord" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "33d2671eb3793f9410230ac3efb0e6d36307be8a2dac5fad58ac9abde8e9f01e" +checksum = "89f25bc66e18d4c2aa1fe2f9bb03e2269da60e636213210385ae41a107f9965a" dependencies = [ "arrow-array", "arrow-buffer", @@ -274,9 +281,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc11fa039338cebbf4e29cf709c8ac1d6a65c7540063d4a25f991ab255ca85c8" +checksum = "1095ff85ea4f5ff02d17b30b089de31b51a50be01c6b674f0a0509ab771232f1" dependencies = [ "ahash", "arrow-array", @@ -289,18 +296,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d04f17f7b86ded0b5baf98fe6123391c4343e031acc3ccc5fa604cc180bff220" +checksum = "25187bbef474151a2e4ddec67b9e34bda5cbfba292dc571392fa3a1f71ff5a82" dependencies = [ - "bitflags 2.2.1", + "bitflags 2.3.2", ] [[package]] name = "arrow-select" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "163e35de698098ff5f5f672ada9dc1f82533f10407c7a11e2cd09f3bcf31d18a" +checksum = "fd0d4ee884aec3aa05e41478e3cd312bf609de9babb5d187a43fb45931da4da4" dependencies = [ "arrow-array", "arrow-buffer", @@ -311,9 +318,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdfbed1b10209f0dc68e6aa4c43dc76079af65880965c7c3b73f641f23d4aba" +checksum = "d6d71c3ffe4c07e66ce8fdc6aed5b00e0e60c5144911879b10546f5b72d8fa1c" dependencies = [ "arrow-array", "arrow-buffer", @@ -321,14 +328,14 @@ dependencies = [ "arrow-schema", "arrow-select", "regex", - "regex-syntax 0.6.29", + "regex-syntax", ] [[package]] name = "async-compression" -version = "0.3.15" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "942c7cd7ae39e91bde4820d74132e9862e62c2f386c3aa90ccf55949f5bad63a" +checksum = "5b0122885821398cc923ece939e24d1056a2384ee719432397fa9db87230ff11" dependencies = [ "bzip2", "flate2", @@ -338,8 +345,8 @@ dependencies = [ "pin-project-lite", "tokio", "xz2", - "zstd 0.11.2+zstd.1.5.2", - "zstd-safe 5.0.2+zstd.1.5.2", + "zstd", + "zstd-safe", ] [[package]] @@ -350,7 +357,7 @@ checksum = "0e97ce7de6cf12de5d7226c73f5ba9811622f4db3a5b91b55c53e987e5f91cba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -361,7 +368,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -372,9 +379,9 @@ checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" [[package]] name = "base64" -version = "0.21.0" +version = "0.21.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a4ddaa51a5bc52a6948f74c06d20aaaddb71924eab79b8c97a8c556e942d6a" +checksum = "604178f6c5c21f02dc555784810edfb88d34ac2c73b2eae109655649ee73ce3d" [[package]] name = "bitflags" @@ -384,9 +391,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.2.1" +version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a6904aef64d73cf10ab17ebace7befb918b82164785cb89907993be7f83813" +checksum = "6dbe3c979c178231552ecba20214a8272df4e09f232a87aef4320cf06539aded" [[package]] name = "blake2" @@ -399,9 +406,9 @@ dependencies = [ [[package]] name = "blake3" -version = "1.3.3" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42ae2468a89544a466886840aa467a25b766499f4f04bf7d9fcd10ecee9fccef" +checksum = "729b71f35bd3fa1a4c86b85d32c8b9069ea7fe14f7a53cfabb65f62d4265b888" dependencies = [ "arrayref", "arrayvec", @@ -441,32 +448,11 @@ dependencies = [ "alloc-stdlib", ] -[[package]] -name = "bstr" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3d4260bcc2e8fc9df1eac4919a720effeb63a3f0952f5bf4944adfa18897f09" -dependencies = [ - "memchr", - "once_cell", - "regex-automata", - "serde", -] - -[[package]] -name = "btoi" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dd6407f73a9b8b6162d8a2ef999fe6afd7cc15902ebf42c5cd296addf17e0ad" -dependencies = [ - "num-traits", -] - [[package]] name = "bumpalo" -version = "3.12.1" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b1ce199063694f33ffb7dd4e0ee620741495c32833cde5aa08f02a0bf96f0c8" +checksum = "a3e2c3daef883ecc1b5d58c15adae93470a91d425f3532ba1695849656af3fc1" [[package]] name = "byteorder" @@ -518,17 +504,14 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "chrono" -version = "0.4.24" +version = "0.4.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b" +checksum = "ec837a71355b28f6556dbd569b37b3f363091c0bd4b2e735674521b4c5fd9bc5" dependencies = [ + "android-tzdata", "iana-time-zone", - "js-sys", - "num-integer", "num-traits", "serde", - "time 0.1.45", - "wasm-bindgen", "winapi", ] @@ -554,27 +537,11 @@ dependencies = [ "phf_codegen", ] -[[package]] -name = "clru" -version = "0.6.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8191fa7302e03607ff0e237d4246cc043ff5b3cb9409d995172ba3bea16b807" - -[[package]] -name = "codespan-reporting" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3538270d33cc669650c4b093848450d380def10c331d38c768e34cac80576e6e" -dependencies = [ - "termcolor", - "unicode-width", -] - [[package]] name = "comfy-table" -version = "6.1.4" +version = "6.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e7b787b0dc42e8111badfdbe4c3059158ccb2db8780352fa1b01e8ccf45cc4d" +checksum = "7e959d788268e3bf9d35ace83e81b124190378e4c91c9067524675e33394b8ba" dependencies = [ "strum", "strum_macros", @@ -605,9 +572,9 @@ dependencies = [ [[package]] name = "constant_time_eq" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13418e745008f7349ec7e449155f419a61b92b58a99cc3616942b926825ec76b" +checksum = "21a53c0a4d288377e7415b53dcfc3c04da5cdc2cc95c8d5ac178b58f0b861ad6" [[package]] name = "core-foundation-sys" @@ -651,9 +618,9 @@ dependencies = [ [[package]] name = "csv" -version = "1.2.1" +version = "1.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b015497079b9a9d69c02ad25de6c0a6edef051ea6360a327d0bd05802ef64ad" +checksum = "626ae34994d3d8d668f4269922248239db4ae42d538b14c398b74a52208e8086" dependencies = [ "csv-core", "itoa", @@ -670,50 +637,6 @@ dependencies = [ "memchr", ] -[[package]] -name = "cxx" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f61f1b6389c3fe1c316bf8a4dccc90a38208354b330925bce1f74a6c4756eb93" -dependencies = [ - "cc", - "cxxbridge-flags", - "cxxbridge-macro", - "link-cplusplus", -] - -[[package]] -name = "cxx-build" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12cee708e8962df2aeb38f594aae5d827c022b6460ac71a7a3e2c3c2aae5a07b" -dependencies = [ - "cc", - "codespan-reporting", - "once_cell", - "proc-macro2", - "quote", - "scratch", - "syn 2.0.15", -] - -[[package]] -name = "cxxbridge-flags" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7944172ae7e4068c533afbb984114a56c46e9ccddda550499caa222902c7f7bb" - -[[package]] -name = "cxxbridge-macro" -version = "1.0.94" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2345488264226bf682893e25de0769f3360aac9957980ec49361b083ddaa5bc5" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.15", -] - [[package]] name = "dashmap" version = "5.4.0" @@ -742,13 +665,15 @@ dependencies = [ [[package]] name = "datafusion" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bdb93fee4f30368f1f71bfd5cd28882ec9fab0183db7924827b76129d33227c" +checksum = "9992c267436551d40b52d65289b144712e7b0ebdc62c8c859fd1574e5f73efbb" dependencies = [ "ahash", "apache-avro", "arrow", + "arrow-array", + "arrow-schema", "async-compression", "async-trait", "bytes", @@ -787,14 +712,14 @@ dependencies = [ "url", "uuid", "xz2", - "zstd 0.12.3+zstd.1.5.2", + "zstd", ] [[package]] name = "datafusion-common" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e82401ce129e601d406012b6d718f8978ba84c386e1c342fa155877120d68824" +checksum = "c3be97f7a7c720cdbb71e9eeabf814fa6ad8102b9022390f6cac74d3b4af6392" dependencies = [ "apache-avro", "arrow", @@ -809,9 +734,9 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b08b2078aed21a27239cd93f3015e492a58b0d50ebeeaf8d2236cf108ef583ce" +checksum = "c77c4b14b809b0e4c5bb101b6834504f06cdbb0d3c643400c61d0d844b33264e" dependencies = [ "dashmap", "datafusion-common", @@ -827,21 +752,24 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16b5b977ce9695fb4c67614266ec57f384fc11e9a9f9b3e6d0e62b9c5a9f2c1f" +checksum = "e6ec7409bd45cf4fae6395d7d1024c8a97e543cadc88363e405d2aad5330e5e7" dependencies = [ "ahash", "arrow", "datafusion-common", + "lazy_static", "sqlparser", + "strum", + "strum_macros", ] [[package]] name = "datafusion-optimizer" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0b2bb9e73ed778d1bc5af63a270f0154bf6eab5099c77668a6362296888e46b" +checksum = "64b537c93f87989c212db92a448a0f5eb4f0995e27199bb7687ae94f8b64a7a8" dependencies = [ "arrow", "async-trait", @@ -852,14 +780,14 @@ dependencies = [ "hashbrown 0.13.2", "itertools", "log", - "regex-syntax 0.6.29", + "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80cd8ea5ab0a07b1b2a3e17d5909f1b1035bd129ffeeb5c66842a32e682f8f79" +checksum = "f60ee3f53340fdef36ee54d9e12d446ae2718b1d0196ac581f791d34808ec876" dependencies = [ "ahash", "arrow", @@ -877,6 +805,7 @@ dependencies = [ "indexmap", "itertools", "lazy_static", + "libc", "md-5", "paste", "petgraph", @@ -889,8 +818,8 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "22.0.0" -source = "git+https://github.com/apache/arrow-datafusion-python.git?rev=9493638#94936380e58a266f5dd5de6b70a06d3aa36fbe22" +version = "26.0.0" +source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=logical_extension#ce83d8a837ac2bd8473cbe3432cebd3eb90a6e1c" dependencies = [ "async-trait", "datafusion", @@ -903,11 +832,13 @@ dependencies = [ "mimalloc", "object_store", "parking_lot", + "prost", + "prost-types", "pyo3", "pyo3-build-config 0.18.3", "rand", - "regex-syntax 0.6.29", - "syn 2.0.15", + "regex-syntax", + "syn 2.0.18", "tokio", "url", "uuid", @@ -915,9 +846,9 @@ dependencies = [ [[package]] name = "datafusion-row" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a95d6badab19fd6e9195fdc5209ac0a7e5ce9bcdedc67767b9ffc1b4e645760" +checksum = "d58fc64058aa3bcb00077a0d19474a0d584d31dec8c7ac3406868f485f659af9" dependencies = [ "arrow", "datafusion-common", @@ -927,9 +858,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37a78f8fc67123c4357e63bc0c87622a2a663d26f074958d749a633d0ecde90f" +checksum = "1531f0314151a34bf6c0a83c7261525688b7c729876f53e7896b8f4ca8f57d07" dependencies = [ "arrow", "arrow-schema", @@ -941,9 +872,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "22.0.0" +version = "26.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae6ed64a2005f0d78f2b1b3ec3f8148183f4523d5d364e5367115f8d8a82b7df" +checksum = "079d5be5ec59580777bfa16d79187fea99b6498e3e8e07eb36d504a5fe708f13" dependencies = [ "async-recursion", "chrono", @@ -951,53 +882,28 @@ dependencies = [ "itertools", "object_store", "prost", + "prost-types", "substrait", "tokio", ] [[package]] name = "digest" -version = "0.10.6" +version = "0.10.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8168378f4e5023e7218c89c891c0fd8ecdb5e5e4f18cb78f38cf245dd021e76f" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", "subtle", ] -[[package]] -name = "dirs" -version = "4.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca3aa72a6f96ea37bbc5aa912f6788242832f75369bdfdadcb0e38423f100059" -dependencies = [ - "dirs-sys", -] - -[[package]] -name = "dirs-sys" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b1d1d91c932ef41c0f2663aa8b0ca0342d444d842c06914aa0a7e352d0bada6" -dependencies = [ - "libc", - "redox_users", - "winapi", -] - [[package]] name = "doc-comment" version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fea41bba32d969b513997752735605054bc0dfa92b4c56bf1189f2e174be7a10" -[[package]] -name = "dunce" -version = "1.0.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56ce8c6da7551ec6c462cbaf3bfbc75131ebbfa1c944aeaa9dab51ca1c5f0c3b" - [[package]] name = "dyn-clone" version = "1.0.11" @@ -1040,7 +946,7 @@ checksum = "4bcfec3a70f97c962c307b2d2c56e358cf1d00b558d74262b5f929ee8cc7e73a" dependencies = [ "errno-dragonfly", "libc", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -1062,18 +968,6 @@ dependencies = [ "instant", ] -[[package]] -name = "filetime" -version = "0.2.21" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cbc844cecaee9d4443931972e1289c8ff485cb4cc2767cb03ca139ed6885153" -dependencies = [ - "cfg-if", - "libc", - "redox_syscall 0.2.16", - "windows-sys 0.48.0", -] - [[package]] name = "fixedbitset" version = "0.4.2" @@ -1082,9 +976,9 @@ checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" [[package]] name = "flatbuffers" -version = "23.1.21" +version = "23.5.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77f5399c2c9c50ae9418e522842ad362f61ee48b346ac106807bd355a8a7c619" +checksum = "4dac53e22462d78c16d64a1cd22371b54cc3fe94aa15e7886a2fa6e5d1ab8640" dependencies = [ "bitflags 1.3.2", "rustc_version", @@ -1108,9 +1002,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" [[package]] name = "form_urlencoded" -version = "1.1.0" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9c384f161156f5260c24a097c56119f9be8c798586aecc13afbcbe7b7e26bf8" +checksum = "a62bc1cf6f830c2ec14a513a9fb124d0a213a629668a4186f329db21fe045652" dependencies = [ "percent-encoding", ] @@ -1171,7 +1065,7 @@ checksum = "89ca545a94061b6365f2c7355b4b32bd20df3ff95f02da9329b34ccc3bd6ee72" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -1216,553 +1110,26 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.9" +version = "0.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4" +checksum = "be4136b2a15dd319360be1c07d9933517ccf0be8f16bf62a3bee4f0d618df427" dependencies = [ "cfg-if", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", -] - -[[package]] -name = "gix" -version = "0.43.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c256ea71cc1967faaefdaad15f334146b7c806f12460dcafd3afed845c8c78dd" -dependencies = [ - "gix-actor", - "gix-attributes", - "gix-config", - "gix-credentials", - "gix-date", - "gix-diff", - "gix-discover", - "gix-features 0.28.1", - "gix-glob", - "gix-hash 0.10.4", - "gix-hashtable", - "gix-index", - "gix-lock", - "gix-mailmap", - "gix-object", - "gix-odb", - "gix-pack", - "gix-path", - "gix-prompt", - "gix-ref", - "gix-refspec", - "gix-revision", - "gix-sec", - "gix-tempfile", - "gix-traverse", - "gix-url", - "gix-validate", - "gix-worktree", - "log", - "once_cell", - "signal-hook", - "smallvec", - "thiserror", - "unicode-normalization", -] - -[[package]] -name = "gix-actor" -version = "0.19.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc22b0cdc52237667c301dd7cdc6ead8f8f73c9f824e9942c8ebd6b764f6c0bf" -dependencies = [ - "bstr", - "btoi", - "gix-date", - "itoa", - "nom", - "thiserror", -] - -[[package]] -name = "gix-attributes" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2231a25934a240d0a4b6f4478401c73ee81d8be52de0293eedbc172334abf3e1" -dependencies = [ - "bstr", - "gix-features 0.28.1", - "gix-glob", - "gix-path", - "gix-quote", - "thiserror", - "unicode-bom", -] - -[[package]] -name = "gix-bitmap" -version = "0.2.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55a95f4942360766c3880bdb2b4b57f1ef73b190fc424755e7fdf480430af618" -dependencies = [ - "thiserror", -] - -[[package]] -name = "gix-chunk" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b0d39583cab06464b8bf73b3f1707458270f0e7383cb24c3c9c1a16e6f792978" -dependencies = [ - "thiserror", -] - -[[package]] -name = "gix-command" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2c6f75c1e0f924de39e750880a6e21307194bb1ab773efe3c7d2d787277f8ab" -dependencies = [ - "bstr", -] - -[[package]] -name = "gix-config" -version = "0.20.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fbad5ce54a8fc997acc50febd89ec80fa6e97cb7f8d0654cb229936407489d8" -dependencies = [ - "bstr", - "gix-config-value", - "gix-features 0.28.1", - "gix-glob", - "gix-path", - "gix-ref", - "gix-sec", - "log", - "memchr", - "nom", - "once_cell", - "smallvec", - "thiserror", - "unicode-bom", -] - -[[package]] -name = "gix-config-value" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d09154c0c8677e4da0ec35e896f56ee3e338e741b9599fae06075edd83a4081c" -dependencies = [ - "bitflags 1.3.2", - "bstr", - "gix-path", - "libc", - "thiserror", -] - -[[package]] -name = "gix-credentials" -version = "0.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "750b684197374518ea057e0a0594713e07683faa0a3f43c0f93d97f64130ad8d" -dependencies = [ - "bstr", - "gix-command", - "gix-config-value", - "gix-path", - "gix-prompt", - "gix-sec", - "gix-url", - "thiserror", -] - -[[package]] -name = "gix-date" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b96271912ce39822501616f177dea7218784e6c63be90d5f36322ff3a722aae2" -dependencies = [ - "bstr", - "itoa", - "thiserror", - "time 0.3.20", -] - -[[package]] -name = "gix-diff" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "103a0fa79b0d438f5ecb662502f052e530ace4fe1fe8e1c83c0c6da76d728e67" -dependencies = [ - "gix-hash 0.10.4", - "gix-object", - "imara-diff", - "thiserror", -] - -[[package]] -name = "gix-discover" -version = "0.16.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6eba8ba458cb8f4a6c33409b0fe650b1258655175a7ffd1d24fafd3ed31d880b" -dependencies = [ - "bstr", - "dunce", - "gix-hash 0.10.4", - "gix-path", - "gix-ref", - "gix-sec", - "thiserror", -] - -[[package]] -name = "gix-features" -version = "0.28.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b76f9a80f6dd7be66442ae86e1f534effad9546676a392acc95e269d0c21c22" -dependencies = [ - "crc32fast", - "flate2", - "gix-hash 0.10.4", - "libc", - "once_cell", - "prodash", - "sha1_smol", - "thiserror", - "walkdir", -] - -[[package]] -name = "gix-features" -version = "0.29.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf69b0f5c701cc3ae22d3204b671907668f6437ca88862d355eaf9bc47a4f897" -dependencies = [ - "gix-hash 0.11.1", - "libc", + "wasi", ] [[package]] -name = "gix-fs" -version = "0.1.1" +name = "git2" +version = "0.17.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b37a1832f691fdc09910bd267f9a2e413737c1f9ec68c6e31f9e802616278a9" -dependencies = [ - "gix-features 0.29.0", -] - -[[package]] -name = "gix-glob" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93e43efd776bc543f46f0fd0ca3d920c37af71a764a16f2aebd89765e9ff2993" +checksum = "7b989d6a7ca95a362cf2cfc5ad688b3a467be1f87e480b8dad07fee8c79b0044" dependencies = [ "bitflags 1.3.2", - "bstr", -] - -[[package]] -name = "gix-hash" -version = "0.10.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a258595457bc192d1f1c59d0d168a1e34e2be9b97a614e14995416185de41a7" -dependencies = [ - "hex", - "thiserror", -] - -[[package]] -name = "gix-hash" -version = "0.11.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "078eec3ac2808cc03f0bddd2704cb661da5c5dc33b41a9d7947b141d499c7c42" -dependencies = [ - "hex", - "thiserror", -] - -[[package]] -name = "gix-hashtable" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e55e40dfd694884f0eb78796c5bddcf2f8b295dace47039099dd7e76534973" -dependencies = [ - "gix-hash 0.10.4", - "hashbrown 0.13.2", - "parking_lot", -] - -[[package]] -name = "gix-index" -version = "0.15.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "717ab601ece7921f59fe86849dbe27d44a46ebb883b5885732c4f30df4996177" -dependencies = [ - "bitflags 1.3.2", - "bstr", - "btoi", - "filetime", - "gix-bitmap", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-lock", - "gix-object", - "gix-traverse", - "itoa", - "memmap2", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-lock" -version = "5.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c693d7f05730fa74a7c467150adc7cea393518410c65f0672f80226b8111555" -dependencies = [ - "gix-tempfile", - "gix-utils", - "thiserror", -] - -[[package]] -name = "gix-mailmap" -version = "0.11.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b66aea5e52875cd4915f4957a6f4b75831a36981e2ec3f5fad9e370e444fe1a" -dependencies = [ - "bstr", - "gix-actor", - "thiserror", -] - -[[package]] -name = "gix-object" -version = "0.28.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8df068db9180ee935fbb70504848369e270bdcb576b05c0faa8b9fd3b86fc017" -dependencies = [ - "bstr", - "btoi", - "gix-actor", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-validate", - "hex", - "itoa", - "nom", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-odb" -version = "0.43.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e83af2e3e36005bfe010927f0dff41fb5acc3e3d89c6f1174135b3a34086bda2" -dependencies = [ - "arc-swap", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-object", - "gix-pack", - "gix-path", - "gix-quote", - "parking_lot", - "tempfile", - "thiserror", -] - -[[package]] -name = "gix-pack" -version = "0.33.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9401911c7fe032ad7b31c6a6b5be59cb283d1d6c999417a8215056efe6d635f3" -dependencies = [ - "clru", - "gix-chunk", - "gix-diff", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-hashtable", - "gix-object", - "gix-path", - "gix-tempfile", - "gix-traverse", - "memmap2", - "parking_lot", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-path" -version = "0.7.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32370dce200bb951df013e03dff35b4233fc7a89458642b047629b91734a7e19" -dependencies = [ - "bstr", - "thiserror", -] - -[[package]] -name = "gix-prompt" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f3034d4d935aef2c7bf719aaa54b88c520e82413118d886ae880a31d5bdee57" -dependencies = [ - "gix-command", - "gix-config-value", - "nix", - "parking_lot", - "thiserror", -] - -[[package]] -name = "gix-quote" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a282f5a8d9ee0b09ec47390ac727350c48f2f5c76d803cd8da6b3e7ad56e0bcb" -dependencies = [ - "bstr", - "btoi", - "thiserror", -] - -[[package]] -name = "gix-ref" -version = "0.27.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4e909396ed3b176823991ccc391c276ae2a015e54edaafa3566d35123cfac9d" -dependencies = [ - "gix-actor", - "gix-features 0.28.1", - "gix-hash 0.10.4", - "gix-lock", - "gix-object", - "gix-path", - "gix-tempfile", - "gix-validate", - "memmap2", - "nom", - "thiserror", -] - -[[package]] -name = "gix-refspec" -version = "0.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba332462bda2e8efeae4302b39a6ed01ad56ef772fd5b7ef197cf2798294d65" -dependencies = [ - "bstr", - "gix-hash 0.10.4", - "gix-revision", - "gix-validate", - "smallvec", - "thiserror", -] - -[[package]] -name = "gix-revision" -version = "0.12.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c6f6ff53f888858afc24bf12628446a14279ceec148df6194481f306f553ad2" -dependencies = [ - "bstr", - "gix-date", - "gix-hash 0.10.4", - "gix-hashtable", - "gix-object", - "thiserror", -] - -[[package]] -name = "gix-sec" -version = "0.6.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8ffa5bf0772f9b01de501c035b6b084cf9b8bb07dec41e3afc6a17336a65f47" -dependencies = [ - "bitflags 1.3.2", - "dirs", - "gix-path", - "libc", - "windows 0.43.0", -] - -[[package]] -name = "gix-tempfile" -version = "5.0.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d71a0d32f34e71e86586124225caefd78dabc605d0486de580d717653addf182" -dependencies = [ - "gix-fs", "libc", - "once_cell", - "parking_lot", - "signal-hook", - "signal-hook-registry", - "tempfile", -] - -[[package]] -name = "gix-traverse" -version = "0.24.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd9a4a07bb22168dc79c60e1a6a41919d198187ca83d8a5940ad8d7122a45df3" -dependencies = [ - "gix-hash 0.10.4", - "gix-hashtable", - "gix-object", - "thiserror", -] - -[[package]] -name = "gix-url" -version = "0.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6a22b4b32ad14d68f7b7fb6458fa58d44b01797d94c1b8f4db2d9c7b3c366b5" -dependencies = [ - "bstr", - "gix-features 0.28.1", - "gix-path", - "home", - "thiserror", - "url", -] - -[[package]] -name = "gix-utils" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c10b69beac219acb8df673187a1f07dde2d74092f974fb3f9eb385aeb667c909" -dependencies = [ - "fastrand", -] - -[[package]] -name = "gix-validate" -version = "0.7.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7bd629d3680773e1785e585d76fd4295b740b559cad9141517300d99a0c8c049" -dependencies = [ - "bstr", - "thiserror", -] - -[[package]] -name = "gix-worktree" -version = "0.15.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54ec9a000b4f24af706c3cc680c7cda235656cbe3216336522f5692773b8a301" -dependencies = [ - "bstr", - "gix-attributes", - "gix-features 0.28.1", - "gix-glob", - "gix-hash 0.10.4", - "gix-index", - "gix-object", - "gix-path", - "io-close", - "thiserror", + "libgit2-sys", + "log", + "url", ] [[package]] @@ -1773,9 +1140,9 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b" [[package]] name = "h2" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f8a914c2987b688368b5138aa05321db91f4090cf26118185672ad588bce21" +checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782" dependencies = [ "bytes", "fnv", @@ -1836,21 +1203,6 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" -[[package]] -name = "hex" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" - -[[package]] -name = "home" -version = "0.5.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5444c27eef6923071f7ebcc33e3444508466a76f7a2b93da00ed6e19f30c1ddb" -dependencies = [ - "windows-sys 0.48.0", -] - [[package]] name = "http" version = "0.2.9" @@ -1917,9 +1269,9 @@ dependencies = [ [[package]] name = "hyper-rustls" -version = "0.23.2" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" +checksum = "0646026eb1b3eea4cd9ba47912ea5ce9cc07713d105b1a14698f4e6433d348b7" dependencies = [ "http", "hyper", @@ -1930,48 +1282,37 @@ dependencies = [ [[package]] name = "iana-time-zone" -version = "0.1.56" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0722cd7114b7de04316e7ea5456a0bbb20e4adb46fd27a3697adb812cff0f37c" +checksum = "2fad5b825842d2b38bd206f3e81d6957625fd7f0a361e345c30e01a0ae2dd613" dependencies = [ "android_system_properties", "core-foundation-sys", "iana-time-zone-haiku", "js-sys", "wasm-bindgen", - "windows 0.48.0", + "windows", ] [[package]] name = "iana-time-zone-haiku" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0703ae284fc167426161c2e3f1da3ea71d94b21bedbcc9494e92b28e334e3dca" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" dependencies = [ - "cxx", - "cxx-build", + "cc", ] [[package]] name = "idna" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e14ddfc70884202db2244c223200c204c2bda1bc6e0998d11b5e024d657209e6" +checksum = "7d20d6b07bfbc108882d88ed8e37d39636dcc260e15e30c45e6ba089610b917c" dependencies = [ "unicode-bidi", "unicode-normalization", ] -[[package]] -name = "imara-diff" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e98c1d0ad70fc91b8b9654b1f33db55e59579d3b3de2bffdced0fdb810570cb8" -dependencies = [ - "ahash", - "hashbrown 0.12.3", -] - [[package]] name = "indexmap" version = "1.9.3" @@ -2003,25 +1344,15 @@ version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" -[[package]] -name = "io-close" -version = "0.3.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cadcf447f06744f8ce713d2d6239bb5bde2c357a452397a9ed90c625da390bc" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "io-lifetimes" -version = "1.0.10" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c66c74d2ae7e79a5a8f7ac924adbe38ee42a859c6539ad869eb51f0b52dc220" +checksum = "eae7b9aee968036d54dce06cebaefd919e4472e753296daccd6d344e3e2df0c2" dependencies = [ "hermit-abi 0.3.1", "libc", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -2039,7 +1370,7 @@ dependencies = [ "hermit-abi 0.3.1", "io-lifetimes", "rustix", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -2068,9 +1399,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.61" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" +checksum = "c5f195fe497f702db0f318b07fdd68edb16955aed830df8363d837542f8f935a" dependencies = [ "wasm-bindgen", ] @@ -2147,15 +1478,15 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.142" +version = "0.2.146" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a987beff54b60ffa6d51982e1aa1146bc42f19bd26be28b0586f252fccf5317" +checksum = "f92be4933c13fd498862a9e02a3055f8a8d9c039ce33db97306fd5a6caa7f29b" [[package]] name = "libflate" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97822bf791bd4d5b403713886a5fbe8bf49520fe78e323b0dc480ca1a03e50b0" +checksum = "5ff4ae71b685bbad2f2f391fe74f6b7659a34871c08b210fdc039e43bee07d18" dependencies = [ "adler32", "crc32fast", @@ -2171,11 +1502,23 @@ dependencies = [ "rle-decode-fast", ] +[[package]] +name = "libgit2-sys" +version = "0.15.2+1.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a80df2e11fb4a61f4ba2ab42dbe7f74468da143f1a75c74e11dee7c813f694fa" +dependencies = [ + "cc", + "libc", + "libz-sys", + "pkg-config", +] + [[package]] name = "libm" -version = "0.2.6" +version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "348108ab3fba42ec82ff6e9564fc4ca0247bdccdc68dd8af9764bbc79c3c8ffb" +checksum = "f7012b1bbb0719e1097c47611d3898568c546d597c2e74d66f6087edd5233ff4" [[package]] name = "libmimalloc-sys" @@ -2188,25 +1531,28 @@ dependencies = [ ] [[package]] -name = "link-cplusplus" -version = "1.0.8" +name = "libz-sys" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ecd207c9c713c34f95a097a5b029ac2ce6010530c7b49d7fea24d977dede04f5" +checksum = "56ee889ecc9568871456d42f603d6a0ce59ff328d291063a45cbdf0036baf6db" dependencies = [ "cc", + "libc", + "pkg-config", + "vcpkg", ] [[package]] name = "linux-raw-sys" -version = "0.3.7" +version = "0.3.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ece97ea872ece730aed82664c424eb4c8291e1ff2480247ccf7409044bc6479f" +checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "lock_api" -version = "0.4.9" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "435011366fe56583b16cf956f9df0095b405b82d76425bc8981c0e22e60ec4df" +checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" dependencies = [ "autocfg", "scopeguard", @@ -2264,15 +1610,6 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" -[[package]] -name = "memmap2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" -dependencies = [ - "libc", -] - [[package]] name = "memoffset" version = "0.8.0" @@ -2297,12 +1634,6 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" -[[package]] -name = "minimal-lexical" -version = "0.2.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" - [[package]] name = "miniz_oxide" version = "0.7.1" @@ -2314,14 +1645,13 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.6" +version = "0.8.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b9d9a46eff5b4ff64b45a9e316a6d1e0bc719ef429cbec4dc630684212bfdf9" +checksum = "927a765cd3fc26206e66b296465fa9d3e5ab003e651c1b3c060e7956d96b19d2" dependencies = [ "libc", - "log", - "wasi 0.11.0+wasi-snapshot-preview1", - "windows-sys 0.45.0", + "wasi", + "windows-sys", ] [[package]] @@ -2330,28 +1660,6 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" -[[package]] -name = "nix" -version = "0.26.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdda3d196821d6af13126e40375cdf7da646a96114af134d5f417a9a1dc8e1a" -dependencies = [ - "bitflags 1.3.2", - "cfg-if", - "libc", - "static_assertions", -] - -[[package]] -name = "nom" -version = "7.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" -dependencies = [ - "memchr", - "minimal-lexical", -] - [[package]] name = "num" version = "0.4.0" @@ -2439,15 +1747,6 @@ dependencies = [ "libc", ] -[[package]] -name = "num_threads" -version = "0.1.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2819ce041d2ee131036f4fc9d6ae7ae125a3a40e97ba64d04fe799ad9dabbb44" -dependencies = [ - "libc", -] - [[package]] name = "object_store" version = "0.5.6" @@ -2478,9 +1777,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.17.1" +version = "1.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" +checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "ordered-float" @@ -2503,22 +1802,22 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.7" +version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9069cbb9f99e3a5083476ccb29ceb1de18b9118cafa53e90c9551235de2b9521" +checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if", "libc", - "redox_syscall 0.2.16", + "redox_syscall", "smallvec", - "windows-sys 0.45.0", + "windows-targets", ] [[package]] name = "parquet" -version = "36.0.0" +version = "40.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "321a15f8332645759f29875b07f8233d16ed8ec1b3582223de81625a9f8506b7" +checksum = "d6a656fcc17e641657c955742c689732684e096f790ff30865d9f8dcc39f7c4a" dependencies = [ "ahash", "arrow-array", @@ -2538,13 +1837,14 @@ dependencies = [ "lz4", "num", "num-bigint", + "object_store", "paste", "seq-macro", "snap", "thrift", "tokio", "twox-hash", - "zstd 0.12.3+zstd.1.5.2", + "zstd", ] [[package]] @@ -2564,9 +1864,9 @@ checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79" [[package]] name = "percent-encoding" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +checksum = "9b2a4787296e9989611394c33f193f676704af1686e70b8f8033ab5ba9a35a94" [[package]] name = "petgraph" @@ -2642,12 +1942,12 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "prettyplease" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1ceca8aaf45b5c46ec7ed39fff75f57290368c1846d33d24a122ca81416ab058" +checksum = "3b69d39aab54d069e7f2fe8cb970493e7834601ca2d8c65fd7bbd183578080d1" dependencies = [ "proc-macro2", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -2658,19 +1958,13 @@ checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" [[package]] name = "proc-macro2" -version = "1.0.56" +version = "1.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435" +checksum = "dec2b086b7a862cf4de201096214fa870344cf922b2b30c167badb3af3195406" dependencies = [ "unicode-ident", ] -[[package]] -name = "prodash" -version = "23.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9516b775656bc3e8985e19cd4b8c0c0de045095074e453d2c0a513b5f978392d" - [[package]] name = "prost" version = "0.11.9" @@ -2822,9 +2116,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.26" +version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc" +checksum = "1b9ab9c7eadfd8df19006f1cf1a4aed13540ed5cbc047010ece5826e10825488" dependencies = [ "proc-macro2", ] @@ -2859,15 +2153,6 @@ dependencies = [ "getrandom", ] -[[package]] -name = "redox_syscall" -version = "0.2.16" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" -dependencies = [ - "bitflags 1.3.2", -] - [[package]] name = "redox_syscall" version = "0.3.5" @@ -2877,51 +2162,28 @@ dependencies = [ "bitflags 1.3.2", ] -[[package]] -name = "redox_users" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" -dependencies = [ - "getrandom", - "redox_syscall 0.2.16", - "thiserror", -] - [[package]] name = "regex" -version = "1.8.1" +version = "1.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af83e617f331cc6ae2da5443c602dfa5af81e517212d9d611a5b3ba1777b5370" +checksum = "d0ab3ca65655bb1e41f2a8c8cd662eb4fb035e67c3f78da1d61dffe89d07300f" dependencies = [ "aho-corasick", "memchr", - "regex-syntax 0.7.1", + "regex-syntax", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" - -[[package]] -name = "regex-syntax" -version = "0.6.29" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" - [[package]] name = "regex-syntax" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5996294f19bd3aae0453a862ad728f60e6600695733dd5df01da90c54363a3c" +checksum = "436b050e76ed2903236f032a59761c1eb99e1b0aead2c257922771dab1fc8c78" [[package]] name = "regress" -version = "0.5.0" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d995d590bd8ec096d1893f414bf3f5e8b0ee4c9eed9a5642b9766ef2c8e2e8e9" +checksum = "82a9ecfa0cb04d0b04dddb99b8ccf4f66bc8dfd23df694b398570bd8ae3a50fb" dependencies = [ "hashbrown 0.13.2", "memchr", @@ -2929,9 +2191,9 @@ dependencies = [ [[package]] name = "reqwest" -version = "0.11.17" +version = "0.11.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13293b639a097af28fc8a90f22add145a9c954e49d77da06263d58cf44d5fb91" +checksum = "cde824a14b7c14f85caff81225f411faacc04a2013f41670f41443742b1c1c55" dependencies = [ "base64", "bytes", @@ -3000,28 +2262,28 @@ dependencies = [ [[package]] name = "rustix" -version = "0.37.19" +version = "0.37.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "acf8729d8542766f1b2cf77eb034d52f40d375bb8b615d0b147089946e16613d" +checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0" dependencies = [ "bitflags 1.3.2", "errno", "io-lifetimes", "libc", "linux-raw-sys", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] name = "rustls" -version = "0.20.8" +version = "0.21.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fff78fc74d175294f4e83b28343315ffcfb114b156f0185e9741cb5570f50e2f" +checksum = "c911ba11bc8433e811ce56fde130ccf32f5127cab0e0194e9c68c5a5b671791e" dependencies = [ "log", "ring", + "rustls-webpki", "sct", - "webpki", ] [[package]] @@ -3033,6 +2295,16 @@ dependencies = [ "base64", ] +[[package]] +name = "rustls-webpki" +version = "0.100.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6207cd5ed3d8dca7816f8f3725513a34609c0c765bf652b8c3cb4cfd87db46b" +dependencies = [ + "ring", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.12" @@ -3084,12 +2356,6 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" -[[package]] -name = "scratch" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1792db035ce95be60c3f8853017b3999209281c24e2ba5bc8e59bf97a0c590c1" - [[package]] name = "sct" version = "0.7.0" @@ -3114,22 +2380,22 @@ checksum = "e6b44e8fc93a14e66336d230954dda83d18b4605ccace8fe09bc7514a71ad0bc" [[package]] name = "serde" -version = "1.0.160" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb2f3770c8bce3bcda7e149193a069a0f4365bda1fa5cd88e03bca26afc1216c" +checksum = "9e8c8cf938e98f769bc164923b06dce91cea1751522f46f8466461af04c9027d" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.160" +version = "1.0.164" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291a097c63d8497e00160b166a967a4a79c64f3facdd01cbd7502231688d77df" +checksum = "d9735b638ccc51c28bf6914d90a2e9725b377144fc612c49a611fddd1b631d68" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -3156,13 +2422,14 @@ dependencies = [ [[package]] name = "serde_tokenstream" -version = "0.1.7" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "797ba1d80299b264f3aac68ab5d12e5825a561749db4df7cd7c8083900c5d4e9" +checksum = "8a00ffd23fd882d096f09fcaae2a9de8329a328628e86027e049ee051dc1621f" dependencies = [ "proc-macro2", + "quote", "serde", - "syn 1.0.109", + "syn 2.0.18", ] [[package]] @@ -3190,12 +2457,6 @@ dependencies = [ "unsafe-libyaml", ] -[[package]] -name = "sha1_smol" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012" - [[package]] name = "sha2" version = "0.10.6" @@ -3207,25 +2468,6 @@ dependencies = [ "digest", ] -[[package]] -name = "signal-hook" -version = "0.3.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "732768f1176d21d09e076c23a93123d40bba92d50c4058da34d45c8de8e682b9" -dependencies = [ - "libc", - "signal-hook-registry", -] - -[[package]] -name = "signal-hook-registry" -version = "1.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8229b473baa5980ac72ef434c4415e70c4b5e71b423043adb4ba059f89c99a1" -dependencies = [ - "libc", -] - [[package]] name = "siphasher" version = "0.3.10" @@ -3293,9 +2535,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "sqlparser" -version = "0.32.0" +version = "0.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0366f270dbabb5cc2e4c88427dc4c08bba144f81e32fbd459a013f26a4d16aa0" +checksum = "37d3706eefb17039056234df6b566b0014f303f867f2656108334a55b8096f59" dependencies = [ "log", "sqlparser_derive", @@ -3323,6 +2565,9 @@ name = "strum" version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" +dependencies = [ + "strum_macros", +] [[package]] name = "strum_macros" @@ -3339,11 +2584,11 @@ dependencies = [ [[package]] name = "substrait" -version = "0.7.5" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3ae64fb7ad0670c7d6d53d57b1b91beb2212afc30e164cc8edb02d6b2cff32a" +checksum = "9df5d9e071804204172dc77e707c363f187e7f6566f9c78e5100c9a8f5ea434e" dependencies = [ - "gix", + "git2", "heck", "prettyplease", "prost", @@ -3354,16 +2599,16 @@ dependencies = [ "serde", "serde_json", "serde_yaml", - "syn 2.0.15", + "syn 2.0.18", "typify", "walkdir", ] [[package]] name = "subtle" -version = "2.4.1" +version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6bdef32e8150c2a081110b42772ffe7d7c9032b606bc226c8260fd97e0976601" +checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" [[package]] name = "syn" @@ -3378,9 +2623,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.15" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a34fcf3e8b60f57e6a14301a2e916d323af98b0ea63c599441eec8558660c822" +checksum = "32d41677bcbe24c20c52e7c70b0d8db04134c5d1066bf98662e2871ad200ea3e" dependencies = [ "proc-macro2", "quote", @@ -3395,15 +2640,16 @@ checksum = "fd1ba337640d60c3e96bc6f0638a939b9c9a7f2c316a1598c279828b3d1dc8c5" [[package]] name = "tempfile" -version = "3.5.0" +version = "3.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b9fbec84f381d5795b08656e4912bec604d162bff9291d6189a78f4c8ab87998" +checksum = "31c0432476357e58790aaa47a8efb0c5138f137343f3b5f23bd36a27e3b0a6d6" dependencies = [ + "autocfg", "cfg-if", "fastrand", - "redox_syscall 0.3.5", + "redox_syscall", "rustix", - "windows-sys 0.45.0", + "windows-sys", ] [[package]] @@ -3432,7 +2678,7 @@ checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] @@ -3446,46 +2692,6 @@ dependencies = [ "ordered-float", ] -[[package]] -name = "time" -version = "0.1.45" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b797afad3f312d1c66a56d11d0316f916356d11bd158fbc6ca6389ff6bf805a" -dependencies = [ - "libc", - "wasi 0.10.0+wasi-snapshot-preview1", - "winapi", -] - -[[package]] -name = "time" -version = "0.3.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0cbfecb4d19b5ea75bb31ad904eb5b9fa13f21079c3b92017ebdf4999a5890" -dependencies = [ - "itoa", - "libc", - "num_threads", - "serde", - "time-core", - "time-macros", -] - -[[package]] -name = "time-core" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2e153e1f1acaef8acc537e68b44906d2db6436e2b35ac2c6b42640fff91f00fd" - -[[package]] -name = "time-macros" -version = "0.2.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd80a657e71da814b8e5d60d3374fc6d35045062245d80224748ae522dd76f36" -dependencies = [ - "time-core", -] - [[package]] name = "tiny-keccak" version = "2.0.2" @@ -3512,9 +2718,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.28.0" +version = "1.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3c786bf8134e5a3a166db9b29ab8f48134739014a3eca7bc6bfa95d673b136f" +checksum = "94d7b1cfd2aa4011f2de74c2c4c63665e27a71006b0a192dcd2710272e73dfa2" dependencies = [ "autocfg", "bytes", @@ -3525,7 +2731,7 @@ dependencies = [ "pin-project-lite", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys", ] [[package]] @@ -3536,18 +2742,17 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] name = "tokio-rustls" -version = "0.23.4" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ "rustls", "tokio", - "webpki", ] [[package]] @@ -3601,14 +2806,14 @@ checksum = "0f57e3ca2a01450b1a921183a9c9cbfda207fd822cef4ccb00a65402cbba7a74" dependencies = [ "proc-macro2", "quote", - "syn 2.0.15", + "syn 2.0.18", ] [[package]] name = "tracing-core" -version = "0.1.30" +version = "0.1.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24eb03ba0eab1fd845050058ce5e616558e8f8d8fca633e6b163fe25c797213a" +checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" dependencies = [ "once_cell", ] @@ -3648,9 +2853,9 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "typify" -version = "0.0.11" +version = "0.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30bfde96849e25d7feef1bbf652e9cfc51deb63203fdc07b115b8bc3bcfe20b9" +checksum = "a6658d09e71bfe59e7987dc95ee7f71809fdb5793ab0cdc1503cc0073990484d" dependencies = [ "typify-impl", "typify-macro", @@ -3658,9 +2863,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.0.11" +version = "0.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95d27d749378ceab6ec22188ed7ad102205c89ddb92ab662371c850ffc71aa1a" +checksum = "34d3bb47587b13edf526d6ed02bf360ecefe083ab47a4ef29fc43112828b2bef" dependencies = [ "heck", "log", @@ -3669,16 +2874,16 @@ dependencies = [ "regress", "schemars", "serde_json", - "syn 1.0.109", + "syn 2.0.18", "thiserror", "unicode-ident", ] [[package]] name = "typify-macro" -version = "0.0.11" +version = "0.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35db6fc2bd9220ecdac6eeb88158824b83610de3dda0c6d0f2142b49efd858b0" +checksum = "d3f7e627c18be12d53bc1f261830b9c2763437b6a86ac57293b9085af2d32ffe" dependencies = [ "proc-macro2", "quote", @@ -3686,7 +2891,7 @@ dependencies = [ "serde", "serde_json", "serde_tokenstream", - "syn 1.0.109", + "syn 2.0.18", "typify-impl", ] @@ -3696,17 +2901,11 @@ version = "0.3.13" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460" -[[package]] -name = "unicode-bom" -version = "1.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63ec69f541d875b783ca40184d655f2927c95f0bffd486faa83cd3ac3529ec32" - [[package]] name = "unicode-ident" -version = "1.0.8" +version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4" +checksum = "b15811caf2415fb889178633e7724bad2509101cde276048e013b9def5e51fa0" [[package]] name = "unicode-normalization" @@ -3749,9 +2948,9 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "url" -version = "2.3.1" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d68c799ae75762b8c3fe375feb6600ef5602c883c5d21eb51c09f22b83c4643" +checksum = "50bff7831e19200a85b17131d085c25d7811bc4e186efdaf54bbd132994a88cb" dependencies = [ "form_urlencoded", "idna", @@ -3760,14 +2959,20 @@ dependencies = [ [[package]] name = "uuid" -version = "1.3.2" +version = "1.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4dad5567ad0cf5b760e5665964bec1b47dfd077ba8a2544b513f3556d3d239a2" +checksum = "0fa2982af2eec27de306107c027578ff7f423d65f7250e40ce0fea8f45248b81" dependencies = [ "getrandom", "serde", ] +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.4" @@ -3794,12 +2999,6 @@ dependencies = [ "try-lock", ] -[[package]] -name = "wasi" -version = "0.10.0+wasi-snapshot-preview1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a143597ca7c7793eff794def352d41792a93c481eb1042423ff7ff72ba2c31f" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" @@ -3808,9 +3007,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "7706a72ab36d8cb1f80ffbf0e071533974a60d0a308d01a5d0375bf60499a342" dependencies = [ "cfg-if", "wasm-bindgen-macro", @@ -3818,24 +3017,24 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "5ef2b6d3c510e9625e5fe6f509ab07d66a760f0885d858736483c32ed7809abd" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.34" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" +checksum = "c02dbc21516f9f1f04f187958890d7e6026df8d16540b7ad9492bc34a67cea03" dependencies = [ "cfg-if", "js-sys", @@ -3845,9 +3044,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "dee495e55982a3bd48105a7b947fd2a9b4a8ae3010041b9e0faab3f9cd028f1d" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -3855,22 +3054,22 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.18", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" +version = "0.2.87" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" +checksum = "ca6ad05a4870b2bf5fe995117d3728437bd27d7cd5f06f13c17443ef369775a1" [[package]] name = "wasm-streams" @@ -3887,9 +3086,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.61" +version = "0.3.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e33b99f4b23ba3eec1a53ac264e35a755f00e966e0065077d6027c0f575b0b97" +checksum = "9b85cbef8c220a6abc02aefd892dfc0fc23afb1c6a426316ec33253a3877249b" dependencies = [ "js-sys", "wasm-bindgen", @@ -3956,37 +3155,13 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" -[[package]] -name = "windows" -version = "0.43.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04662ed0e3e5630dfa9b26e4cb823b817f1a9addda855d973a9458c236556244" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", -] - [[package]] name = "windows" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e686886bc078bc1b0b600cac0147aadb815089b6e4da64016cbd754b6342700f" dependencies = [ - "windows-targets 0.48.0", -] - -[[package]] -name = "windows-sys" -version = "0.45.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "75283be5efb2831d37ea142365f009c02ec203cd29a3ebecbc093d52315b66d0" -dependencies = [ - "windows-targets 0.42.2", + "windows-targets", ] [[package]] @@ -3995,22 +3170,7 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets 0.48.0", -] - -[[package]] -name = "windows-targets" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e5180c00cd44c9b1c88adb3693291f1cd93605ded80c250a75d472756b4d071" -dependencies = [ - "windows_aarch64_gnullvm 0.42.2", - "windows_aarch64_msvc 0.42.2", - "windows_i686_gnu 0.42.2", - "windows_i686_msvc 0.42.2", - "windows_x86_64_gnu 0.42.2", - "windows_x86_64_gnullvm 0.42.2", - "windows_x86_64_msvc 0.42.2", + "windows-targets", ] [[package]] @@ -4019,93 +3179,51 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7b1eb6f0cd7c80c79759c929114ef071b87354ce476d9d94271031c0497adfd5" dependencies = [ - "windows_aarch64_gnullvm 0.48.0", - "windows_aarch64_msvc 0.48.0", - "windows_i686_gnu 0.48.0", - "windows_i686_msvc 0.48.0", - "windows_x86_64_gnu 0.48.0", - "windows_x86_64_gnullvm 0.48.0", - "windows_x86_64_msvc 0.48.0", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "91ae572e1b79dba883e0d315474df7305d12f569b400fcf90581b06062f7e1bc" -[[package]] -name = "windows_aarch64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43" - [[package]] name = "windows_aarch64_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2ef27e0d7bdfcfc7b868b317c1d32c641a6fe4629c171b8928c7b08d98d7cf3" -[[package]] -name = "windows_i686_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f" - [[package]] name = "windows_i686_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "622a1962a7db830d6fd0a69683c80a18fda201879f0f447f065a3b7467daa241" -[[package]] -name = "windows_i686_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060" - [[package]] name = "windows_i686_msvc" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4542c6e364ce21bf45d69fdd2a8e455fa38d316158cfd43b3ac1c5b1b19f8e00" -[[package]] -name = "windows_x86_64_gnu" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36" - [[package]] name = "windows_x86_64_gnu" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca2b8a661f7628cbd23440e50b05d705db3686f894fc9580820623656af974b1" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3" - [[package]] name = "windows_x86_64_gnullvm" version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7896dbc1f41e08872e9d5e8f8baa8fdd2677f29468c4e156210174edc7f7b953" -[[package]] -name = "windows_x86_64_msvc" -version = "0.42.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0" - [[package]] name = "windows_x86_64_msvc" version = "0.48.0" @@ -4151,32 +3269,13 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "zstd" -version = "0.11.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" -dependencies = [ - "zstd-safe 5.0.2+zstd.1.5.2", -] - [[package]] name = "zstd" version = "0.12.3+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76eea132fb024e0e13fd9c2f5d5d595d8a967aa72382ac2f9d39fcc95afd0806" dependencies = [ - "zstd-safe 6.0.5+zstd.1.5.4", -] - -[[package]] -name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" -dependencies = [ - "libc", - "zstd-sys", + "zstd-safe", ] [[package]] diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml index eb12bff27..3afada895 100644 --- a/dask_planner/Cargo.toml +++ b/dask_planner/Cargo.toml @@ -10,7 +10,7 @@ rust-version = "1.65" [dependencies] async-trait = "0.1.68" -datafusion-python = { git = "https://github.com/apache/arrow-datafusion-python.git", rev = "9493638" } +datafusion-python = { git = "https://github.com/jdye64/arrow-datafusion-python.git", branch = "logical_extension" } env_logger = "0.10" log = "^0.4" pyo3 = { version = "0.18.3", features = ["extension-module", "abi3", "abi3-py38"] } diff --git a/dask_planner/src/dialect.rs b/dask_planner/src/dialect.rs index 9fe013f3d..4876c8097 100644 --- a/dask_planner/src/dialect.rs +++ b/dask_planner/src/dialect.rs @@ -77,6 +77,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "floor" => { @@ -108,6 +109,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "timestampadd" => { @@ -136,6 +138,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "timestampdiff" => { @@ -163,6 +166,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } Token::Word(w) if w.value.to_lowercase() == "to_timestamp" => { @@ -192,6 +196,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } _ => Ok(None), diff --git a/dask_planner/src/expression.rs b/dask_planner/src/expression.rs index aa1a60a9b..80e7b88fa 100644 --- a/dask_planner/src/expression.rs +++ b/dask_planner/src/expression.rs @@ -4,7 +4,20 @@ use datafusion_python::{ datafusion::arrow::datatypes::DataType, datafusion_common::{Column, DFField, DFSchema, ScalarValue}, datafusion_expr::{ - expr::{AggregateFunction, BinaryExpr, Cast, Sort, TryCast, WindowFunction}, + expr::{ + AggregateFunction, + AggregateUDF, + BinaryExpr, + Cast, + Exists, + InList, + InSubquery, + ScalarFunction, + ScalarUDF, + Sort, + TryCast, + WindowFunction, + }, lit, utils::exprlist_to_fields, Between, @@ -330,15 +343,15 @@ impl PyExpr { | Expr::Cast(Cast { expr, .. }) | Expr::TryCast(TryCast { expr, .. }) | Expr::Sort(Sort { expr, .. }) - | Expr::InSubquery { expr, .. } => { + | Expr::InSubquery(InSubquery { expr, .. }) => { Ok(vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]) } // Expr variants containing a collection of Expr(s) for operands Expr::AggregateFunction(AggregateFunction { args, .. }) - | Expr::AggregateUDF { args, .. } - | Expr::ScalarFunction { args, .. } - | Expr::ScalarUDF { args, .. } + | Expr::AggregateUDF(AggregateUDF { args, .. }) + | Expr::ScalarFunction(ScalarFunction { args, .. }) + | Expr::ScalarUDF(ScalarUDF { args, .. }) | Expr::WindowFunction(WindowFunction { args, .. }) => Ok(args .iter() .map(|arg| PyExpr::from(arg.clone(), self.input_plan.clone())) @@ -377,7 +390,7 @@ impl PyExpr { Ok(operands) } - Expr::InList { expr, list, .. } => { + Expr::InList(InList { expr, list, .. }) => { let mut operands: Vec = vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]; for list_elem in list { @@ -435,8 +448,8 @@ impl PyExpr { op, right: _, }) => format!("{op}"), - Expr::ScalarFunction { fun, args: _ } => format!("{fun}"), - Expr::ScalarUDF { fun, .. } => fun.name.clone(), + Expr::ScalarFunction(ScalarFunction { fun, args: _ }) => format!("{fun}"), + Expr::ScalarUDF(ScalarUDF { fun, .. }) => fun.name.clone(), Expr::Cast { .. } => "cast".to_string(), Expr::Between { .. } => "between".to_string(), Expr::Case { .. } => "case".to_string(), @@ -557,7 +570,7 @@ impl PyExpr { ScalarValue::Struct(..) => "Struct", ScalarValue::FixedSizeBinary(_, _) => "FixedSizeBinary", }, - Expr::ScalarFunction { fun, args: _ } => match fun { + Expr::ScalarFunction(ScalarFunction { fun, args: _ }) => match fun { BuiltinScalarFunction::Abs => "Abs", BuiltinScalarFunction::DatePart => "DatePart", _ => { @@ -639,7 +652,7 @@ impl PyExpr { match &self.expr { Expr::Alias(expr, _) => match expr.as_ref() { Expr::AggregateFunction(AggregateFunction { filter, .. }) - | Expr::AggregateUDF { filter, .. } => match filter { + | Expr::AggregateUDF(AggregateUDF { filter, .. }) => match filter { Some(filter) => { Ok(Some(PyExpr::from(*filter.clone(), self.input_plan.clone()))) } @@ -650,7 +663,7 @@ impl PyExpr { )), }, Expr::AggregateFunction(AggregateFunction { filter, .. }) - | Expr::AggregateUDF { filter, .. } => match filter { + | Expr::AggregateUDF(AggregateUDF { filter, .. }) => match filter { Some(filter) => Ok(Some(PyExpr::from(*filter.clone(), self.input_plan.clone()))), None => Ok(None), }, @@ -739,7 +752,10 @@ impl PyExpr { ScalarValue::TimestampNanosecond(iv, tz) | ScalarValue::TimestampMicrosecond(iv, tz) | ScalarValue::TimestampMillisecond(iv, tz) - | ScalarValue::TimestampSecond(iv, tz) => Ok((*iv, tz.clone())), + | ScalarValue::TimestampSecond(iv, tz) => match tz { + Some(time_zone) => Ok((*iv, Some(time_zone.to_string()))), + None => Ok((*iv, None)), + }, other => Err(unexpected_literal_value(other)), } } @@ -790,9 +806,9 @@ impl PyExpr { pub fn is_negated(&self) -> PyResult { match &self.expr { Expr::Between(Between { negated, .. }) - | Expr::Exists { negated, .. } - | Expr::InList { negated, .. } - | Expr::InSubquery { negated, .. } => Ok(*negated), + | Expr::Exists(Exists { negated, .. }) + | Expr::InList(InList { negated, .. }) + | Expr::InSubquery(InSubquery { negated, .. }) => Ok(*negated), _ => Err(py_type_err(format!( "unknown Expr type {:?} encountered", &self.expr diff --git a/dask_planner/src/sql/logical.rs b/dask_planner/src/sql/logical.rs index d2096ba9b..890f9aacb 100644 --- a/dask_planner/src/sql/logical.rs +++ b/dask_planner/src/sql/logical.rs @@ -37,7 +37,7 @@ pub mod window; use datafusion_python::{ datafusion_common::{DFSchemaRef, DataFusionError}, - datafusion_expr::LogicalPlan, + datafusion_expr::{DdlStatement, LogicalPlan}, }; use pyo3::prelude::*; @@ -315,18 +315,19 @@ impl PyLogicalPlan { LogicalPlan::TableScan(_table_scan) => "TableScan", LogicalPlan::EmptyRelation(_empty_relation) => "EmptyRelation", LogicalPlan::Limit(_limit) => "Limit", - LogicalPlan::CreateExternalTable(_create_external_table) => "CreateExternalTable", - LogicalPlan::CreateMemoryTable(_create_memory_table) => "CreateMemoryTable", - LogicalPlan::DropTable(_drop_table) => "DropTable", - LogicalPlan::DropView(_drop_view) => "DropView", + LogicalPlan::Ddl(DdlStatement::CreateExternalTable { .. }) => "CreateExternalTable", + LogicalPlan::Ddl(DdlStatement::CreateMemoryTable { .. }) => "CreateMemoryTable", + LogicalPlan::Ddl(DdlStatement::DropTable { .. }) => "DropTable", + LogicalPlan::Ddl(DdlStatement::DropView { .. }) => "DropView", LogicalPlan::Values(_values) => "Values", LogicalPlan::Explain(_explain) => "Explain", LogicalPlan::Analyze(_analyze) => "Analyze", LogicalPlan::Subquery(_sub_query) => "Subquery", LogicalPlan::SubqueryAlias(_sqalias) => "SubqueryAlias", - LogicalPlan::CreateCatalogSchema(_create) => "CreateCatalogSchema", - LogicalPlan::CreateCatalog(_create_catalog) => "CreateCatalog", - LogicalPlan::CreateView(_create_view) => "CreateView", + LogicalPlan::Ddl(DdlStatement::CreateCatalogSchema { .. }) => "CreateCatalogSchema", + LogicalPlan::Ddl(DdlStatement::DropCatalogSchema { .. }) => "DropCatalogSchema", + LogicalPlan::Ddl(DdlStatement::CreateCatalog { .. }) => "CreateCatalog", + LogicalPlan::Ddl(DdlStatement::CreateView { .. }) => "CreateView", LogicalPlan::Statement(_) => "Statement", // Further examine and return the name that is a possible Dask-SQL Extension type LogicalPlan::Extension(extension) => { diff --git a/dask_planner/src/sql/logical/aggregate.rs b/dask_planner/src/sql/logical/aggregate.rs index 0acc8b86e..870d8d7ab 100644 --- a/dask_planner/src/sql/logical/aggregate.rs +++ b/dask_planner/src/sql/logical/aggregate.rs @@ -1,5 +1,5 @@ use datafusion_python::datafusion_expr::{ - expr::AggregateFunction, + expr::{AggregateFunction, AggregateUDF}, logical_plan::{Aggregate, Distinct}, Expr, LogicalPlan, @@ -75,7 +75,7 @@ impl PyAggregate { match expr { Expr::Alias(expr, _) => self._aggregation_arguments(expr.as_ref()), Expr::AggregateFunction(AggregateFunction { fun: _, args, .. }) - | Expr::AggregateUDF { fun: _, args, .. } => match &self.aggregate { + | Expr::AggregateUDF(AggregateUDF { fun: _, args, .. }) => match &self.aggregate { Some(e) => py_expr_list(&e.input, args), None => Ok(vec![]), }, @@ -90,7 +90,7 @@ fn _agg_func_name(expr: &Expr) -> PyResult { match expr { Expr::Alias(expr, _) => _agg_func_name(expr.as_ref()), Expr::AggregateFunction(AggregateFunction { fun, .. }) => Ok(fun.to_string()), - Expr::AggregateUDF { fun, .. } => Ok(fun.name.clone()), + Expr::AggregateUDF(AggregateUDF { fun, .. }) => Ok(fun.name.clone()), _ => Err(py_type_err( "Encountered a non Aggregate type in agg_func_name", )), diff --git a/dask_planner/src/sql/logical/create_memory_table.rs b/dask_planner/src/sql/logical/create_memory_table.rs index 668295e0f..dd3d0753d 100644 --- a/dask_planner/src/sql/logical/create_memory_table.rs +++ b/dask_planner/src/sql/logical/create_memory_table.rs @@ -1,5 +1,6 @@ use datafusion_python::datafusion_expr::{ logical_plan::{CreateMemoryTable, CreateView}, + DdlStatement, LogicalPlan, }; use pyo3::prelude::*; @@ -85,13 +86,13 @@ impl TryFrom for PyCreateMemoryTable { fn try_from(logical_plan: LogicalPlan) -> Result { Ok(match logical_plan { - LogicalPlan::CreateMemoryTable(create_memory_table) => PyCreateMemoryTable { - create_memory_table: Some(create_memory_table), + LogicalPlan::Ddl(DdlStatement::CreateMemoryTable(cmt)) => PyCreateMemoryTable { + create_memory_table: Some(cmt), create_view: None, }, - LogicalPlan::CreateView(create_view) => PyCreateMemoryTable { + LogicalPlan::Ddl(DdlStatement::CreateView(cv)) => PyCreateMemoryTable { create_memory_table: None, - create_view: Some(create_view), + create_view: Some(cv), }, _ => return Err(py_type_err("unexpected plan")), }) diff --git a/dask_planner/src/sql/logical/drop_table.rs b/dask_planner/src/sql/logical/drop_table.rs index 7d58e8a47..f91baf28a 100644 --- a/dask_planner/src/sql/logical/drop_table.rs +++ b/dask_planner/src/sql/logical/drop_table.rs @@ -1,4 +1,7 @@ -use datafusion_python::datafusion_expr::logical_plan::{DropTable, LogicalPlan}; +use datafusion_python::datafusion_expr::{ + logical_plan::{DropTable, LogicalPlan}, + DdlStatement, +}; use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; @@ -27,7 +30,7 @@ impl TryFrom for PyDropTable { fn try_from(logical_plan: LogicalPlan) -> Result { match logical_plan { - LogicalPlan::DropTable(drop_table) => Ok(PyDropTable { drop_table }), + LogicalPlan::Ddl(DdlStatement::DropTable(drop_table)) => Ok(PyDropTable { drop_table }), _ => Err(py_type_err("unexpected plan")), } } diff --git a/dask_planner/src/sql/logical/subquery_alias.rs b/dask_planner/src/sql/logical/subquery_alias.rs index 1b23e5dc4..003e02045 100644 --- a/dask_planner/src/sql/logical/subquery_alias.rs +++ b/dask_planner/src/sql/logical/subquery_alias.rs @@ -14,7 +14,7 @@ impl PySubqueryAlias { /// Returns a Vec of the sort expressions #[pyo3(name = "getAlias")] pub fn alias(&self) -> PyResult { - Ok(self.subquery_alias.alias.clone()) + Ok(self.subquery_alias.alias.clone().to_string()) } } diff --git a/dask_planner/src/sql/logical/table_scan.rs b/dask_planner/src/sql/logical/table_scan.rs index 679d24c49..c54b53556 100644 --- a/dask_planner/src/sql/logical/table_scan.rs +++ b/dask_planner/src/sql/logical/table_scan.rs @@ -2,7 +2,7 @@ use std::sync::Arc; use datafusion_python::{ datafusion_common::{DFSchema, ScalarValue}, - datafusion_expr::{logical_plan::TableScan, Expr, LogicalPlan}, + datafusion_expr::{expr::InList, logical_plan::TableScan, Expr, LogicalPlan}, }; use pyo3::prelude::*; @@ -50,11 +50,11 @@ impl PyTableScan { let mut filter_tuple: Vec<(String, String, Vec)> = Vec::new(); match filter { - Expr::InList { + Expr::InList(InList { expr, list, negated, - } => { + }) => { // Only handle simple Expr(s) for InList operations for now if PyTableScan::_valid_expr_type(list) { // While ANSI SQL would not allow for anything other than a Column or Literal diff --git a/dask_planner/src/sql/optimizer.rs b/dask_planner/src/sql/optimizer.rs index 68577cf2c..cc86d2387 100644 --- a/dask_planner/src/sql/optimizer.rs +++ b/dask_planner/src/sql/optimizer.rs @@ -4,8 +4,6 @@ use datafusion_python::{ datafusion_common::DataFusionError, datafusion_expr::LogicalPlan, datafusion_optimizer::{ - decorrelate_where_exists::DecorrelateWhereExists, - decorrelate_where_in::DecorrelateWhereIn, eliminate_cross_join::EliminateCrossJoin, eliminate_limit::EliminateLimit, eliminate_outer_join::EliminateOuterJoin, @@ -43,8 +41,6 @@ impl DaskSqlOptimizer { Arc::new(SimplifyExpressions::new()), Arc::new(UnwrapCastInComparison::new()), // Arc::new(ReplaceDistinctWithAggregate::new()), - Arc::new(DecorrelateWhereExists::new()), - Arc::new(DecorrelateWhereIn::new()), Arc::new(ScalarSubqueryToJoin::new()), //Arc::new(ExtractEquijoinPredicate::new()), diff --git a/dask_planner/src/sql/table.rs b/dask_planner/src/sql/table.rs index f25f891ec..10fe97b1a 100644 --- a/dask_planner/src/sql/table.rs +++ b/dask_planner/src/sql/table.rs @@ -2,7 +2,7 @@ use std::{any::Any, sync::Arc}; use async_trait::async_trait; use datafusion_python::{ - datafusion::arrow::datatypes::{DataType, Field, SchemaRef}, + datafusion::arrow::datatypes::{DataType, Fields, SchemaRef}, datafusion_common::DFField, datafusion_expr::{Expr, LogicalPlan, TableProviderFilterPushDown, TableSource}, datafusion_optimizer::utils::split_conjunction, @@ -195,7 +195,7 @@ pub(crate) fn table_from_logical_plan( // Get the TableProvider for this Table instance let tbl_provider: Arc = table_scan.source.clone(); let tbl_schema: SchemaRef = tbl_provider.schema(); - let fields: &Vec = tbl_schema.fields(); + let fields: &Fields = tbl_schema.fields(); let mut cols: Vec<(String, DaskTypeMap)> = Vec::new(); for field in fields { diff --git a/dask_planner/src/sql/types.rs b/dask_planner/src/sql/types.rs index ceff904a6..4642a4eb0 100644 --- a/dask_planner/src/sql/types.rs +++ b/dask_planner/src/sql/types.rs @@ -1,6 +1,8 @@ pub mod rel_data_type; pub mod rel_data_type_field; +use std::sync::Arc; + use datafusion_python::{ datafusion::arrow::datatypes::{DataType, IntervalUnit, TimeUnit}, datafusion_sql::sqlparser::{ast::DataType as SQLType, parser::Parser, tokenizer::Tokenizer}, @@ -54,10 +56,12 @@ impl DaskTypeMap { SqlTypeName::TIMESTAMP_WITH_LOCAL_TIME_ZONE => { let (unit, tz) = match py_kwargs { Some(dict) => { - let tz: Option = match dict.get_item("tz") { + let tz: Option> = match dict.get_item("tz") { Some(e) => { let res: PyResult = e.extract(); - Some(res.unwrap()) + Some(Arc::from(>::as_ref( + &res.unwrap(), + ))) } None => None, }; @@ -85,10 +89,12 @@ impl DaskTypeMap { SqlTypeName::TIMESTAMP => { let (unit, tz) = match py_kwargs { Some(dict) => { - let tz: Option = match dict.get_item("tz") { + let tz: Option> = match dict.get_item("tz") { Some(e) => { let res: PyResult = e.extract(); - Some(res.unwrap()) + Some(Arc::from(>::as_ref( + &res.unwrap(), + ))) } None => None, }; From c59cdbd838ce868051aa88f9d275197c31547cfe Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 14 Jun 2023 14:52:02 -0400 Subject: [PATCH 02/89] warn on optimization failure instead of erroring and exiting --- dask_sql/context.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/dask_sql/context.py b/dask_sql/context.py index 837f7cd1c..d2247b1ac 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -42,7 +42,7 @@ from dask_sql.mappings import python_to_sql_type from dask_sql.physical.rel import RelConverter, custom, logical from dask_sql.physical.rex import RexConverter, core -from dask_sql.utils import OptimizationException, ParsingException +from dask_sql.utils import ParsingException logger = logging.getLogger(__name__) @@ -824,8 +824,9 @@ def _get_ral(self, sql): try: rel = self.context.optimize_relational_algebra(nonOptimizedRel) except DFOptimizationException as oe: + # Use original plan and warn about inability to optimize plan rel = nonOptimizedRel - raise OptimizationException(str(oe)) from None + logger.warn(str(oe)) else: rel = nonOptimizedRel From 5c02c5aafb46bbb377ae227e6448980a350008a9 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 30 Jun 2023 14:17:45 -0700 Subject: [PATCH 03/89] Resolve initial build errors --- dask_planner/src/dialect.rs | 1 + .../src/sql/optimizer/dynamic_partition_pruning.rs | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dask_planner/src/dialect.rs b/dask_planner/src/dialect.rs index e1067d3f0..da4e213e1 100644 --- a/dask_planner/src/dialect.rs +++ b/dask_planner/src/dialect.rs @@ -226,6 +226,7 @@ impl Dialect for DaskDialect { over: None, distinct: false, special: false, + order_by: vec![], }))) } _ => Ok(None), diff --git a/dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs b/dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs index 0ff48a682..ac931b560 100644 --- a/dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs +++ b/dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs @@ -22,6 +22,7 @@ use datafusion_python::{ }, datafusion_common::{Column, Result, ScalarValue}, datafusion_expr::{ + expr::InList, logical_plan::LogicalPlan, utils::from_plan, Expr, @@ -433,13 +434,13 @@ fn gather_aliases(plan: &LogicalPlan) -> HashMap { if let LogicalPlan::SubqueryAlias(ref s) = current_plan { match *s.input { LogicalPlan::TableScan(ref t) => { - aliases.insert(s.alias.clone(), t.table_name.to_string().clone()); + aliases.insert(s.alias.to_string(), t.table_name.to_string().clone()); } // Sometimes a TableScan is immediately followed by a Projection, so we can // still use the alias for the table LogicalPlan::Projection(ref p) => { if let LogicalPlan::TableScan(ref t) = *p.input { - aliases.insert(s.alias.clone(), t.table_name.to_string().clone()); + aliases.insert(s.alias.to_string(), t.table_name.to_string().clone()); } } _ => (), @@ -1053,11 +1054,11 @@ fn format_inlist_expr( if list.is_empty() { None } else { - Some(Expr::InList { + Some(Expr::InList(InList { expr, list, negated: false, - }) + })) } } From 515dae6a9ffdbc7f2e240bcaf94f7154e90c65c5 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 7 Jul 2023 10:45:38 -0700 Subject: [PATCH 04/89] Switch to crates release, add zlib to host/build deps --- continuous_integration/recipe/meta.yaml | 1 + dask_planner/Cargo.lock | 3 ++- dask_planner/Cargo.toml | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 5152cfc4e..d8ee1e0d1 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -32,6 +32,7 @@ requirements: - python - setuptools-rust - libprotobuf + - zlib run: - python - dask >=2022.3.0 diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index 615d90ab5..073939f3f 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -819,7 +819,8 @@ dependencies = [ [[package]] name = "datafusion-python" version = "26.0.0" -source = "git+https://github.com/jdye64/arrow-datafusion-python.git?branch=logical_extension#ce83d8a837ac2bd8473cbe3432cebd3eb90a6e1c" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d406c7f116547044c2039be6d055c19c680a4ab8b1a550f0403c0ae276dff3c5" dependencies = [ "async-trait", "datafusion", diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml index 3afada895..f688956dc 100644 --- a/dask_planner/Cargo.toml +++ b/dask_planner/Cargo.toml @@ -10,7 +10,7 @@ rust-version = "1.65" [dependencies] async-trait = "0.1.68" -datafusion-python = { git = "https://github.com/jdye64/arrow-datafusion-python.git", branch = "logical_extension" } +datafusion-python = "26.0.0" env_logger = "0.10" log = "^0.4" pyo3 = { version = "0.18.3", features = ["extension-module", "abi3", "abi3-py38"] } From ef399e897fd7278ed55c8a644f78fa160561ae83 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 7 Jul 2023 11:59:01 -0700 Subject: [PATCH 05/89] Add zlib to aarch build deps --- continuous_integration/recipe/meta.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index d8ee1e0d1..02e58d1fb 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -25,6 +25,7 @@ requirements: - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - libprotobuf # [build_platform != target_platform] + - zlib # [build_platform != target_platform] - {{ compiler('c') }} - {{ compiler('rust') }} host: From 68585782059ba8f3acd03df54a9b42e4a99c5b3f Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 11 Jul 2023 21:31:05 -0400 Subject: [PATCH 06/89] Bump to ADP 27 and introduce support for wildcard expressions, a wildcard expression name will be subbed with the first column in the incoming schema plan --- dask_planner/Cargo.lock | 267 +++++++++++---------- dask_planner/Cargo.toml | 4 +- dask_planner/src/expression.rs | 112 +++++---- dask_planner/src/sql.rs | 32 ++- dask_sql/physical/rel/logical/aggregate.py | 36 ++- dask_sql/physical/rex/core/call.py | 4 + tests/integration/test_join.py | 4 +- tests/integration/test_select.py | 12 + 8 files changed, 287 insertions(+), 184 deletions(-) diff --git a/dask_planner/Cargo.lock b/dask_planner/Cargo.lock index 8fdffdab2..1a75a215d 100644 --- a/dask_planner/Cargo.lock +++ b/dask_planner/Cargo.lock @@ -51,6 +51,12 @@ dependencies = [ "alloc-no-stdlib", ] +[[package]] +name = "allocator-api2" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56fc6cf8dc8c4158eed8649f9b8b0ea1518eb62b544fe9490d66fa0b349eafe9" + [[package]] name = "android-tzdata" version = "0.1.1" @@ -91,8 +97,8 @@ dependencies = [ "serde", "serde_json", "snap", - "strum", - "strum_macros", + "strum 0.24.1", + "strum_macros 0.24.3", "thiserror", "typed-builder", "uuid", @@ -119,9 +125,9 @@ checksum = "8868f09ff8cea88b079da74ae569d9b8c62a23c68c746240b704ee6f7525c89c" [[package]] name = "arrow" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6619cab21a0cdd8c9b9f1d9e09bfaa9b1974e5ef809a6566aef0b998caf38ace" +checksum = "773d18d72cd290f3f9e2149a714c8ac404b6c3fd614c684f0015449940fca899" dependencies = [ "ahash", "arrow-arith", @@ -142,9 +148,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0dc95485623a76e00929bda8caa40c1f838190952365c4f43a7b9ae86d03e94" +checksum = "93bc0da4b22ba63807fa2a74998e21209179c93c67856ae65d9218b81f3ef918" dependencies = [ "arrow-array", "arrow-buffer", @@ -157,9 +163,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3267847f53d3042473cfd2c769afd8d74a6d7d201fc3a34f5cb84c0282ef47a7" +checksum = "ea9a0fd21121304cad96f307c938d861cb1e7f0c151b93047462cd9817d760fb" dependencies = [ "ahash", "arrow-buffer", @@ -168,15 +174,15 @@ dependencies = [ "chrono", "chrono-tz", "half", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "num", ] [[package]] name = "arrow-buffer" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5f66553e66e120ac4b21570368ee9ebf35ff3f5399f872b0667699e145678f5" +checksum = "30ce342ecf5971004e23cef8b5fb3bacd2bbc48a381464144925074e1472e9eb" dependencies = [ "half", "num", @@ -184,9 +190,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "65e6f3579dbf0d97c683d451b2550062b0f0e62a3169bf74238b5f59f44ad6d8" +checksum = "4b94a0ce7d27abbb02e2ee4db770f593127610f57b32625b0bc6a1a90d65f085" dependencies = [ "arrow-array", "arrow-buffer", @@ -195,15 +201,16 @@ dependencies = [ "arrow-select", "chrono", "comfy-table", + "half", "lexical-core", "num", ] [[package]] name = "arrow-csv" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "373579c4c1a8f5307d3125b7a89c700fcf8caf85821c77eb4baab3855ae0aba5" +checksum = "0f3be10a00a43c4bf0d243c070754ebdde17c5d576b4928d9c3efbe3005a3853" dependencies = [ "arrow-array", "arrow-buffer", @@ -220,9 +227,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61bc8df9912cca6642665fdf989d6fa0de2570f18a7f709bcf59d29de96d2097" +checksum = "1d9a83dad6a53d6907765106d3bc61d6d9d313cfe1751701b3ef0948e7283dc2" dependencies = [ "arrow-buffer", "arrow-schema", @@ -232,9 +239,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0105dcf5f91daa7182d87b713ee0b32b3bfc88e0c48e7dc3e9d6f1277a07d1ae" +checksum = "a46da5e438a854e0386b38774da88a98782c0973c6dbc5c949ca4e02faf9b016" dependencies = [ "arrow-array", "arrow-buffer", @@ -246,9 +253,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e73134fb5b5ec8770f8cbb214c2c487b2d350081e403ca4eeeb6f8f5e19846ac" +checksum = "d5f27a1fbc76553ad92dc1a9583e56b7058d8c418c4089b0b689f5b87e2da5e1" dependencies = [ "arrow-array", "arrow-buffer", @@ -266,9 +273,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89f25bc66e18d4c2aa1fe2f9bb03e2269da60e636213210385ae41a107f9965a" +checksum = "f2373661f6c2233e18f6fa69c40999a9440231d1e8899be8bbbe73c7e24aa3b4" dependencies = [ "arrow-array", "arrow-buffer", @@ -281,9 +288,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1095ff85ea4f5ff02d17b30b089de31b51a50be01c6b674f0a0509ab771232f1" +checksum = "377cd5158b7de4034a175e296726c40c3236e65d71d90a5dab2fb4fab526a8f4" dependencies = [ "ahash", "arrow-array", @@ -291,23 +298,23 @@ dependencies = [ "arrow-data", "arrow-schema", "half", - "hashbrown 0.13.2", + "hashbrown 0.14.0", ] [[package]] name = "arrow-schema" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25187bbef474151a2e4ddec67b9e34bda5cbfba292dc571392fa3a1f71ff5a82" +checksum = "ba9ed245bd2d7d97ad1457cb281d4296e8b593588758b8fec6d67b2b2b0f2265" dependencies = [ "bitflags 2.3.2", ] [[package]] name = "arrow-select" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd0d4ee884aec3aa05e41478e3cd312bf609de9babb5d187a43fb45931da4da4" +checksum = "0dc9bd6aebc565b1d04bae64a0f4dda3abc677190eb7d960471b1b20e1cebed0" dependencies = [ "arrow-array", "arrow-buffer", @@ -318,9 +325,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6d71c3ffe4c07e66ce8fdc6aed5b00e0e60c5144911879b10546f5b72d8fa1c" +checksum = "23cf2baea2ef53787332050decf7d71aca836a352e188c8ad062892405955d2b" dependencies = [ "arrow-array", "arrow-buffer", @@ -539,12 +546,12 @@ dependencies = [ [[package]] name = "comfy-table" -version = "6.2.0" +version = "7.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7e959d788268e3bf9d35ace83e81b124190378e4c91c9067524675e33394b8ba" +checksum = "9ab77dbd8adecaf3f0db40581631b995f312a8a5ae3aa9993188bb8f23d83a5b" dependencies = [ - "strum", - "strum_macros", + "strum 0.24.1", + "strum_macros 0.24.3", "unicode-width", ] @@ -659,15 +666,15 @@ dependencies = [ "env_logger", "log", "pyo3", - "pyo3-build-config 0.19.1", + "pyo3-build-config", "pyo3-log", ] [[package]] name = "datafusion" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9992c267436551d40b52d65289b144712e7b0ebdc62c8c859fd1574e5f73efbb" +checksum = "e96f6e4eb10bd3e6b709686858246466983e8c5354a928ff77ee34919aa60d00" dependencies = [ "ahash", "apache-avro", @@ -690,9 +697,9 @@ dependencies = [ "flate2", "futures", "glob", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "indexmap", - "itertools", + "itertools 0.11.0", "lazy_static", "log", "num-traits", @@ -707,7 +714,6 @@ dependencies = [ "sqlparser", "tempfile", "tokio", - "tokio-stream", "tokio-util", "url", "uuid", @@ -717,9 +723,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3be97f7a7c720cdbb71e9eeabf814fa6ad8102b9022390f6cac74d3b4af6392" +checksum = "00e5fddcc0dd49bbe199e43aa406f39c46c790bb2a43c7b36a478e5f3f971235" dependencies = [ "apache-avro", "arrow", @@ -734,14 +740,14 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c77c4b14b809b0e4c5bb101b6834504f06cdbb0d3c643400c61d0d844b33264e" +checksum = "cfd50b6cb17acc78d2473c0d28014b8fd4e2e0a2c067c07645d6547b33b0aeeb" dependencies = [ "dashmap", "datafusion-common", "datafusion-expr", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "log", "object_store", "parking_lot", @@ -752,24 +758,24 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ec7409bd45cf4fae6395d7d1024c8a97e543cadc88363e405d2aad5330e5e7" +checksum = "e1a35dc2cd9eac18063d636f7ddf4f090fe1f34284d80192ac7ade38cc3c6991" dependencies = [ "ahash", "arrow", "datafusion-common", "lazy_static", "sqlparser", - "strum", - "strum_macros", + "strum 0.25.0", + "strum_macros 0.25.1", ] [[package]] name = "datafusion-optimizer" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64b537c93f87989c212db92a448a0f5eb4f0995e27199bb7687ae94f8b64a7a8" +checksum = "5f5043afeb45ec1c0f45519e1eed6a477f2d30732e8f975d9cf9a75fba0ca716" dependencies = [ "arrow", "async-trait", @@ -777,17 +783,17 @@ dependencies = [ "datafusion-common", "datafusion-expr", "datafusion-physical-expr", - "hashbrown 0.13.2", - "itertools", + "hashbrown 0.14.0", + "itertools 0.11.0", "log", "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f60ee3f53340fdef36ee54d9e12d446ae2718b1d0196ac581f791d34808ec876" +checksum = "6cc892a24f4b829ee7718ad3950884c0346dbdf1517f3df153af4bcf54d8ca4d" dependencies = [ "ahash", "arrow", @@ -801,9 +807,9 @@ dependencies = [ "datafusion-expr", "datafusion-row", "half", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "indexmap", - "itertools", + "itertools 0.11.0", "lazy_static", "libc", "md-5", @@ -818,9 +824,9 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d406c7f116547044c2039be6d055c19c680a4ab8b1a550f0403c0ae276dff3c5" +checksum = "3311b157d1afe2a363d37a5ccb675384aa76e6033572ef9246f8af1579e6f0b2" dependencies = [ "async-trait", "datafusion", @@ -836,7 +842,7 @@ dependencies = [ "prost", "prost-types", "pyo3", - "pyo3-build-config 0.18.3", + "pyo3-build-config", "rand", "regex-syntax", "syn 2.0.23", @@ -847,9 +853,9 @@ dependencies = [ [[package]] name = "datafusion-row" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d58fc64058aa3bcb00077a0d19474a0d584d31dec8c7ac3406868f485f659af9" +checksum = "ce75c660bbddfdd254109e668e5b5bd69df31ea26e3768e15cef0c68015e650e" dependencies = [ "arrow", "datafusion-common", @@ -859,9 +865,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1531f0314151a34bf6c0a83c7261525688b7c729876f53e7896b8f4ca8f57d07" +checksum = "49cab87e4933a452e0b7b3f0cbd0e760daf7d33fb54d09d70d3ffba229eaa652" dependencies = [ "arrow", "arrow-schema", @@ -873,14 +879,14 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "26.0.0" +version = "27.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "079d5be5ec59580777bfa16d79187fea99b6498e3e8e07eb36d504a5fe708f13" +checksum = "ba77d22232053f6cdd98bd6f5328940850844450253f25b8c50bfc5199c505d4" dependencies = [ "async-recursion", "chrono", "datafusion", - "itertools", + "itertools 0.11.0", "object_store", "prost", "prost-types", @@ -1183,6 +1189,16 @@ dependencies = [ "ahash", ] +[[package]] +name = "hashbrown" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" +dependencies = [ + "ahash", + "allocator-api2", +] + [[package]] name = "heck" version = "0.4.1" @@ -1383,6 +1399,15 @@ dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.6" @@ -1613,9 +1638,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memoffset" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1" +checksum = "5a634b1c61a95585bd15607c6ab0c4e5b226e695ff2800ba0cdccddf208c406c" dependencies = [ "autocfg", ] @@ -1750,16 +1775,18 @@ dependencies = [ [[package]] name = "object_store" -version = "0.5.6" +version = "0.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec9cd6ca25e796a49fa242876d1c4de36a24a6da5258e9f0bc062dbf5e81c53b" +checksum = "27c776db4f332b571958444982ff641d2531417a326ca368995073b639205d58" dependencies = [ "async-trait", "base64", "bytes", "chrono", "futures", - "itertools", + "humantime", + "hyper", + "itertools 0.10.5", "parking_lot", "percent-encoding", "quick-xml", @@ -1816,9 +1843,9 @@ dependencies = [ [[package]] name = "parquet" -version = "40.0.0" +version = "42.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d6a656fcc17e641657c955742c689732684e096f790ff30865d9f8dcc39f7c4a" +checksum = "baab9c36b1c8300b81b4d577d306a0a733f9d34021363098d3548e37757ed6c8" dependencies = [ "ahash", "arrow-array", @@ -1834,7 +1861,7 @@ dependencies = [ "chrono", "flate2", "futures", - "hashbrown 0.13.2", + "hashbrown 0.14.0", "lz4", "num", "num-bigint", @@ -1984,7 +2011,7 @@ checksum = "119533552c9a7ffacc21e099c24a0ac8bb19c2a2a3f363de84cd9b844feab270" dependencies = [ "bytes", "heck", - "itertools", + "itertools 0.10.5", "lazy_static", "log", "multimap", @@ -2003,7 +2030,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" dependencies = [ "anyhow", - "itertools", + "itertools 0.10.5", "proc-macro2", "quote", "syn 1.0.109", @@ -2020,31 +2047,21 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3b1ac5b3731ba34fdaa9785f8d74d17448cd18f30cf19e0c7e7b1fdb5272109" +checksum = "ffb88ae05f306b4bfcde40ac4a51dc0b05936a9207a4b75b798c7729c4258a59" dependencies = [ "cfg-if", "indoc", "libc", "memoffset", "parking_lot", - "pyo3-build-config 0.18.3", + "pyo3-build-config", "pyo3-ffi", "pyo3-macros", "unindent", ] -[[package]] -name = "pyo3-build-config" -version = "0.18.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cb946f5ac61bb61a5014924910d936ebd2b23b705f7a4a3c40b05c720b079a3" -dependencies = [ - "once_cell", - "target-lexicon", -] - [[package]] name = "pyo3-build-config" version = "0.19.1" @@ -2057,12 +2074,12 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd4d7c5337821916ea2a1d21d1092e8443cf34879e53a0ac653fbb98f44ff65c" +checksum = "922ede8759e8600ad4da3195ae41259654b9c55da4f7eec84a0ccc7d067a70a4" dependencies = [ "libc", - "pyo3-build-config 0.18.3", + "pyo3-build-config", ] [[package]] @@ -2078,9 +2095,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9d39c55dab3fc5a4b25bbd1ac10a2da452c4aca13bb450f22818a002e29648d" +checksum = "8a5caec6a1dd355964a841fcbeeb1b89fe4146c87295573f94228911af3cc5a2" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -2090,9 +2107,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.18.3" +version = "0.19.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97daff08a4c48320587b5224cc98d609e3c27b6d437315bd40b605c98eeb5918" +checksum = "e0b78ccbb160db1556cdb6fd96c50334c5d4ec44dc5e0a968d0a1208fa0efa8b" dependencies = [ "proc-macro2", "quote", @@ -2536,9 +2553,9 @@ checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" [[package]] name = "sqlparser" -version = "0.34.0" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37d3706eefb17039056234df6b566b0014f303f867f2656108334a55b8096f59" +checksum = "ca597d77c98894be1f965f2e4e2d2a61575d4998088e655476c73715c54b2b43" dependencies = [ "log", "sqlparser_derive", @@ -2566,8 +2583,14 @@ name = "strum" version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "063e6045c0e62079840579a7e47a355ae92f60eb74daaf156fb1e84ba164e63f" + +[[package]] +name = "strum" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290d54ea6f91c969195bdbcd7442c8c2a2ba87da8bf60a7ee86a235d4bc1e125" dependencies = [ - "strum_macros", + "strum_macros 0.25.1", ] [[package]] @@ -2583,11 +2606,24 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "strum_macros" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6069ca09d878a33f883cc06aaa9718ede171841d3832450354410b718b097232" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.23", +] + [[package]] name = "substrait" -version = "0.10.0" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9df5d9e071804204172dc77e707c363f187e7f6566f9c78e5100c9a8f5ea434e" +checksum = "7d3b77ddddd080d1bb5ebfe6b62d1c4e2f33c9f6a4586d5eac5306a08f3d4585" dependencies = [ "git2", "heck", @@ -2756,17 +2792,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-stream" -version = "0.1.14" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" -dependencies = [ - "futures-core", - "pin-project-lite", - "tokio", -] - [[package]] name = "tokio-util" version = "0.7.8" @@ -2854,9 +2879,9 @@ checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba" [[package]] name = "typify" -version = "0.0.12" +version = "0.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6658d09e71bfe59e7987dc95ee7f71809fdb5793ab0cdc1503cc0073990484d" +checksum = "be9bb640c0eece20cac2028ebbc2ca1a3d17e3b1ddd98540309c309ed178d158" dependencies = [ "typify-impl", "typify-macro", @@ -2864,9 +2889,9 @@ dependencies = [ [[package]] name = "typify-impl" -version = "0.0.12" +version = "0.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34d3bb47587b13edf526d6ed02bf360ecefe083ab47a4ef29fc43112828b2bef" +checksum = "5c8d9ecedde2fd77e975c38eeb9ca40b34ad0247b2259c6e6bbd2a8d6cc2444f" dependencies = [ "heck", "log", @@ -2882,9 +2907,9 @@ dependencies = [ [[package]] name = "typify-macro" -version = "0.0.12" +version = "0.0.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3f7e627c18be12d53bc1f261830b9c2763437b6a86ac57293b9085af2d32ffe" +checksum = "c08942cd65d458d2da15777a649cb6400cb545f17964f1ca965583f22e9cc3a9" dependencies = [ "proc-macro2", "quote", diff --git a/dask_planner/Cargo.toml b/dask_planner/Cargo.toml index 55d3e0882..8a849628b 100644 --- a/dask_planner/Cargo.toml +++ b/dask_planner/Cargo.toml @@ -10,10 +10,10 @@ rust-version = "1.65" [dependencies] async-trait = "0.1.71" -datafusion-python = "26.0.0" +datafusion-python = "27.0.0" env_logger = "0.10" log = "^0.4" -pyo3 = { version = "0.18.3", features = ["extension-module", "abi3", "abi3-py38"] } +pyo3 = { version = "0.19.0", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-log = "0.8.2" [build-dependencies] diff --git a/dask_planner/src/expression.rs b/dask_planner/src/expression.rs index 80e7b88fa..d13f66e89 100644 --- a/dask_planner/src/expression.rs +++ b/dask_planner/src/expression.rs @@ -104,9 +104,10 @@ impl PyExpr { fn _rex_type(&self, expr: &Expr) -> RexType { match expr { Expr::Alias(..) => RexType::Alias, - Expr::Column(..) | Expr::QualifiedWildcard { .. } | Expr::GetIndexedField { .. } => { - RexType::Reference - } + Expr::Column(..) + | Expr::QualifiedWildcard { .. } + | Expr::GetIndexedField { .. } + | Expr::Wildcard => RexType::Reference, Expr::ScalarVariable(..) | Expr::Literal(..) => RexType::Literal, Expr::BinaryExpr { .. } | Expr::Not(..) @@ -126,7 +127,6 @@ impl PyExpr { | Expr::WindowFunction { .. } | Expr::AggregateUDF { .. } | Expr::InList { .. } - | Expr::Wildcard | Expr::ScalarUDF { .. } | Expr::Exists { .. } | Expr::InSubquery { .. } @@ -197,49 +197,54 @@ impl PyExpr { schema.merge(plan.schema().as_ref()); } let name = get_expr_name(&self.expr).map_err(py_runtime_err)?; - schema - .index_of_column(&Column::from_qualified_name(name.clone())) - .or_else(|_| { - // Handles cases when from_qualified_name doesn't format the Column correctly. - // "name" will always contain the name of the column. Anything in addition to - // that will be separated by a '.' and should be further referenced. - let parts = name.split('.').collect::>(); - let tbl_reference = match parts.len() { - // Single element means name contains just the column name so no TableReference - 1 => None, - // Tablename.column_name - 2 => Some( - TableReference::Bare { - table: Cow::Borrowed(parts[0]), - } - .to_owned_reference(), - ), - // Schema_name.table_name.column_name - 3 => Some( - TableReference::Partial { - schema: Cow::Borrowed(parts[0]), - table: Cow::Borrowed(parts[1]), - } - .to_owned_reference(), - ), - // catalog_name.schema_name.table_name.column_name - 4 => Some( - TableReference::Full { - catalog: Cow::Borrowed(parts[0]), - schema: Cow::Borrowed(parts[1]), - table: Cow::Borrowed(parts[2]), - } - .to_owned_reference(), - ), - _ => None, - }; - - let col = Column { - relation: tbl_reference.clone(), - name: parts[parts.len() - 1].to_string(), - }; - schema.index_of_column(&col).map_err(py_runtime_err) - }) + if name != "*" { + schema + .index_of_column(&Column::from_qualified_name(name.clone())) + .or_else(|_| { + // Handles cases when from_qualified_name doesn't format the Column correctly. + // "name" will always contain the name of the column. Anything in addition to + // that will be separated by a '.' and should be further referenced. + let parts = name.split('.').collect::>(); + let tbl_reference = match parts.len() { + // Single element means name contains just the column name so no TableReference + 1 => None, + // Tablename.column_name + 2 => Some( + TableReference::Bare { + table: Cow::Borrowed(parts[0]), + } + .to_owned_reference(), + ), + // Schema_name.table_name.column_name + 3 => Some( + TableReference::Partial { + schema: Cow::Borrowed(parts[0]), + table: Cow::Borrowed(parts[1]), + } + .to_owned_reference(), + ), + // catalog_name.schema_name.table_name.column_name + 4 => Some( + TableReference::Full { + catalog: Cow::Borrowed(parts[0]), + schema: Cow::Borrowed(parts[1]), + table: Cow::Borrowed(parts[2]), + } + .to_owned_reference(), + ), + _ => None, + }; + + let col = Column { + relation: tbl_reference.clone(), + name: parts[parts.len() - 1].to_string(), + }; + schema.index_of_column(&col).map_err(py_runtime_err) + }) + } else { + // Since this is wildcard any Column will do, just use first one + Ok(0) + } } _ => Err(py_runtime_err( "We need a valid LogicalPlan instance to get the Expr's index in the schema", @@ -425,11 +430,14 @@ impl PyExpr { PyExpr::from(*low.clone(), self.input_plan.clone()), PyExpr::from(*high.clone(), self.input_plan.clone()), ]), + Expr::Wildcard => Ok(vec![PyExpr::from( + self.expr.clone(), + self.input_plan.clone(), + )]), // Currently un-support/implemented Expr types for Rex Call operations Expr::GroupingSet(..) | Expr::OuterReferenceColumn(_, _) - | Expr::Wildcard | Expr::QualifiedWildcard { .. } | Expr::ScalarSubquery(..) | Expr::Placeholder { .. } @@ -894,6 +902,10 @@ fn unexpected_literal_value(value: &ScalarValue) -> PyErr { fn get_expr_name(expr: &Expr) -> Result { match expr { Expr::Alias(expr, _) => get_expr_name(expr), + Expr::Wildcard => { + // 'Wildcard' means any and all columns. We get the first valid column name here + Ok("*".to_owned()) + } _ => Ok(expr.canonical_name()), } } @@ -906,6 +918,10 @@ pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> Result { // appear in projections) so we just delegate to the contained expression instead expr_to_field(expr, input_plan) } + Expr::Wildcard => { + // Any column will do. We use the first column to keep things consistent + Ok(input_plan.schema().field(0).clone()) + } _ => { let fields = exprlist_to_fields(&[expr.clone()], input_plan).map_err(DaskPlannerError::from)?; diff --git a/dask_planner/src/sql.rs b/dask_planner/src/sql.rs index a0e238727..39d4614d4 100644 --- a/dask_planner/src/sql.rs +++ b/dask_planner/src/sql.rs @@ -21,7 +21,7 @@ use datafusion_python::{ }, datafusion_expr::{ logical_plan::Extension, - AccumulatorFunctionImplementation, + AccumulatorFactoryFunction, AggregateUDF, LogicalPlan, ReturnTypeFunction, @@ -385,7 +385,7 @@ impl ContextProvider for DaskSQLContext { } fn get_aggregate_meta(&self, name: &str) -> Option> { - let acc: AccumulatorFunctionImplementation = + let acc: AccumulatorFactoryFunction = Arc::new(|_return_type| Err(DataFusionError::NotImplemented("".to_string()))); let st: StateTypeFunction = @@ -478,6 +478,13 @@ impl ContextProvider for DaskSQLContext { fn options(&self) -> &ConfigOptions { &self.options } + + fn get_window_meta( + &self, + _name: &str, + ) -> Option> { + unimplemented!("RUST: get_window_meta is not yet implemented for DaskSQLContext") + } } #[pymethods] @@ -592,14 +599,19 @@ impl DaskSQLContext { current_node: None, }) .map_err(py_optimization_exp); - if self.dynamic_partition_pruning { - optimizer::DaskSqlOptimizer::dynamic_partition_pruner() - .optimize_once(optimized_plan.unwrap().original_plan) - .map(|k| PyLogicalPlan { - original_plan: k, - current_node: None, - }) - .map_err(py_optimization_exp) + + if let Ok(optimized_plan) = optimized_plan { + if self.dynamic_partition_pruning { + optimizer::DaskSqlOptimizer::dynamic_partition_pruner() + .optimize_once(optimized_plan.original_plan) + .map(|k| PyLogicalPlan { + original_plan: k, + current_node: None, + }) + .map_err(py_optimization_exp) + } else { + Ok(optimized_plan) + } } else { optimized_plan } diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index 84c832177..e6b6ed30b 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -127,6 +127,7 @@ class DaskAggregatePlugin(BaseRelPlugin): "avg": AggregationSpecification("mean", AggregationOnPandas("mean")), "stddev": AggregationSpecification("std", AggregationOnPandas("std")), "stddevsamp": AggregationSpecification("std", AggregationOnPandas("std")), + "stddev_samp": AggregationSpecification("std", AggregationOnPandas("std")), "stddevpop": AggregationSpecification( dd.Aggregation( "stddevpop", @@ -142,6 +143,21 @@ class DaskAggregatePlugin(BaseRelPlugin): ** (1 / 2), ) ), + "stddev_pop": AggregationSpecification( + dd.Aggregation( + "stddev_pop", + lambda s: (s.count(), s.sum(), s.agg(lambda x: (x**2).sum())), + lambda count, sum, sum_of_squares: ( + count.sum(), + sum.sum(), + sum_of_squares.sum(), + ), + lambda count, sum, sum_of_squares: ( + (sum_of_squares / count) - (sum / count) ** 2 + ) + ** (1 / 2), + ) + ), "bit_and": AggregationSpecification( ReduceAggregation("bit_and", operator.and_) ), @@ -198,6 +214,20 @@ class DaskAggregatePlugin(BaseRelPlugin): ), ) ), + "variance_pop": AggregationSpecification( + dd.Aggregation( + "variance_pop", + lambda s: (s.count(), s.sum(), s.agg(lambda x: (x**2).sum())), + lambda count, sum, sum_of_squares: ( + count.sum(), + sum.sum(), + sum_of_squares.sum(), + ), + lambda count, sum, sum_of_squares: ( + (sum_of_squares / count) - (sum / count) ** 2 + ), + ) + ), } def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContainer: @@ -378,7 +408,11 @@ def _collect_aggregations( "AggregateUDF", }, "Do not know how to handle this case!" for input_expr in agg.getArgs(expr): - input_col = input_expr.column_name(input_rel) + # Wildcard expr + if input_expr.toString() != "*": + input_col = input_expr.column_name(input_rel) + else: + input_col = None if input_col not in cc._frontend_backend_mapping: random_name = new_temporary_column(df) new_columns[random_name] = RexConverter.convert( diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 85d083d78..5ef1d7fb8 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -1077,6 +1077,9 @@ class RexCallPlugin(BaseRexPlugin): "characterlength": TensorScalarOperation( lambda x: x.str.len(), lambda x: len(x) ), + "character_length": TensorScalarOperation( + lambda x: x.str.len(), lambda x: len(x) + ), "upper": TensorScalarOperation(lambda x: x.str.upper(), lambda x: x.upper()), "lower": TensorScalarOperation(lambda x: x.str.lower(), lambda x: x.lower()), "position": PositionOperation(), @@ -1104,6 +1107,7 @@ class RexCallPlugin(BaseRexPlugin): "dsql_totimestamp": ToTimestampOperation(), # Temporary UDF functions that need to be moved after this POC "datepart": ExtractOperation(), + "date_part": ExtractOperation(), "year": YearOperation(), "timestampadd": TimeStampAddOperation(), "timestampceil": CeilFloorOperation("ceil"), diff --git a/tests/integration/test_join.py b/tests/integration/test_join.py index c46cec101..3f19a3211 100644 --- a/tests/integration/test_join.py +++ b/tests/integration/test_join.py @@ -377,7 +377,7 @@ def test_intersect(c): limit 100 """ ) - assert actual_df["COUNT(UInt8(1))"].compute()[0] == 3 + assert actual_df["COUNT(*)"].compute()[0] == 3 # Join df_simple against itself, and then that result against df_wide. Nothing should match so therefore result should be 0 actual_df = c.sql( @@ -392,7 +392,7 @@ def test_intersect(c): limit 100 """ ) - assert len(actual_df["COUNT(UInt8(1))"]) == 0 + assert len(actual_df["COUNT(*)"]) == 0 actual_df = c.sql( """ diff --git a/tests/integration/test_select.py b/tests/integration/test_select.py index 9c4331d77..53ebdc224 100644 --- a/tests/integration/test_select.py +++ b/tests/integration/test_select.py @@ -272,3 +272,15 @@ def test_multiple_column_projection(c, parquet_ddf, input_cols): "read-parquet", ).columns ) == sorted(input_cols) + + +def test_wildcard_select(c): + result_df = c.sql("SELECT COUNT(*) FROM df") + + expected_df = pd.DataFrame( + { + "COUNT(*)": [700], + } + ) + + assert_eq(result_df, expected_df) From 24e0f90c584478fc6ef936020c4851387a4f784a Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 11 Jul 2023 21:40:38 -0400 Subject: [PATCH 07/89] remove bit of logic that is no longer needed to manually check the wildcard 'name' as a '*' --- dask_sql/physical/rel/logical/aggregate.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index e6b6ed30b..a14900f99 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -408,11 +408,7 @@ def _collect_aggregations( "AggregateUDF", }, "Do not know how to handle this case!" for input_expr in agg.getArgs(expr): - # Wildcard expr - if input_expr.toString() != "*": - input_col = input_expr.column_name(input_rel) - else: - input_col = None + input_col = input_expr.column_name(input_rel) if input_col not in cc._frontend_backend_mapping: random_name = new_temporary_column(df) new_columns[random_name] = RexConverter.convert( From d776229053e2a3f902ef2786cbdfc6f6ab9825ca Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 12 Jul 2023 08:30:57 -0400 Subject: [PATCH 08/89] experiment with removing zlib, hoping that fixes os x build --- continuous_integration/recipe/meta.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 02e58d1fb..5152cfc4e 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -25,7 +25,6 @@ requirements: - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - libprotobuf # [build_platform != target_platform] - - zlib # [build_platform != target_platform] - {{ compiler('c') }} - {{ compiler('rust') }} host: @@ -33,7 +32,6 @@ requirements: - python - setuptools-rust - libprotobuf - - zlib run: - python - dask >=2022.3.0 From 99ec8010592a2854de21d58e0fc6b3172246f9a2 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 12 Jul 2023 08:47:59 -0400 Subject: [PATCH 09/89] Change expected_df result to 1.5 from 1. 3/2 is in fact 1.5 and not 1 --- tests/integration/test_rex.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index b49a687d2..3f720e6d0 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -419,7 +419,7 @@ def test_coalesce(c, gpu): "c2": [np.nan], "c3": ["hi"], "c4": ["bye"], - "c5": ["1"], + "c5": ["1.5"], "c6": ["why"], "c7": [2.0], } From 8997f7f739c45f1b74ea024729799a56b1678bed Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 12 Jul 2023 09:03:39 -0400 Subject: [PATCH 10/89] Fix cargo test --- continuous_integration/recipe/meta.yaml | 2 ++ dask_planner/src/sql/optimizer.rs | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 5152cfc4e..02e58d1fb 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -25,6 +25,7 @@ requirements: - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] - libprotobuf # [build_platform != target_platform] + - zlib # [build_platform != target_platform] - {{ compiler('c') }} - {{ compiler('rust') }} host: @@ -32,6 +33,7 @@ requirements: - python - setuptools-rust - libprotobuf + - zlib run: - python - dask >=2022.3.0 diff --git a/dask_planner/src/sql/optimizer.rs b/dask_planner/src/sql/optimizer.rs index 8e8bc9235..a5957ac98 100644 --- a/dask_planner/src/sql/optimizer.rs +++ b/dask_planner/src/sql/optimizer.rs @@ -230,6 +230,13 @@ mod tests { fn get_variable_type(&self, _variable_names: &[String]) -> Option { None } + + fn get_window_meta( + &self, + _name: &str, + ) -> Option> { + None + } } struct MyTableSource { From 379a97877744488ec3c4f91cdcfc05ea9b0c338c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Thu, 13 Jul 2023 13:47:19 -0400 Subject: [PATCH 11/89] add .cargo/config.toml in hopes of fixing linker build issues on osx --- .cargo/config.toml | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml new file mode 100644 index 000000000..d47f983e4 --- /dev/null +++ b/.cargo/config.toml @@ -0,0 +1,11 @@ +[target.x86_64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] + +[target.aarch64-apple-darwin] +rustflags = [ + "-C", "link-arg=-undefined", + "-C", "link-arg=dynamic_lookup", +] From e030befbedaacddd030a560ff295094e601a0a52 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 13 Jul 2023 17:28:58 -0400 Subject: [PATCH 12/89] Remove extra config.toml --- .cargo/config.toml | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 .cargo/config.toml diff --git a/.cargo/config.toml b/.cargo/config.toml deleted file mode 100644 index d47f983e4..000000000 --- a/.cargo/config.toml +++ /dev/null @@ -1,11 +0,0 @@ -[target.x86_64-apple-darwin] -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] - -[target.aarch64-apple-darwin] -rustflags = [ - "-C", "link-arg=-undefined", - "-C", "link-arg=dynamic_lookup", -] From b2e85dfc5ac3f1a42c30adab2583fee89aec3c86 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 14 Jul 2023 11:57:01 -0400 Subject: [PATCH 13/89] Try overriding runner-installed toolchain --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index be2d98126..c37e14244 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,6 +55,11 @@ jobs: distributed: true steps: - uses: actions/checkout@v3 + - name: Set up Rust toolchain + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true - name: Set up Python uses: conda-incubator/setup-miniconda@v2.2.0 with: From d01088d6383b0fdd2f787a07e0d2c4a9353833ea Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 14 Jul 2023 12:32:29 -0400 Subject: [PATCH 14/89] Revert "Try overriding runner-installed toolchain" This reverts commit b2e85dfc5ac3f1a42c30adab2583fee89aec3c86. --- .github/workflows/test.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c37e14244..be2d98126 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -55,11 +55,6 @@ jobs: distributed: true steps: - uses: actions/checkout@v3 - - name: Set up Rust toolchain - uses: actions-rs/toolchain@v1 - with: - toolchain: stable - override: true - name: Set up Python uses: conda-incubator/setup-miniconda@v2.2.0 with: From ca70f0f76ca0ab34591c3679385acf52d6cd6136 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 09:51:03 -0400 Subject: [PATCH 15/89] Initial migration to maturin build system --- {dask_planner/.cargo => .cargo}/config.toml | 0 .gitignore | 10 +-- .pre-commit-config.yaml | 6 +- dask_planner/Cargo.lock => Cargo.lock | 2 +- dask_planner/Cargo.toml => Cargo.toml | 14 ++-- .../scripts}/update-dependencies.sh | 0 dask_planner/.classpath | 55 -------------- dask_planner/.gitignore | 72 ------------------- .../org.eclipse.core.resources.prefs | 5 -- .../.settings/org.eclipse.jdt.apt.core.prefs | 2 - .../.settings/org.eclipse.jdt.core.prefs | 9 --- .../.settings/org.eclipse.m2e.core.prefs | 4 -- dask_planner/MANIFEST.in | 2 - dask_planner/README.md | 0 dask_planner/pyproject.toml | 11 --- dask_sql/context.py | 3 +- dask_sql/input_utils/hive.py | 3 +- dask_sql/mappings.py | 3 +- dask_sql/physical/rel/base.py | 3 +- dask_sql/physical/rel/convert.py | 3 +- dask_sql/physical/rel/custom/alter.py | 3 +- dask_sql/physical/rel/custom/analyze_table.py | 3 +- .../rel/custom/create_catalog_schema.py | 3 +- dask_sql/physical/rel/custom/create_table.py | 3 +- .../physical/rel/custom/describe_model.py | 3 +- dask_sql/physical/rel/custom/distributeby.py | 3 +- dask_sql/physical/rel/custom/drop_schema.py | 3 +- dask_sql/physical/rel/custom/export_model.py | 3 +- dask_sql/physical/rel/custom/predict_model.py | 3 +- dask_sql/physical/rel/custom/show_models.py | 3 +- dask_sql/physical/rel/custom/use_schema.py | 3 +- dask_sql/physical/rel/logical/aggregate.py | 3 +- dask_sql/physical/rel/logical/cross_join.py | 3 +- dask_sql/physical/rel/logical/empty.py | 3 +- dask_sql/physical/rel/logical/explain.py | 3 +- dask_sql/physical/rel/logical/filter.py | 3 +- dask_sql/physical/rel/logical/join.py | 3 +- dask_sql/physical/rel/logical/limit.py | 3 +- dask_sql/physical/rel/logical/project.py | 6 +- dask_sql/physical/rel/logical/sort.py | 3 +- .../physical/rel/logical/subquery_alias.py | 3 +- dask_sql/physical/rel/logical/table_scan.py | 3 +- dask_sql/physical/rel/logical/union.py | 3 +- dask_sql/physical/rel/logical/window.py | 3 +- dask_sql/physical/rex/base.py | 2 +- dask_sql/physical/rex/convert.py | 3 +- dask_sql/physical/rex/core/alias.py | 3 +- dask_sql/physical/rex/core/call.py | 5 +- dask_sql/physical/rex/core/input_ref.py | 3 +- dask_sql/physical/rex/core/literal.py | 5 +- dask_sql/physical/rex/core/subquery.py | 3 +- dask_sql/utils.py | 2 +- pyproject.toml | 11 +-- setup.py | 9 --- {dask_planner/src => src}/dialect.rs | 0 {dask_planner/src => src}/error.rs | 0 {dask_planner/src => src}/expression.rs | 0 {dask_planner/src => src}/lib.rs | 2 +- {dask_planner/src => src}/parser.rs | 0 {dask_planner/src => src}/sql.rs | 0 {dask_planner/src => src}/sql/column.rs | 0 {dask_planner/src => src}/sql/exceptions.rs | 0 {dask_planner/src => src}/sql/function.rs | 0 {dask_planner/src => src}/sql/logical.rs | 0 .../src => src}/sql/logical/aggregate.rs | 0 .../src => src}/sql/logical/alter_schema.rs | 0 .../src => src}/sql/logical/alter_table.rs | 0 .../src => src}/sql/logical/analyze_table.rs | 0 .../sql/logical/create_catalog_schema.rs | 0 .../sql/logical/create_experiment.rs | 0 .../sql/logical/create_memory_table.rs | 0 .../src => src}/sql/logical/create_model.rs | 0 .../src => src}/sql/logical/create_table.rs | 0 .../src => src}/sql/logical/describe_model.rs | 0 .../src => src}/sql/logical/drop_model.rs | 0 .../src => src}/sql/logical/drop_schema.rs | 0 .../src => src}/sql/logical/drop_table.rs | 0 .../src => src}/sql/logical/empty_relation.rs | 0 .../src => src}/sql/logical/explain.rs | 0 .../src => src}/sql/logical/export_model.rs | 0 .../src => src}/sql/logical/filter.rs | 0 {dask_planner/src => src}/sql/logical/join.rs | 0 .../src => src}/sql/logical/limit.rs | 0 .../src => src}/sql/logical/predict_model.rs | 0 .../src => src}/sql/logical/projection.rs | 0 .../src => src}/sql/logical/repartition_by.rs | 0 .../src => src}/sql/logical/show_columns.rs | 0 .../src => src}/sql/logical/show_models.rs | 0 .../src => src}/sql/logical/show_schemas.rs | 0 .../src => src}/sql/logical/show_tables.rs | 0 {dask_planner/src => src}/sql/logical/sort.rs | 0 .../src => src}/sql/logical/subquery_alias.rs | 0 .../src => src}/sql/logical/table_scan.rs | 0 .../src => src}/sql/logical/use_schema.rs | 0 .../src => src}/sql/logical/window.rs | 0 {dask_planner/src => src}/sql/optimizer.rs | 0 .../optimizer/dynamic_partition_pruning.rs | 0 .../src => src}/sql/optimizer/join_reorder.rs | 0 {dask_planner/src => src}/sql/parser_utils.rs | 0 {dask_planner/src => src}/sql/schema.rs | 0 {dask_planner/src => src}/sql/statement.rs | 0 {dask_planner/src => src}/sql/table.rs | 0 {dask_planner/src => src}/sql/types.rs | 0 .../src => src}/sql/types/rel_data_type.rs | 0 .../sql/types/rel_data_type_field.rs | 0 tests/unit/test_mapping.py | 2 +- 106 files changed, 95 insertions(+), 237 deletions(-) rename {dask_planner/.cargo => .cargo}/config.toml (100%) rename dask_planner/Cargo.lock => Cargo.lock (99%) rename dask_planner/Cargo.toml => Cargo.toml (83%) rename {dask_planner => continuous_integration/scripts}/update-dependencies.sh (100%) delete mode 100644 dask_planner/.classpath delete mode 100644 dask_planner/.gitignore delete mode 100644 dask_planner/.settings/org.eclipse.core.resources.prefs delete mode 100644 dask_planner/.settings/org.eclipse.jdt.apt.core.prefs delete mode 100644 dask_planner/.settings/org.eclipse.jdt.core.prefs delete mode 100644 dask_planner/.settings/org.eclipse.m2e.core.prefs delete mode 100644 dask_planner/MANIFEST.in delete mode 100644 dask_planner/README.md delete mode 100644 dask_planner/pyproject.toml rename {dask_planner/src => src}/dialect.rs (100%) rename {dask_planner/src => src}/error.rs (100%) rename {dask_planner/src => src}/expression.rs (100%) rename {dask_planner/src => src}/lib.rs (97%) rename {dask_planner/src => src}/parser.rs (100%) rename {dask_planner/src => src}/sql.rs (100%) rename {dask_planner/src => src}/sql/column.rs (100%) rename {dask_planner/src => src}/sql/exceptions.rs (100%) rename {dask_planner/src => src}/sql/function.rs (100%) rename {dask_planner/src => src}/sql/logical.rs (100%) rename {dask_planner/src => src}/sql/logical/aggregate.rs (100%) rename {dask_planner/src => src}/sql/logical/alter_schema.rs (100%) rename {dask_planner/src => src}/sql/logical/alter_table.rs (100%) rename {dask_planner/src => src}/sql/logical/analyze_table.rs (100%) rename {dask_planner/src => src}/sql/logical/create_catalog_schema.rs (100%) rename {dask_planner/src => src}/sql/logical/create_experiment.rs (100%) rename {dask_planner/src => src}/sql/logical/create_memory_table.rs (100%) rename {dask_planner/src => src}/sql/logical/create_model.rs (100%) rename {dask_planner/src => src}/sql/logical/create_table.rs (100%) rename {dask_planner/src => src}/sql/logical/describe_model.rs (100%) rename {dask_planner/src => src}/sql/logical/drop_model.rs (100%) rename {dask_planner/src => src}/sql/logical/drop_schema.rs (100%) rename {dask_planner/src => src}/sql/logical/drop_table.rs (100%) rename {dask_planner/src => src}/sql/logical/empty_relation.rs (100%) rename {dask_planner/src => src}/sql/logical/explain.rs (100%) rename {dask_planner/src => src}/sql/logical/export_model.rs (100%) rename {dask_planner/src => src}/sql/logical/filter.rs (100%) rename {dask_planner/src => src}/sql/logical/join.rs (100%) rename {dask_planner/src => src}/sql/logical/limit.rs (100%) rename {dask_planner/src => src}/sql/logical/predict_model.rs (100%) rename {dask_planner/src => src}/sql/logical/projection.rs (100%) rename {dask_planner/src => src}/sql/logical/repartition_by.rs (100%) rename {dask_planner/src => src}/sql/logical/show_columns.rs (100%) rename {dask_planner/src => src}/sql/logical/show_models.rs (100%) rename {dask_planner/src => src}/sql/logical/show_schemas.rs (100%) rename {dask_planner/src => src}/sql/logical/show_tables.rs (100%) rename {dask_planner/src => src}/sql/logical/sort.rs (100%) rename {dask_planner/src => src}/sql/logical/subquery_alias.rs (100%) rename {dask_planner/src => src}/sql/logical/table_scan.rs (100%) rename {dask_planner/src => src}/sql/logical/use_schema.rs (100%) rename {dask_planner/src => src}/sql/logical/window.rs (100%) rename {dask_planner/src => src}/sql/optimizer.rs (100%) rename {dask_planner/src => src}/sql/optimizer/dynamic_partition_pruning.rs (100%) rename {dask_planner/src => src}/sql/optimizer/join_reorder.rs (100%) rename {dask_planner/src => src}/sql/parser_utils.rs (100%) rename {dask_planner/src => src}/sql/schema.rs (100%) rename {dask_planner/src => src}/sql/statement.rs (100%) rename {dask_planner/src => src}/sql/table.rs (100%) rename {dask_planner/src => src}/sql/types.rs (100%) rename {dask_planner/src => src}/sql/types/rel_data_type.rs (100%) rename {dask_planner/src => src}/sql/types/rel_data_type_field.rs (100%) diff --git a/dask_planner/.cargo/config.toml b/.cargo/config.toml similarity index 100% rename from dask_planner/.cargo/config.toml rename to .cargo/config.toml diff --git a/.gitignore b/.gitignore index 245817fc1..d41df8a68 100644 --- a/.gitignore +++ b/.gitignore @@ -46,23 +46,15 @@ venv # IDE .idea .vscode -planner/.classpath -planner/.project -planner/.settings/ -planner/.idea -planner/*.iml *.swp # project specific -planner/dependency-reduced-pom.xml -planner/target/ -dask_sql/jar -.next/ dask-worker-space/ node_modules/ docs/source/_build/ tests/unit/queries tests/unit/data +target/* # Ignore development specific local testing files dev_tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ed701014a..094c4ada1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -20,9 +20,9 @@ repos: rev: v1.0 hooks: - id: cargo-check - args: ['--manifest-path', './dask_planner/Cargo.toml', '--verbose', '--'] + args: ['--manifest-path', './Cargo.toml', '--verbose', '--'] - id: clippy - args: ['--manifest-path', './dask_planner/Cargo.toml', '--verbose', '--', '-D', 'warnings'] + args: ['--manifest-path', './Cargo.toml', '--verbose', '--', '-D', 'warnings'] - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.2.0 hooks: @@ -39,4 +39,4 @@ repos: entry: cargo +nightly fmt language: system types: [rust] - args: ['--manifest-path', './dask_planner/Cargo.toml', '--verbose', '--'] + args: ['--manifest-path', './Cargo.toml', '--verbose', '--'] diff --git a/dask_planner/Cargo.lock b/Cargo.lock similarity index 99% rename from dask_planner/Cargo.lock rename to Cargo.lock index 1a75a215d..accc91264 100644 --- a/dask_planner/Cargo.lock +++ b/Cargo.lock @@ -658,7 +658,7 @@ dependencies = [ ] [[package]] -name = "dask_planner" +name = "dask-planner" version = "0.1.0" dependencies = [ "async-trait", diff --git a/dask_planner/Cargo.toml b/Cargo.toml similarity index 83% rename from dask_planner/Cargo.toml rename to Cargo.toml index 8a849628b..eefc51a32 100644 --- a/dask_planner/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "dask_planner" +name = "dask-planner" repository = "https://github.com/dask-contrib/dask-sql" version = "0.1.0" description = "Bindings for DataFusion used by Dask-SQL" @@ -8,16 +8,20 @@ license = "Apache-2.0" edition = "2021" rust-version = "1.65" +[lib] +name = "dask_planner" +crate-type = ["cdylib"] + [dependencies] async-trait = "0.1.71" datafusion-python = "27.0.0" env_logger = "0.10" log = "^0.4" -pyo3 = { version = "0.19.0", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-log = "0.8.2" +[dependencies.pyo3] +version = "0.19.0" +features = ["abi3-py38"] + [build-dependencies] pyo3-build-config = "0.19.1" - -[lib] -crate-type = ["cdylib"] diff --git a/dask_planner/update-dependencies.sh b/continuous_integration/scripts/update-dependencies.sh similarity index 100% rename from dask_planner/update-dependencies.sh rename to continuous_integration/scripts/update-dependencies.sh diff --git a/dask_planner/.classpath b/dask_planner/.classpath deleted file mode 100644 index b14b13a76..000000000 --- a/dask_planner/.classpath +++ /dev/null @@ -1,55 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/dask_planner/.gitignore b/dask_planner/.gitignore deleted file mode 100644 index c8f044299..000000000 --- a/dask_planner/.gitignore +++ /dev/null @@ -1,72 +0,0 @@ -/target - -# Byte-compiled / optimized / DLL files -__pycache__/ -.pytest_cache/ -*.py[cod] - -# C extensions -*.so - -# Distribution / packaging -.Python -.venv/ -env/ -bin/ -build/ -develop-eggs/ -dist/ -eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -include/ -man/ -venv/ -*.egg-info/ -.installed.cfg -*.egg - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt -pip-selfcheck.json - -# Unit test / coverage reports -htmlcov/ -.tox/ -.coverage -.cache -nosetests.xml -coverage.xml - -# Translations -*.mo - -# Mr Developer -.mr.developer.cfg -.project -.pydevproject - -# Rope -.ropeproject - -# Django stuff: -*.log -*.pot - -.DS_Store - -# Sphinx documentation -docs/_build/ - -# PyCharm -.idea/ - -# VSCode -.vscode/ - -# Pyenv -.python-version diff --git a/dask_planner/.settings/org.eclipse.core.resources.prefs b/dask_planner/.settings/org.eclipse.core.resources.prefs deleted file mode 100644 index 92920805e..000000000 --- a/dask_planner/.settings/org.eclipse.core.resources.prefs +++ /dev/null @@ -1,5 +0,0 @@ -eclipse.preferences.version=1 -encoding//src/main/java=UTF-8 -encoding//src/main/resources=UTF-8 -encoding//target/generated-sources/annotations=UTF-8 -encoding/=UTF-8 diff --git a/dask_planner/.settings/org.eclipse.jdt.apt.core.prefs b/dask_planner/.settings/org.eclipse.jdt.apt.core.prefs deleted file mode 100644 index d4313d4b2..000000000 --- a/dask_planner/.settings/org.eclipse.jdt.apt.core.prefs +++ /dev/null @@ -1,2 +0,0 @@ -eclipse.preferences.version=1 -org.eclipse.jdt.apt.aptEnabled=false diff --git a/dask_planner/.settings/org.eclipse.jdt.core.prefs b/dask_planner/.settings/org.eclipse.jdt.core.prefs deleted file mode 100644 index 1b6e1ef22..000000000 --- a/dask_planner/.settings/org.eclipse.jdt.core.prefs +++ /dev/null @@ -1,9 +0,0 @@ -eclipse.preferences.version=1 -org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 -org.eclipse.jdt.core.compiler.compliance=1.8 -org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled -org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning -org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=ignore -org.eclipse.jdt.core.compiler.processAnnotations=disabled -org.eclipse.jdt.core.compiler.release=disabled -org.eclipse.jdt.core.compiler.source=1.8 diff --git a/dask_planner/.settings/org.eclipse.m2e.core.prefs b/dask_planner/.settings/org.eclipse.m2e.core.prefs deleted file mode 100644 index f897a7f1c..000000000 --- a/dask_planner/.settings/org.eclipse.m2e.core.prefs +++ /dev/null @@ -1,4 +0,0 @@ -activeProfiles= -eclipse.preferences.version=1 -resolveWorkspaceProjects=true -version=1 diff --git a/dask_planner/MANIFEST.in b/dask_planner/MANIFEST.in deleted file mode 100644 index 7c68298bd..000000000 --- a/dask_planner/MANIFEST.in +++ /dev/null @@ -1,2 +0,0 @@ -include Cargo.toml -recursive-include src * diff --git a/dask_planner/README.md b/dask_planner/README.md deleted file mode 100644 index e69de29bb..000000000 diff --git a/dask_planner/pyproject.toml b/dask_planner/pyproject.toml deleted file mode 100644 index f153e3f5a..000000000 --- a/dask_planner/pyproject.toml +++ /dev/null @@ -1,11 +0,0 @@ -[build-system] -requires = ["setuptools", "wheel", "setuptools-rust"] - -[project] -name = "datafusion_planner" -requires-python = ">=3.8" -classifiers = [ - "Programming Language :: Rust", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", -] diff --git a/dask_sql/context.py b/dask_sql/context.py index 81e0a38a1..03947fa76 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -9,8 +9,7 @@ from dask import config as dask_config from dask.base import optimize from dask.utils_test import hlg_layer - -from dask_planner.rust import ( +from dask_planner import ( DaskSchema, DaskSQLContext, DaskTable, diff --git a/dask_sql/input_utils/hive.py b/dask_sql/input_utils/hive.py index 4d0eb9cce..5d500180d 100644 --- a/dask_sql/input_utils/hive.py +++ b/dask_sql/input_utils/hive.py @@ -5,8 +5,7 @@ from typing import Any, Union import dask.dataframe as dd - -from dask_planner.rust import SqlTypeName +from dask_planner import SqlTypeName try: from pyhive import hive diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index 9ba22f797..3d39ee392 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -7,8 +7,7 @@ import dask.dataframe as dd import numpy as np import pandas as pd - -from dask_planner.rust import DaskTypeMap, SqlTypeName +from dask_planner import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index a1f378197..f4463fe62 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -7,8 +7,9 @@ from dask_sql.mappings import cast_column_type, sql_to_python_type if TYPE_CHECKING: + from dask_planner import LogicalPlan, RelDataType + import dask_sql - from dask_planner.rust import LogicalPlan, RelDataType logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index 29ad8c327..24b06c337 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -7,8 +7,9 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/alter.py b/dask_sql/physical/rel/custom/alter.py index 9c8a159b0..16ed9e9bb 100644 --- a/dask_sql/physical/rel/custom/alter.py +++ b/dask_sql/physical/rel/custom/alter.py @@ -6,8 +6,9 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class AlterSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/analyze_table.py b/dask_sql/physical/rel/custom/analyze_table.py index 69f734a54..77edfff4b 100644 --- a/dask_sql/physical/rel/custom/analyze_table.py +++ b/dask_sql/physical/rel/custom/analyze_table.py @@ -8,8 +8,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class AnalyzeTablePlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/create_catalog_schema.py b/dask_sql/physical/rel/custom/create_catalog_schema.py index 52ed37b55..74f964621 100644 --- a/dask_sql/physical/rel/custom/create_catalog_schema.py +++ b/dask_sql/physical/rel/custom/create_catalog_schema.py @@ -4,8 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_table.py b/dask_sql/physical/rel/custom/create_table.py index 36b165230..526ec9728 100644 --- a/dask_sql/physical/rel/custom/create_table.py +++ b/dask_sql/physical/rel/custom/create_table.py @@ -6,8 +6,9 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/describe_model.py b/dask_sql/physical/rel/custom/describe_model.py index d915a6b0b..8b2e144ff 100644 --- a/dask_sql/physical/rel/custom/describe_model.py +++ b/dask_sql/physical/rel/custom/describe_model.py @@ -7,8 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class DescribeModelPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/distributeby.py b/dask_sql/physical/rel/custom/distributeby.py index c7ce70610..6b6dba0b8 100644 --- a/dask_sql/physical/rel/custom/distributeby.py +++ b/dask_sql/physical/rel/custom/distributeby.py @@ -6,8 +6,9 @@ from dask_sql.utils import LoggableDataFrame if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/drop_schema.py b/dask_sql/physical/rel/custom/drop_schema.py index 444662e2b..455b27fa4 100644 --- a/dask_sql/physical/rel/custom/drop_schema.py +++ b/dask_sql/physical/rel/custom/drop_schema.py @@ -4,8 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/export_model.py b/dask_sql/physical/rel/custom/export_model.py index 07cf9979e..c96d19786 100644 --- a/dask_sql/physical/rel/custom/export_model.py +++ b/dask_sql/physical/rel/custom/export_model.py @@ -6,8 +6,9 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/predict_model.py b/dask_sql/physical/rel/custom/predict_model.py index 917d712c3..c0339b1d7 100644 --- a/dask_sql/physical/rel/custom/predict_model.py +++ b/dask_sql/physical/rel/custom/predict_model.py @@ -9,8 +9,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/show_models.py b/dask_sql/physical/rel/custom/show_models.py index 3f879dd38..ecc81e82a 100644 --- a/dask_sql/physical/rel/custom/show_models.py +++ b/dask_sql/physical/rel/custom/show_models.py @@ -7,8 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class ShowModelsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/use_schema.py b/dask_sql/physical/rel/custom/use_schema.py index 889dd2b1c..563415c2d 100644 --- a/dask_sql/physical/rel/custom/use_schema.py +++ b/dask_sql/physical/rel/custom/use_schema.py @@ -4,8 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class UseSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index a14900f99..27f5c102c 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -15,8 +15,9 @@ from dask_sql.utils import is_cudf_type, new_temporary_column if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/cross_join.py b/dask_sql/physical/rel/logical/cross_join.py index 5f32d3257..dfa8cdf3c 100644 --- a/dask_sql/physical/rel/logical/cross_join.py +++ b/dask_sql/physical/rel/logical/cross_join.py @@ -6,8 +6,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/empty.py b/dask_sql/physical/rel/logical/empty.py index 23f8d1cd3..b50699b79 100644 --- a/dask_sql/physical/rel/logical/empty.py +++ b/dask_sql/physical/rel/logical/empty.py @@ -8,8 +8,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/explain.py b/dask_sql/physical/rel/logical/explain.py index 69d20fca3..abf1d814c 100644 --- a/dask_sql/physical/rel/logical/explain.py +++ b/dask_sql/physical/rel/logical/explain.py @@ -3,8 +3,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class ExplainPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index d3c3f5fd3..a37e390ec 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -11,8 +11,9 @@ from dask_sql.physical.utils.filter import attempt_predicate_pushdown if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index c1c904af6..cec7df4d9 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -17,8 +17,9 @@ from dask_sql.utils import is_cudf_type if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index 3e2fc6434..00ba37fa2 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -11,8 +11,9 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class DaskLimitPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/project.py b/dask_sql/physical/rel/logical/project.py index b990e21b4..4630b5d6b 100644 --- a/dask_sql/physical/rel/logical/project.py +++ b/dask_sql/physical/rel/logical/project.py @@ -1,15 +1,17 @@ import logging from typing import TYPE_CHECKING -from dask_planner.rust import RexType +from dask_planner import RexType + from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter from dask_sql.utils import new_temporary_column if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index 2e1376d41..6dc57211c 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -5,8 +5,9 @@ from dask_sql.physical.utils.sort import apply_sort if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class DaskSortPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/subquery_alias.py b/dask_sql/physical/rel/logical/subquery_alias.py index 2473167d7..e82d9b105 100644 --- a/dask_sql/physical/rel/logical/subquery_alias.py +++ b/dask_sql/physical/rel/logical/subquery_alias.py @@ -4,8 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan class SubqueryAlias(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index b4025ec97..b3b5cab0a 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -11,8 +11,9 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/union.py b/dask_sql/physical/rel/logical/union.py index 830f7f981..1fbc5b5ae 100644 --- a/dask_sql/physical/rel/logical/union.py +++ b/dask_sql/physical/rel/logical/union.py @@ -6,8 +6,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan def _extract_df(obj_cc, obj_df, output_field_names): diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index 331876c49..bbcdae740 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -16,8 +16,9 @@ from dask_sql.utils import LoggableDataFrame, new_temporary_column if TYPE_CHECKING: + from dask_planner import LogicalPlan + import dask_sql - from dask_planner.rust import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/base.py b/dask_sql/physical/rex/base.py index 5724a4536..7f97a70d9 100644 --- a/dask_sql/physical/rex/base.py +++ b/dask_sql/physical/rex/base.py @@ -7,7 +7,7 @@ from dask_sql.datacontainer import DataContainer if TYPE_CHECKING: - from dask_planner.rust import Expression, LogicalPlan + from dask_planner import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/convert.py b/dask_sql/physical/rex/convert.py index 71431cbb4..fce64be30 100644 --- a/dask_sql/physical/rex/convert.py +++ b/dask_sql/physical/rex/convert.py @@ -8,8 +8,9 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/alias.py b/dask_sql/physical/rex/core/alias.py index 40c373766..d6ae20698 100644 --- a/dask_sql/physical/rex/core/alias.py +++ b/dask_sql/physical/rex/core/alias.py @@ -7,8 +7,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan class RexAliasPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 5ef1d7fb8..56d01d006 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -13,8 +13,8 @@ from dask.dataframe.core import Series from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data +from dask_planner import SqlTypeName -from dask_planner.rust import SqlTypeName from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200 from dask_sql.datacontainer import DataContainer from dask_sql.mappings import ( @@ -34,8 +34,9 @@ ) if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan logger = logging.getLogger(__name__) SeriesOrScalar = Union[dd.Series, Any] diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 4272c832e..01bf871c7 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -6,8 +6,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan class RexInputRefPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 73e3b8185..7fe59b383 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -4,15 +4,16 @@ import dask.dataframe as dd import numpy as np +from dask_planner import SqlTypeName -from dask_planner.rust import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/subquery.py b/dask_sql/physical/rex/core/subquery.py index 5e0a33098..1253f257d 100644 --- a/dask_sql/physical/rex/core/subquery.py +++ b/dask_sql/physical/rex/core/subquery.py @@ -7,8 +7,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: + from dask_planner import Expression, LogicalPlan + import dask_sql - from dask_planner.rust import Expression, LogicalPlan class RexScalarSubqueryPlugin(BaseRexPlugin): diff --git a/dask_sql/utils.py b/dask_sql/utils.py index 39c165597..c2cfe45ab 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -8,8 +8,8 @@ import dask.dataframe as dd import numpy as np import pandas as pd +from dask_planner import SqlTypeName -from dask_planner.rust import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value diff --git a/pyproject.toml b/pyproject.toml index dfed2ba50..17392d3b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,14 @@ [build-system] -requires = ["setuptools", "wheel", "setuptools-rust"] - -[tool.isort] -profile = "black" +requires = ["maturin>=1.0,<2.0"] +build-backend = "maturin" [tool.maturin] +features = ["pyo3/extension-module"] include = [ { path = "Cargo.lock", format = "sdist" } ] exclude = [".github/**", "ci/**", ".asf.yaml"] -# Require Cargo.lock is up to date locked = true + +[tool.isort] +profile = "black" diff --git a/setup.py b/setup.py index d149ac5f0..02693d0d6 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,6 @@ import sys from setuptools import find_packages, setup -from setuptools_rust import Binding, RustExtension import versioneer @@ -31,14 +30,6 @@ include=["dask_sql", "dask_sql.*", "dask_planner", "dask_planner.*"] ), package_data={"dask_sql": ["sql*.yaml"]}, - rust_extensions=[ - RustExtension( - "dask_planner.rust", - binding=Binding.PyO3, - path="dask_planner/Cargo.toml", - debug=debug_build, - ) - ], python_requires=">=3.8", setup_requires=sphinx_requirements, install_requires=[ diff --git a/dask_planner/src/dialect.rs b/src/dialect.rs similarity index 100% rename from dask_planner/src/dialect.rs rename to src/dialect.rs diff --git a/dask_planner/src/error.rs b/src/error.rs similarity index 100% rename from dask_planner/src/error.rs rename to src/error.rs diff --git a/dask_planner/src/expression.rs b/src/expression.rs similarity index 100% rename from dask_planner/src/expression.rs rename to src/expression.rs diff --git a/dask_planner/src/lib.rs b/src/lib.rs similarity index 97% rename from dask_planner/src/lib.rs rename to src/lib.rs index f5305d900..9f446b7e2 100644 --- a/dask_planner/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,7 @@ mod sql; /// The higher-level public API is defined in pure python files under the /// dask_planner directory. #[pymodule] -#[pyo3(name = "rust")] +#[pyo3(name = "dask_planner")] fn rust(py: Python, m: &PyModule) -> PyResult<()> { // Initialize the global Python logger instance pyo3_log::init(); diff --git a/dask_planner/src/parser.rs b/src/parser.rs similarity index 100% rename from dask_planner/src/parser.rs rename to src/parser.rs diff --git a/dask_planner/src/sql.rs b/src/sql.rs similarity index 100% rename from dask_planner/src/sql.rs rename to src/sql.rs diff --git a/dask_planner/src/sql/column.rs b/src/sql/column.rs similarity index 100% rename from dask_planner/src/sql/column.rs rename to src/sql/column.rs diff --git a/dask_planner/src/sql/exceptions.rs b/src/sql/exceptions.rs similarity index 100% rename from dask_planner/src/sql/exceptions.rs rename to src/sql/exceptions.rs diff --git a/dask_planner/src/sql/function.rs b/src/sql/function.rs similarity index 100% rename from dask_planner/src/sql/function.rs rename to src/sql/function.rs diff --git a/dask_planner/src/sql/logical.rs b/src/sql/logical.rs similarity index 100% rename from dask_planner/src/sql/logical.rs rename to src/sql/logical.rs diff --git a/dask_planner/src/sql/logical/aggregate.rs b/src/sql/logical/aggregate.rs similarity index 100% rename from dask_planner/src/sql/logical/aggregate.rs rename to src/sql/logical/aggregate.rs diff --git a/dask_planner/src/sql/logical/alter_schema.rs b/src/sql/logical/alter_schema.rs similarity index 100% rename from dask_planner/src/sql/logical/alter_schema.rs rename to src/sql/logical/alter_schema.rs diff --git a/dask_planner/src/sql/logical/alter_table.rs b/src/sql/logical/alter_table.rs similarity index 100% rename from dask_planner/src/sql/logical/alter_table.rs rename to src/sql/logical/alter_table.rs diff --git a/dask_planner/src/sql/logical/analyze_table.rs b/src/sql/logical/analyze_table.rs similarity index 100% rename from dask_planner/src/sql/logical/analyze_table.rs rename to src/sql/logical/analyze_table.rs diff --git a/dask_planner/src/sql/logical/create_catalog_schema.rs b/src/sql/logical/create_catalog_schema.rs similarity index 100% rename from dask_planner/src/sql/logical/create_catalog_schema.rs rename to src/sql/logical/create_catalog_schema.rs diff --git a/dask_planner/src/sql/logical/create_experiment.rs b/src/sql/logical/create_experiment.rs similarity index 100% rename from dask_planner/src/sql/logical/create_experiment.rs rename to src/sql/logical/create_experiment.rs diff --git a/dask_planner/src/sql/logical/create_memory_table.rs b/src/sql/logical/create_memory_table.rs similarity index 100% rename from dask_planner/src/sql/logical/create_memory_table.rs rename to src/sql/logical/create_memory_table.rs diff --git a/dask_planner/src/sql/logical/create_model.rs b/src/sql/logical/create_model.rs similarity index 100% rename from dask_planner/src/sql/logical/create_model.rs rename to src/sql/logical/create_model.rs diff --git a/dask_planner/src/sql/logical/create_table.rs b/src/sql/logical/create_table.rs similarity index 100% rename from dask_planner/src/sql/logical/create_table.rs rename to src/sql/logical/create_table.rs diff --git a/dask_planner/src/sql/logical/describe_model.rs b/src/sql/logical/describe_model.rs similarity index 100% rename from dask_planner/src/sql/logical/describe_model.rs rename to src/sql/logical/describe_model.rs diff --git a/dask_planner/src/sql/logical/drop_model.rs b/src/sql/logical/drop_model.rs similarity index 100% rename from dask_planner/src/sql/logical/drop_model.rs rename to src/sql/logical/drop_model.rs diff --git a/dask_planner/src/sql/logical/drop_schema.rs b/src/sql/logical/drop_schema.rs similarity index 100% rename from dask_planner/src/sql/logical/drop_schema.rs rename to src/sql/logical/drop_schema.rs diff --git a/dask_planner/src/sql/logical/drop_table.rs b/src/sql/logical/drop_table.rs similarity index 100% rename from dask_planner/src/sql/logical/drop_table.rs rename to src/sql/logical/drop_table.rs diff --git a/dask_planner/src/sql/logical/empty_relation.rs b/src/sql/logical/empty_relation.rs similarity index 100% rename from dask_planner/src/sql/logical/empty_relation.rs rename to src/sql/logical/empty_relation.rs diff --git a/dask_planner/src/sql/logical/explain.rs b/src/sql/logical/explain.rs similarity index 100% rename from dask_planner/src/sql/logical/explain.rs rename to src/sql/logical/explain.rs diff --git a/dask_planner/src/sql/logical/export_model.rs b/src/sql/logical/export_model.rs similarity index 100% rename from dask_planner/src/sql/logical/export_model.rs rename to src/sql/logical/export_model.rs diff --git a/dask_planner/src/sql/logical/filter.rs b/src/sql/logical/filter.rs similarity index 100% rename from dask_planner/src/sql/logical/filter.rs rename to src/sql/logical/filter.rs diff --git a/dask_planner/src/sql/logical/join.rs b/src/sql/logical/join.rs similarity index 100% rename from dask_planner/src/sql/logical/join.rs rename to src/sql/logical/join.rs diff --git a/dask_planner/src/sql/logical/limit.rs b/src/sql/logical/limit.rs similarity index 100% rename from dask_planner/src/sql/logical/limit.rs rename to src/sql/logical/limit.rs diff --git a/dask_planner/src/sql/logical/predict_model.rs b/src/sql/logical/predict_model.rs similarity index 100% rename from dask_planner/src/sql/logical/predict_model.rs rename to src/sql/logical/predict_model.rs diff --git a/dask_planner/src/sql/logical/projection.rs b/src/sql/logical/projection.rs similarity index 100% rename from dask_planner/src/sql/logical/projection.rs rename to src/sql/logical/projection.rs diff --git a/dask_planner/src/sql/logical/repartition_by.rs b/src/sql/logical/repartition_by.rs similarity index 100% rename from dask_planner/src/sql/logical/repartition_by.rs rename to src/sql/logical/repartition_by.rs diff --git a/dask_planner/src/sql/logical/show_columns.rs b/src/sql/logical/show_columns.rs similarity index 100% rename from dask_planner/src/sql/logical/show_columns.rs rename to src/sql/logical/show_columns.rs diff --git a/dask_planner/src/sql/logical/show_models.rs b/src/sql/logical/show_models.rs similarity index 100% rename from dask_planner/src/sql/logical/show_models.rs rename to src/sql/logical/show_models.rs diff --git a/dask_planner/src/sql/logical/show_schemas.rs b/src/sql/logical/show_schemas.rs similarity index 100% rename from dask_planner/src/sql/logical/show_schemas.rs rename to src/sql/logical/show_schemas.rs diff --git a/dask_planner/src/sql/logical/show_tables.rs b/src/sql/logical/show_tables.rs similarity index 100% rename from dask_planner/src/sql/logical/show_tables.rs rename to src/sql/logical/show_tables.rs diff --git a/dask_planner/src/sql/logical/sort.rs b/src/sql/logical/sort.rs similarity index 100% rename from dask_planner/src/sql/logical/sort.rs rename to src/sql/logical/sort.rs diff --git a/dask_planner/src/sql/logical/subquery_alias.rs b/src/sql/logical/subquery_alias.rs similarity index 100% rename from dask_planner/src/sql/logical/subquery_alias.rs rename to src/sql/logical/subquery_alias.rs diff --git a/dask_planner/src/sql/logical/table_scan.rs b/src/sql/logical/table_scan.rs similarity index 100% rename from dask_planner/src/sql/logical/table_scan.rs rename to src/sql/logical/table_scan.rs diff --git a/dask_planner/src/sql/logical/use_schema.rs b/src/sql/logical/use_schema.rs similarity index 100% rename from dask_planner/src/sql/logical/use_schema.rs rename to src/sql/logical/use_schema.rs diff --git a/dask_planner/src/sql/logical/window.rs b/src/sql/logical/window.rs similarity index 100% rename from dask_planner/src/sql/logical/window.rs rename to src/sql/logical/window.rs diff --git a/dask_planner/src/sql/optimizer.rs b/src/sql/optimizer.rs similarity index 100% rename from dask_planner/src/sql/optimizer.rs rename to src/sql/optimizer.rs diff --git a/dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs b/src/sql/optimizer/dynamic_partition_pruning.rs similarity index 100% rename from dask_planner/src/sql/optimizer/dynamic_partition_pruning.rs rename to src/sql/optimizer/dynamic_partition_pruning.rs diff --git a/dask_planner/src/sql/optimizer/join_reorder.rs b/src/sql/optimizer/join_reorder.rs similarity index 100% rename from dask_planner/src/sql/optimizer/join_reorder.rs rename to src/sql/optimizer/join_reorder.rs diff --git a/dask_planner/src/sql/parser_utils.rs b/src/sql/parser_utils.rs similarity index 100% rename from dask_planner/src/sql/parser_utils.rs rename to src/sql/parser_utils.rs diff --git a/dask_planner/src/sql/schema.rs b/src/sql/schema.rs similarity index 100% rename from dask_planner/src/sql/schema.rs rename to src/sql/schema.rs diff --git a/dask_planner/src/sql/statement.rs b/src/sql/statement.rs similarity index 100% rename from dask_planner/src/sql/statement.rs rename to src/sql/statement.rs diff --git a/dask_planner/src/sql/table.rs b/src/sql/table.rs similarity index 100% rename from dask_planner/src/sql/table.rs rename to src/sql/table.rs diff --git a/dask_planner/src/sql/types.rs b/src/sql/types.rs similarity index 100% rename from dask_planner/src/sql/types.rs rename to src/sql/types.rs diff --git a/dask_planner/src/sql/types/rel_data_type.rs b/src/sql/types/rel_data_type.rs similarity index 100% rename from dask_planner/src/sql/types/rel_data_type.rs rename to src/sql/types/rel_data_type.rs diff --git a/dask_planner/src/sql/types/rel_data_type_field.rs b/src/sql/types/rel_data_type_field.rs similarity index 100% rename from dask_planner/src/sql/types/rel_data_type_field.rs rename to src/sql/types/rel_data_type_field.rs diff --git a/tests/unit/test_mapping.py b/tests/unit/test_mapping.py index b49ed1aae..952bcb10e 100644 --- a/tests/unit/test_mapping.py +++ b/tests/unit/test_mapping.py @@ -3,8 +3,8 @@ import numpy as np import pandas as pd import pytest +from dask_planner import SqlTypeName -from dask_planner.rust import SqlTypeName from dask_sql.mappings import python_to_sql_type, similar_type, sql_to_python_value From d900f0ed4a54fe2091b93fc905ae3c6e29dac8f4 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 12:27:29 -0400 Subject: [PATCH 16/89] Make some modifications to Rust package name --- Cargo.lock | 4 +- Cargo.toml | 13 ++-- dask_sql/context.py | 3 +- dask_sql/input_utils/hive.py | 3 +- dask_sql/mappings.py | 3 +- dask_sql/physical/rel/base.py | 4 +- dask_sql/physical/rel/convert.py | 4 +- dask_sql/physical/rel/custom/alter.py | 4 +- dask_sql/physical/rel/custom/analyze_table.py | 4 +- .../rel/custom/create_catalog_schema.py | 4 +- .../rel/custom/create_memory_table.py | 3 +- dask_sql/physical/rel/custom/create_table.py | 4 +- .../physical/rel/custom/describe_model.py | 4 +- dask_sql/physical/rel/custom/distributeby.py | 4 +- dask_sql/physical/rel/custom/drop_schema.py | 4 +- dask_sql/physical/rel/custom/export_model.py | 4 +- dask_sql/physical/rel/custom/predict_model.py | 4 +- dask_sql/physical/rel/custom/show_columns.py | 3 +- dask_sql/physical/rel/custom/show_models.py | 4 +- dask_sql/physical/rel/custom/show_schemas.py | 3 +- dask_sql/physical/rel/custom/show_tables.py | 3 +- dask_sql/physical/rel/custom/use_schema.py | 4 +- dask_sql/physical/rel/logical/aggregate.py | 4 +- dask_sql/physical/rel/logical/cross_join.py | 4 +- dask_sql/physical/rel/logical/empty.py | 4 +- dask_sql/physical/rel/logical/explain.py | 4 +- dask_sql/physical/rel/logical/filter.py | 4 +- dask_sql/physical/rel/logical/join.py | 4 +- dask_sql/physical/rel/logical/limit.py | 4 +- dask_sql/physical/rel/logical/project.py | 8 +- dask_sql/physical/rel/logical/sort.py | 4 +- .../physical/rel/logical/subquery_alias.py | 4 +- dask_sql/physical/rel/logical/table_scan.py | 4 +- dask_sql/physical/rel/logical/union.py | 4 +- dask_sql/physical/rel/logical/window.py | 4 +- dask_sql/physical/rex/base.py | 2 +- dask_sql/physical/rex/convert.py | 4 +- dask_sql/physical/rex/core/alias.py | 4 +- dask_sql/physical/rex/core/call.py | 7 +- dask_sql/physical/rex/core/input_ref.py | 4 +- dask_sql/physical/rex/core/literal.py | 7 +- dask_sql/physical/rex/core/subquery.py | 4 +- dask_sql/utils.py | 3 +- pyproject.toml | 74 ++++++++++++++++++- setup.py | 74 +------------------ src/lib.rs | 3 +- src/sql.rs | 2 +- src/sql/column.rs | 2 +- src/sql/function.rs | 2 +- src/sql/logical.rs | 2 +- src/sql/logical/aggregate.rs | 2 +- src/sql/logical/alter_schema.rs | 2 +- src/sql/logical/alter_table.rs | 2 +- src/sql/logical/analyze_table.rs | 2 +- src/sql/logical/create_catalog_schema.rs | 2 +- src/sql/logical/create_experiment.rs | 2 +- src/sql/logical/create_memory_table.rs | 2 +- src/sql/logical/create_model.rs | 2 +- src/sql/logical/create_table.rs | 2 +- src/sql/logical/describe_model.rs | 2 +- src/sql/logical/drop_model.rs | 2 +- src/sql/logical/drop_schema.rs | 2 +- src/sql/logical/drop_table.rs | 2 +- src/sql/logical/empty_relation.rs | 2 +- src/sql/logical/explain.rs | 2 +- src/sql/logical/export_model.rs | 2 +- src/sql/logical/filter.rs | 2 +- src/sql/logical/join.rs | 2 +- src/sql/logical/limit.rs | 2 +- src/sql/logical/predict_model.rs | 2 +- src/sql/logical/projection.rs | 2 +- src/sql/logical/repartition_by.rs | 2 +- src/sql/logical/show_columns.rs | 2 +- src/sql/logical/show_models.rs | 2 +- src/sql/logical/show_schemas.rs | 2 +- src/sql/logical/show_tables.rs | 2 +- src/sql/logical/sort.rs | 2 +- src/sql/logical/subquery_alias.rs | 2 +- src/sql/logical/table_scan.rs | 4 +- src/sql/logical/use_schema.rs | 2 +- src/sql/logical/window.rs | 6 +- src/sql/schema.rs | 2 +- src/sql/statement.rs | 2 +- src/sql/table.rs | 4 +- src/sql/types/rel_data_type.rs | 2 +- src/sql/types/rel_data_type_field.rs | 2 +- tests/unit/test_mapping.py | 3 +- 87 files changed, 214 insertions(+), 209 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index accc91264..cb035a053 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -658,8 +658,8 @@ dependencies = [ ] [[package]] -name = "dask-planner" -version = "0.1.0" +name = "dask-sql" +version = "2023.6.0" dependencies = [ "async-trait", "datafusion-python", diff --git a/Cargo.toml b/Cargo.toml index eefc51a32..11444b09c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] -name = "dask-planner" +name = "dask-sql" repository = "https://github.com/dask-contrib/dask-sql" -version = "0.1.0" +version = "2023.6.0" description = "Bindings for DataFusion used by Dask-SQL" readme = "README.md" license = "Apache-2.0" @@ -9,19 +9,16 @@ edition = "2021" rust-version = "1.65" [lib] -name = "dask_planner" -crate-type = ["cdylib"] +name = "dask_sql" +crate-type = ["cdylib", "rlib"] [dependencies] async-trait = "0.1.71" datafusion-python = "27.0.0" env_logger = "0.10" log = "^0.4" +pyo3 = { version = "0.19.0", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-log = "0.8.2" -[dependencies.pyo3] -version = "0.19.0" -features = ["abi3-py38"] - [build-dependencies] pyo3-build-config = "0.19.1" diff --git a/dask_sql/context.py b/dask_sql/context.py index 03947fa76..fb97ad47c 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -9,7 +9,8 @@ from dask import config as dask_config from dask.base import optimize from dask.utils_test import hlg_layer -from dask_planner import ( + +from ._internal import ( DaskSchema, DaskSQLContext, DaskTable, diff --git a/dask_sql/input_utils/hive.py b/dask_sql/input_utils/hive.py index 5d500180d..fb1117289 100644 --- a/dask_sql/input_utils/hive.py +++ b/dask_sql/input_utils/hive.py @@ -5,7 +5,8 @@ from typing import Any, Union import dask.dataframe as dd -from dask_planner import SqlTypeName + +from ._internal import SqlTypeName try: from pyhive import hive diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index 3d39ee392..f0d9c74d1 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -7,7 +7,8 @@ import dask.dataframe as dd import numpy as np import pandas as pd -from dask_planner import DaskTypeMap, SqlTypeName + +from ._internal import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index f4463fe62..ce28aeb28 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -7,10 +7,10 @@ from dask_sql.mappings import cast_column_type, sql_to_python_type if TYPE_CHECKING: - from dask_planner import LogicalPlan, RelDataType - import dask_sql + from ._internal import LogicalPlan, RelDataType + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index 24b06c337..6a17ac94d 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -7,10 +7,10 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/alter.py b/dask_sql/physical/rel/custom/alter.py index 16ed9e9bb..f8e92671d 100644 --- a/dask_sql/physical/rel/custom/alter.py +++ b/dask_sql/physical/rel/custom/alter.py @@ -6,10 +6,10 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class AlterSchemaPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/custom/analyze_table.py b/dask_sql/physical/rel/custom/analyze_table.py index 77edfff4b..e42c0c229 100644 --- a/dask_sql/physical/rel/custom/analyze_table.py +++ b/dask_sql/physical/rel/custom/analyze_table.py @@ -8,10 +8,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class AnalyzeTablePlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/custom/create_catalog_schema.py b/dask_sql/physical/rel/custom/create_catalog_schema.py index 74f964621..1a28edd8c 100644 --- a/dask_sql/physical/rel/custom/create_catalog_schema.py +++ b/dask_sql/physical/rel/custom/create_catalog_schema.py @@ -4,10 +4,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_memory_table.py b/dask_sql/physical/rel/custom/create_memory_table.py index 760857563..32d4d1d8b 100644 --- a/dask_sql/physical/rel/custom/create_memory_table.py +++ b/dask_sql/physical/rel/custom/create_memory_table.py @@ -6,7 +6,8 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + + from ._internal import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_table.py b/dask_sql/physical/rel/custom/create_table.py index 526ec9728..0c4807d91 100644 --- a/dask_sql/physical/rel/custom/create_table.py +++ b/dask_sql/physical/rel/custom/create_table.py @@ -6,10 +6,10 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/describe_model.py b/dask_sql/physical/rel/custom/describe_model.py index 8b2e144ff..931930b1b 100644 --- a/dask_sql/physical/rel/custom/describe_model.py +++ b/dask_sql/physical/rel/custom/describe_model.py @@ -7,10 +7,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class DescribeModelPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/custom/distributeby.py b/dask_sql/physical/rel/custom/distributeby.py index 6b6dba0b8..e45623038 100644 --- a/dask_sql/physical/rel/custom/distributeby.py +++ b/dask_sql/physical/rel/custom/distributeby.py @@ -6,10 +6,10 @@ from dask_sql.utils import LoggableDataFrame if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/drop_schema.py b/dask_sql/physical/rel/custom/drop_schema.py index 455b27fa4..9df844398 100644 --- a/dask_sql/physical/rel/custom/drop_schema.py +++ b/dask_sql/physical/rel/custom/drop_schema.py @@ -4,10 +4,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/export_model.py b/dask_sql/physical/rel/custom/export_model.py index c96d19786..e3743406b 100644 --- a/dask_sql/physical/rel/custom/export_model.py +++ b/dask_sql/physical/rel/custom/export_model.py @@ -6,10 +6,10 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/predict_model.py b/dask_sql/physical/rel/custom/predict_model.py index c0339b1d7..e5866948b 100644 --- a/dask_sql/physical/rel/custom/predict_model.py +++ b/dask_sql/physical/rel/custom/predict_model.py @@ -9,10 +9,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/show_columns.py b/dask_sql/physical/rel/custom/show_columns.py index 6b0b94fe9..8a2ee0306 100644 --- a/dask_sql/physical/rel/custom/show_columns.py +++ b/dask_sql/physical/rel/custom/show_columns.py @@ -9,7 +9,8 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + + from ._internal import LogicalPlan class ShowColumnsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_models.py b/dask_sql/physical/rel/custom/show_models.py index ecc81e82a..64c656ad8 100644 --- a/dask_sql/physical/rel/custom/show_models.py +++ b/dask_sql/physical/rel/custom/show_models.py @@ -7,10 +7,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class ShowModelsPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/custom/show_schemas.py b/dask_sql/physical/rel/custom/show_schemas.py index 98b9f8ab3..0f3bfdf7e 100644 --- a/dask_sql/physical/rel/custom/show_schemas.py +++ b/dask_sql/physical/rel/custom/show_schemas.py @@ -8,7 +8,8 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + + from ._internal import LogicalPlan class ShowSchemasPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_tables.py b/dask_sql/physical/rel/custom/show_tables.py index d79b4052b..05b899949 100644 --- a/dask_sql/physical/rel/custom/show_tables.py +++ b/dask_sql/physical/rel/custom/show_tables.py @@ -8,7 +8,8 @@ if TYPE_CHECKING: import dask_sql - from dask_planner import LogicalPlan + + from ._internal import LogicalPlan class ShowTablesPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/use_schema.py b/dask_sql/physical/rel/custom/use_schema.py index 563415c2d..9186049f9 100644 --- a/dask_sql/physical/rel/custom/use_schema.py +++ b/dask_sql/physical/rel/custom/use_schema.py @@ -4,10 +4,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class UseSchemaPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index 27f5c102c..f228bd16c 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -15,10 +15,10 @@ from dask_sql.utils import is_cudf_type, new_temporary_column if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/cross_join.py b/dask_sql/physical/rel/logical/cross_join.py index dfa8cdf3c..94690e0bc 100644 --- a/dask_sql/physical/rel/logical/cross_join.py +++ b/dask_sql/physical/rel/logical/cross_join.py @@ -6,10 +6,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/empty.py b/dask_sql/physical/rel/logical/empty.py index b50699b79..202743a7b 100644 --- a/dask_sql/physical/rel/logical/empty.py +++ b/dask_sql/physical/rel/logical/empty.py @@ -8,10 +8,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/explain.py b/dask_sql/physical/rel/logical/explain.py index abf1d814c..4afd6870b 100644 --- a/dask_sql/physical/rel/logical/explain.py +++ b/dask_sql/physical/rel/logical/explain.py @@ -3,10 +3,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class ExplainPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index a37e390ec..58704ae5a 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -11,10 +11,10 @@ from dask_sql.physical.utils.filter import attempt_predicate_pushdown if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index cec7df4d9..ea5cfd4c2 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -17,10 +17,10 @@ from dask_sql.utils import is_cudf_type if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from ._internal import Expression, LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index 00ba37fa2..efb07a073 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -11,10 +11,10 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class DaskLimitPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/logical/project.py b/dask_sql/physical/rel/logical/project.py index 4630b5d6b..d4b41c046 100644 --- a/dask_sql/physical/rel/logical/project.py +++ b/dask_sql/physical/rel/logical/project.py @@ -1,18 +1,18 @@ import logging from typing import TYPE_CHECKING -from dask_planner import RexType - from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter from dask_sql.utils import new_temporary_column -if TYPE_CHECKING: - from dask_planner import LogicalPlan +from ._internal import RexType +if TYPE_CHECKING: import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index 6dc57211c..453c8895a 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -5,10 +5,10 @@ from dask_sql.physical.utils.sort import apply_sort if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class DaskSortPlugin(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/logical/subquery_alias.py b/dask_sql/physical/rel/logical/subquery_alias.py index e82d9b105..ba82391f0 100644 --- a/dask_sql/physical/rel/logical/subquery_alias.py +++ b/dask_sql/physical/rel/logical/subquery_alias.py @@ -4,10 +4,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + class SubqueryAlias(BaseRelPlugin): """ diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index b3b5cab0a..fa0e6b5bd 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -11,10 +11,10 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/union.py b/dask_sql/physical/rel/logical/union.py index 1fbc5b5ae..04ca0d150 100644 --- a/dask_sql/physical/rel/logical/union.py +++ b/dask_sql/physical/rel/logical/union.py @@ -6,10 +6,10 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + def _extract_df(obj_cc, obj_df, output_field_names): # For concatenating, they should have exactly the same fields diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index bbcdae740..793b71903 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -16,10 +16,10 @@ from dask_sql.utils import LoggableDataFrame, new_temporary_column if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from ._internal import LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/base.py b/dask_sql/physical/rex/base.py index 7f97a70d9..97692284b 100644 --- a/dask_sql/physical/rex/base.py +++ b/dask_sql/physical/rex/base.py @@ -7,7 +7,7 @@ from dask_sql.datacontainer import DataContainer if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan + from ._internal import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/convert.py b/dask_sql/physical/rex/convert.py index fce64be30..6cba4db8c 100644 --- a/dask_sql/physical/rex/convert.py +++ b/dask_sql/physical/rex/convert.py @@ -8,10 +8,10 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from ._internal import Expression, LogicalPlan + logger = logging.getLogger(__name__) _REX_TYPE_TO_PLUGIN = { diff --git a/dask_sql/physical/rex/core/alias.py b/dask_sql/physical/rex/core/alias.py index d6ae20698..7821e8d74 100644 --- a/dask_sql/physical/rex/core/alias.py +++ b/dask_sql/physical/rex/core/alias.py @@ -7,10 +7,10 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from ._internal import Expression, LogicalPlan + class RexAliasPlugin(BaseRexPlugin): """ diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 56d01d006..e06050823 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -13,7 +13,6 @@ from dask.dataframe.core import Series from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data -from dask_planner import SqlTypeName from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200 from dask_sql.datacontainer import DataContainer @@ -33,11 +32,13 @@ is_frame, ) -if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan +from ._internal import SqlTypeName +if TYPE_CHECKING: import dask_sql + from ._internal import Expression, LogicalPlan + logger = logging.getLogger(__name__) SeriesOrScalar = Union[dd.Series, Any] diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 01bf871c7..57cb1bd1d 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -6,10 +6,10 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from ._internal import Expression, LogicalPlan + class RexInputRefPlugin(BaseRexPlugin): """ diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 7fe59b383..952c157aa 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -4,17 +4,18 @@ import dask.dataframe as dd import numpy as np -from dask_planner import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin -if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan +from ._internal import SqlTypeName +if TYPE_CHECKING: import dask_sql + from ._internal import Expression, LogicalPlan + logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/subquery.py b/dask_sql/physical/rex/core/subquery.py index 1253f257d..7afb74c3e 100644 --- a/dask_sql/physical/rex/core/subquery.py +++ b/dask_sql/physical/rex/core/subquery.py @@ -7,10 +7,10 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from ._internal import Expression, LogicalPlan + class RexScalarSubqueryPlugin(BaseRexPlugin): """ diff --git a/dask_sql/utils.py b/dask_sql/utils.py index c2cfe45ab..6eed1ed29 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -8,11 +8,12 @@ import dask.dataframe as dd import numpy as np import pandas as pd -from dask_planner import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value +from ._internal import SqlTypeName + logger = logging.getLogger(__name__) diff --git a/pyproject.toml b/pyproject.toml index 17392d3b8..236b63350 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,81 @@ [build-system] -requires = ["maturin>=1.0,<2.0"] +requires = ["maturin>=0.15,<0.16"] build-backend = "maturin" +[project] +name = "dask_sql" +description = "SQL query layer for Dask" +maintainers = [{name = "Nils Braun", email = "nilslennartbraun@gmail.com"}] +license = {text = "MIT"} +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Programming Language :: Rust", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering", + "Topic :: System :: Distributed Computing", +] +readme = "README.md" +urls = {Homepage = "https://github.com/dask-contrib/dask-sql/"} +requires-python = ">=3.8" +dependencies = [ + "dask[dataframe]>=2022.3.0", + "distributed>=2022.3.0", + "pandas>=1.4.0", + # FIXME: handling is needed for httpx-based fastapi>=0.87.0 + "fastapi>=0.69.0,<0.87.0", + "uvicorn>=0.13.4", + "tzlocal>=2.1", + "prompt_toolkit>=3.0.8", + "pygments>=2.7.1", + "tabulate", +] +dynamic = ["version"] + +[project.optional-dependencies] +dev = [ + "pytest>=6.0.1", + "pytest-cov>=2.10.1", + "mock>=4.0.3", + "sphinx>=3.2.1", + "pyarrow>=6.0.1", + "scikit-learn>=1.0.0", + "intake>=0.6.0", + "pre-commit", + "black==22.10.0", + "isort==5.12.0", +] +fugue = ["fugue>=0.7.3"] + +[project.entry-points."fugue.plugins"] +dasksql = "dask_sql.integrations.fugue:_register_engines[fugue]" + +[project.scripts] +dask-sql = "dask_sql.cmd:main" +dask-sql-server = "dask_sql.server.app:main" + +[tool.setuptools] +include-package-data = true +zip-safe = false +license-files = ["LICENSE.txt"] + +[tool.setuptools.packages] +find = {namespaces = false} + [tool.maturin] -features = ["pyo3/extension-module"] +module-name = "dask_sql" include = [ { path = "Cargo.lock", format = "sdist" } ] -exclude = [".github/**", "ci/**", ".asf.yaml"] +exclude = [".github/**", "continuous_integration/**"] locked = true [tool.isort] diff --git a/setup.py b/setup.py index 02693d0d6..fcbb31faf 100644 --- a/setup.py +++ b/setup.py @@ -1,78 +1,8 @@ -import os -import sys - -from setuptools import find_packages, setup +from setuptools import setup import versioneer -long_description = "" -if os.path.exists("README.md"): - with open("README.md") as f: - long_description = f.read() - -needs_sphinx = "build_sphinx" in sys.argv -sphinx_requirements = ["sphinx>=3.2.1", "sphinx_rtd_theme"] if needs_sphinx else [] -debug_build = "debug" in sys.argv - -cmdclass = versioneer.get_cmdclass() - setup( - name="dask_sql", version=versioneer.get_version(), - description="SQL query layer for Dask", - url="https://github.com/dask-contrib/dask-sql/", - maintainer="Nils Braun", - maintainer_email="nilslennartbraun@gmail.com", - license="MIT", - long_description=long_description, - long_description_content_type="text/markdown", - packages=find_packages( - include=["dask_sql", "dask_sql.*", "dask_planner", "dask_planner.*"] - ), - package_data={"dask_sql": ["sql*.yaml"]}, - python_requires=">=3.8", - setup_requires=sphinx_requirements, - install_requires=[ - "dask[dataframe]>=2022.3.0", - "distributed>=2022.3.0", - "pandas>=1.4.0", - # FIXME: handling is needed for httpx-based fastapi>=0.87.0 - "fastapi>=0.69.0,<0.87.0", - "uvicorn>=0.13.4", - "tzlocal>=2.1", - "prompt_toolkit>=3.0.8", - "pygments>=2.7.1", - "tabulate", - ], - extras_require={ - "dev": [ - "pytest>=6.0.1", - "pytest-cov>=2.10.1", - "mock>=4.0.3", - "sphinx>=3.2.1", - "pyarrow>=6.0.1", - "scikit-learn>=1.0.0", - "intake>=0.6.0", - "pre-commit", - "black==22.10.0", - "isort==5.12.0", - ], - "fugue": ["fugue>=0.7.3"], - }, - entry_points={ - "console_scripts": [ - "dask-sql-server = dask_sql.server.app:main", - "dask-sql = dask_sql.cmd:main", - ], - "fugue.plugins": [ - "dasksql = dask_sql.integrations.fugue:_register_engines[fugue]" - ], - }, - zip_safe=False, - cmdclass=cmdclass, - command_options={ - "build_sphinx": { - "source_dir": ("setup.py", "docs"), - } - }, + cmdclass=versioneer.get_cmdclass(), ) diff --git a/src/lib.rs b/src/lib.rs index 9f446b7e2..1ced3e9d7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,8 +12,7 @@ mod sql; /// The higher-level public API is defined in pure python files under the /// dask_planner directory. #[pymodule] -#[pyo3(name = "dask_planner")] -fn rust(py: Python, m: &PyModule) -> PyResult<()> { +fn _internal(py: Python, m: &PyModule) -> PyResult<()> { // Initialize the global Python logger instance pyo3_log::init(); diff --git a/src/sql.rs b/src/sql.rs index 39d4614d4..585fcad4d 100644 --- a/src/sql.rs +++ b/src/sql.rs @@ -92,7 +92,7 @@ use crate::{ /// # Ok(()) /// # } /// ``` -#[pyclass(name = "DaskSQLContext", module = "dask_planner", subclass)] +#[pyclass(name = "DaskSQLContext", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskSQLContext { current_catalog: String, diff --git a/src/sql/column.rs b/src/sql/column.rs index 63f043901..32250c382 100644 --- a/src/sql/column.rs +++ b/src/sql/column.rs @@ -1,7 +1,7 @@ use datafusion_python::datafusion_common::Column; use pyo3::prelude::*; -#[pyclass(name = "Column", module = "dask_planner", subclass)] +#[pyclass(name = "Column", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyColumn { /// Original Column instance diff --git a/src/sql/function.rs b/src/sql/function.rs index 39fa7635e..4169d386c 100644 --- a/src/sql/function.rs +++ b/src/sql/function.rs @@ -5,7 +5,7 @@ use pyo3::prelude::*; use super::types::PyDataType; -#[pyclass(name = "DaskFunction", module = "dask_planner", subclass)] +#[pyclass(name = "DaskFunction", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskFunction { #[pyo3(get, set)] diff --git a/src/sql/logical.rs b/src/sql/logical.rs index 890f9aacb..e8f5f9f6f 100644 --- a/src/sql/logical.rs +++ b/src/sql/logical.rs @@ -62,7 +62,7 @@ use self::{ }; use crate::{error::Result, sql::exceptions::py_type_err}; -#[pyclass(name = "LogicalPlan", module = "dask_planner", subclass)] +#[pyclass(name = "LogicalPlan", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyLogicalPlan { /// The original LogicalPlan that was parsed by DataFusion from the input SQL diff --git a/src/sql/logical/aggregate.rs b/src/sql/logical/aggregate.rs index 870d8d7ab..a36750dba 100644 --- a/src/sql/logical/aggregate.rs +++ b/src/sql/logical/aggregate.rs @@ -11,7 +11,7 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "Aggregate", module = "dask_planner", subclass)] +#[pyclass(name = "Aggregate", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyAggregate { aggregate: Option, diff --git a/src/sql/logical/alter_schema.rs b/src/sql/logical/alter_schema.rs index 742ae513f..a7a8696b8 100644 --- a/src/sql/logical/alter_schema.rs +++ b/src/sql/logical/alter_schema.rs @@ -96,7 +96,7 @@ impl UserDefinedLogicalNode for AlterSchemaPlanNode { } } -#[pyclass(name = "AlterSchema", module = "dask_planner", subclass)] +#[pyclass(name = "AlterSchema", module = "dask_sql", subclass)] pub struct PyAlterSchema { pub(crate) alter_schema: AlterSchemaPlanNode, } diff --git a/src/sql/logical/alter_table.rs b/src/sql/logical/alter_table.rs index 7f51a15c3..d6b49315b 100644 --- a/src/sql/logical/alter_table.rs +++ b/src/sql/logical/alter_table.rs @@ -102,7 +102,7 @@ impl UserDefinedLogicalNode for AlterTablePlanNode { } } -#[pyclass(name = "AlterTable", module = "dask_planner", subclass)] +#[pyclass(name = "AlterTable", module = "dask_sql", subclass)] pub struct PyAlterTable { pub(crate) alter_table: AlterTablePlanNode, } diff --git a/src/sql/logical/analyze_table.rs b/src/sql/logical/analyze_table.rs index 9fa7fb219..6876c3704 100644 --- a/src/sql/logical/analyze_table.rs +++ b/src/sql/logical/analyze_table.rs @@ -99,7 +99,7 @@ impl UserDefinedLogicalNode for AnalyzeTablePlanNode { } } -#[pyclass(name = "AnalyzeTable", module = "dask_planner", subclass)] +#[pyclass(name = "AnalyzeTable", module = "dask_sql", subclass)] pub struct PyAnalyzeTable { pub(crate) analyze_table: AnalyzeTablePlanNode, } diff --git a/src/sql/logical/create_catalog_schema.rs b/src/sql/logical/create_catalog_schema.rs index bc89b02ce..82a1426af 100644 --- a/src/sql/logical/create_catalog_schema.rs +++ b/src/sql/logical/create_catalog_schema.rs @@ -95,7 +95,7 @@ impl UserDefinedLogicalNode for CreateCatalogSchemaPlanNode { } } -#[pyclass(name = "CreateCatalogSchema", module = "dask_planner", subclass)] +#[pyclass(name = "CreateCatalogSchema", module = "dask_sql", subclass)] pub struct PyCreateCatalogSchema { pub(crate) create_catalog_schema: CreateCatalogSchemaPlanNode, } diff --git a/src/sql/logical/create_experiment.rs b/src/sql/logical/create_experiment.rs index 313357d75..06fe9d856 100644 --- a/src/sql/logical/create_experiment.rs +++ b/src/sql/logical/create_experiment.rs @@ -105,7 +105,7 @@ impl UserDefinedLogicalNode for CreateExperimentPlanNode { } } -#[pyclass(name = "CreateExperiment", module = "dask_planner", subclass)] +#[pyclass(name = "CreateExperiment", module = "dask_sql", subclass)] pub struct PyCreateExperiment { pub(crate) create_experiment: CreateExperimentPlanNode, } diff --git a/src/sql/logical/create_memory_table.rs b/src/sql/logical/create_memory_table.rs index dd3d0753d..53ff9432e 100644 --- a/src/sql/logical/create_memory_table.rs +++ b/src/sql/logical/create_memory_table.rs @@ -7,7 +7,7 @@ use pyo3::prelude::*; use crate::sql::{exceptions::py_type_err, logical::PyLogicalPlan}; -#[pyclass(name = "CreateMemoryTable", module = "dask_planner", subclass)] +#[pyclass(name = "CreateMemoryTable", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyCreateMemoryTable { create_memory_table: Option, diff --git a/src/sql/logical/create_model.rs b/src/sql/logical/create_model.rs index 782fe3325..7dbcdff95 100644 --- a/src/sql/logical/create_model.rs +++ b/src/sql/logical/create_model.rs @@ -101,7 +101,7 @@ impl UserDefinedLogicalNode for CreateModelPlanNode { } } -#[pyclass(name = "CreateModel", module = "dask_planner", subclass)] +#[pyclass(name = "CreateModel", module = "dask_sql", subclass)] pub struct PyCreateModel { pub(crate) create_model: CreateModelPlanNode, } diff --git a/src/sql/logical/create_table.rs b/src/sql/logical/create_table.rs index 9271130c7..1c423415f 100644 --- a/src/sql/logical/create_table.rs +++ b/src/sql/logical/create_table.rs @@ -100,7 +100,7 @@ impl UserDefinedLogicalNode for CreateTablePlanNode { } } -#[pyclass(name = "CreateTable", module = "dask_planner", subclass)] +#[pyclass(name = "CreateTable", module = "dask_sql", subclass)] pub struct PyCreateTable { pub(crate) create_table: CreateTablePlanNode, } diff --git a/src/sql/logical/describe_model.rs b/src/sql/logical/describe_model.rs index cb2087376..3e3563fe1 100644 --- a/src/sql/logical/describe_model.rs +++ b/src/sql/logical/describe_model.rs @@ -89,7 +89,7 @@ impl UserDefinedLogicalNode for DescribeModelPlanNode { } } -#[pyclass(name = "DescribeModel", module = "dask_planner", subclass)] +#[pyclass(name = "DescribeModel", module = "dask_sql", subclass)] pub struct PyDescribeModel { pub(crate) describe_model: DescribeModelPlanNode, } diff --git a/src/sql/logical/drop_model.rs b/src/sql/logical/drop_model.rs index 71074905d..2715cb067 100644 --- a/src/sql/logical/drop_model.rs +++ b/src/sql/logical/drop_model.rs @@ -92,7 +92,7 @@ impl UserDefinedLogicalNode for DropModelPlanNode { } } -#[pyclass(name = "DropModel", module = "dask_planner", subclass)] +#[pyclass(name = "DropModel", module = "dask_sql", subclass)] pub struct PyDropModel { pub(crate) drop_model: DropModelPlanNode, } diff --git a/src/sql/logical/drop_schema.rs b/src/sql/logical/drop_schema.rs index 2022a61c9..78d252d11 100644 --- a/src/sql/logical/drop_schema.rs +++ b/src/sql/logical/drop_schema.rs @@ -88,7 +88,7 @@ impl UserDefinedLogicalNode for DropSchemaPlanNode { } } -#[pyclass(name = "DropSchema", module = "dask_planner", subclass)] +#[pyclass(name = "DropSchema", module = "dask_sql", subclass)] pub struct PyDropSchema { pub(crate) drop_schema: DropSchemaPlanNode, } diff --git a/src/sql/logical/drop_table.rs b/src/sql/logical/drop_table.rs index f91baf28a..504a104c1 100644 --- a/src/sql/logical/drop_table.rs +++ b/src/sql/logical/drop_table.rs @@ -6,7 +6,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "DropTable", module = "dask_planner", subclass)] +#[pyclass(name = "DropTable", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyDropTable { drop_table: DropTable, diff --git a/src/sql/logical/empty_relation.rs b/src/sql/logical/empty_relation.rs index 5bd6659ce..6356f9c85 100644 --- a/src/sql/logical/empty_relation.rs +++ b/src/sql/logical/empty_relation.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "EmptyRelation", module = "dask_planner", subclass)] +#[pyclass(name = "EmptyRelation", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyEmptyRelation { empty_relation: EmptyRelation, diff --git a/src/sql/logical/explain.rs b/src/sql/logical/explain.rs index 17f1e4ee2..839a731d8 100644 --- a/src/sql/logical/explain.rs +++ b/src/sql/logical/explain.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "Explain", module = "dask_planner", subclass)] +#[pyclass(name = "Explain", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyExplain { explain: Explain, diff --git a/src/sql/logical/export_model.rs b/src/sql/logical/export_model.rs index e38551b58..58b5f7fad 100644 --- a/src/sql/logical/export_model.rs +++ b/src/sql/logical/export_model.rs @@ -95,7 +95,7 @@ impl UserDefinedLogicalNode for ExportModelPlanNode { } } -#[pyclass(name = "ExportModel", module = "dask_planner", subclass)] +#[pyclass(name = "ExportModel", module = "dask_sql", subclass)] pub struct PyExportModel { pub(crate) export_model: ExportModelPlanNode, } diff --git a/src/sql/logical/filter.rs b/src/sql/logical/filter.rs index a50d508ff..f2dc2e702 100644 --- a/src/sql/logical/filter.rs +++ b/src/sql/logical/filter.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::{expression::PyExpr, sql::exceptions::py_type_err}; -#[pyclass(name = "Filter", module = "dask_planner", subclass)] +#[pyclass(name = "Filter", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyFilter { filter: Filter, diff --git a/src/sql/logical/join.rs b/src/sql/logical/join.rs index d6c31b55b..3261e9217 100644 --- a/src/sql/logical/join.rs +++ b/src/sql/logical/join.rs @@ -15,7 +15,7 @@ use crate::{ sql::{column, exceptions::py_type_err}, }; -#[pyclass(name = "Join", module = "dask_planner", subclass)] +#[pyclass(name = "Join", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyJoin { join: Join, diff --git a/src/sql/logical/limit.rs b/src/sql/logical/limit.rs index 189fdeea0..04d783fdd 100644 --- a/src/sql/logical/limit.rs +++ b/src/sql/logical/limit.rs @@ -6,7 +6,7 @@ use pyo3::prelude::*; use crate::{expression::PyExpr, sql::exceptions::py_type_err}; -#[pyclass(name = "Limit", module = "dask_planner", subclass)] +#[pyclass(name = "Limit", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyLimit { limit: Limit, diff --git a/src/sql/logical/predict_model.rs b/src/sql/logical/predict_model.rs index e8d723d2c..3f68ffdb4 100644 --- a/src/sql/logical/predict_model.rs +++ b/src/sql/logical/predict_model.rs @@ -89,7 +89,7 @@ impl UserDefinedLogicalNode for PredictModelPlanNode { } } -#[pyclass(name = "PredictModel", module = "dask_planner", subclass)] +#[pyclass(name = "PredictModel", module = "dask_sql", subclass)] pub struct PyPredictModel { pub(crate) predict_model: PredictModelPlanNode, } diff --git a/src/sql/logical/projection.rs b/src/sql/logical/projection.rs index 99ed0d684..b954d3b71 100644 --- a/src/sql/logical/projection.rs +++ b/src/sql/logical/projection.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::{expression::PyExpr, sql::exceptions::py_type_err}; -#[pyclass(name = "Projection", module = "dask_planner", subclass)] +#[pyclass(name = "Projection", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyProjection { pub(crate) projection: Projection, diff --git a/src/sql/logical/repartition_by.rs b/src/sql/logical/repartition_by.rs index e931b88e7..687958571 100644 --- a/src/sql/logical/repartition_by.rs +++ b/src/sql/logical/repartition_by.rs @@ -10,7 +10,7 @@ use crate::{ sql::{exceptions::py_type_err, logical}, }; -#[pyclass(name = "RepartitionBy", module = "dask_planner", subclass)] +#[pyclass(name = "RepartitionBy", module = "dask_sql", subclass)] pub struct PyRepartitionBy { pub(crate) repartition: Repartition, } diff --git a/src/sql/logical/show_columns.rs b/src/sql/logical/show_columns.rs index adfb584ef..cdd844127 100644 --- a/src/sql/logical/show_columns.rs +++ b/src/sql/logical/show_columns.rs @@ -92,7 +92,7 @@ impl UserDefinedLogicalNode for ShowColumnsPlanNode { } } -#[pyclass(name = "ShowColumns", module = "dask_planner", subclass)] +#[pyclass(name = "ShowColumns", module = "dask_sql", subclass)] pub struct PyShowColumns { pub(crate) show_columns: ShowColumnsPlanNode, } diff --git a/src/sql/logical/show_models.rs b/src/sql/logical/show_models.rs index 026a179a5..a228769de 100644 --- a/src/sql/logical/show_models.rs +++ b/src/sql/logical/show_models.rs @@ -85,7 +85,7 @@ impl UserDefinedLogicalNode for ShowModelsPlanNode { } } -#[pyclass(name = "ShowModels", module = "dask_planner", subclass)] +#[pyclass(name = "ShowModels", module = "dask_sql", subclass)] pub struct PyShowModels { pub(crate) show_models: ShowModelsPlanNode, } diff --git a/src/sql/logical/show_schemas.rs b/src/sql/logical/show_schemas.rs index 3e3ed4783..454afb51d 100644 --- a/src/sql/logical/show_schemas.rs +++ b/src/sql/logical/show_schemas.rs @@ -91,7 +91,7 @@ impl UserDefinedLogicalNode for ShowSchemasPlanNode { } } -#[pyclass(name = "ShowSchema", module = "dask_planner", subclass)] +#[pyclass(name = "ShowSchema", module = "dask_sql", subclass)] pub struct PyShowSchema { pub(crate) show_schema: ShowSchemasPlanNode, } diff --git a/src/sql/logical/show_tables.rs b/src/sql/logical/show_tables.rs index 987f2546e..c01022828 100644 --- a/src/sql/logical/show_tables.rs +++ b/src/sql/logical/show_tables.rs @@ -95,7 +95,7 @@ impl UserDefinedLogicalNode for ShowTablesPlanNode { } } -#[pyclass(name = "ShowTables", module = "dask_planner", subclass)] +#[pyclass(name = "ShowTables", module = "dask_sql", subclass)] pub struct PyShowTables { pub(crate) show_tables: ShowTablesPlanNode, } diff --git a/src/sql/logical/sort.rs b/src/sql/logical/sort.rs index 9abcd3906..5a1f862a1 100644 --- a/src/sql/logical/sort.rs +++ b/src/sql/logical/sort.rs @@ -6,7 +6,7 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "Sort", module = "dask_planner", subclass)] +#[pyclass(name = "Sort", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PySort { sort: Sort, diff --git a/src/sql/logical/subquery_alias.rs b/src/sql/logical/subquery_alias.rs index 003e02045..e98c78203 100644 --- a/src/sql/logical/subquery_alias.rs +++ b/src/sql/logical/subquery_alias.rs @@ -3,7 +3,7 @@ use pyo3::prelude::*; use crate::sql::exceptions::py_type_err; -#[pyclass(name = "SubqueryAlias", module = "dask_planner", subclass)] +#[pyclass(name = "SubqueryAlias", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PySubqueryAlias { subquery_alias: SubqueryAlias, diff --git a/src/sql/logical/table_scan.rs b/src/sql/logical/table_scan.rs index 171e10400..1303f6474 100644 --- a/src/sql/logical/table_scan.rs +++ b/src/sql/logical/table_scan.rs @@ -12,7 +12,7 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "TableScan", module = "dask_planner", subclass)] +#[pyclass(name = "TableScan", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyTableScan { pub(crate) table_scan: TableScan, @@ -20,7 +20,7 @@ pub struct PyTableScan { } type FilterTuple = (String, String, Option>); -#[pyclass(name = "FilteredResult", module = "dask_planner", subclass)] +#[pyclass(name = "FilteredResult", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyFilteredResult { // Certain Expr(s) do not have supporting logic in pyarrow for IO filtering diff --git a/src/sql/logical/use_schema.rs b/src/sql/logical/use_schema.rs index 7c2206310..0f804ce7a 100644 --- a/src/sql/logical/use_schema.rs +++ b/src/sql/logical/use_schema.rs @@ -85,7 +85,7 @@ impl UserDefinedLogicalNode for UseSchemaPlanNode { } } -#[pyclass(name = "UseSchema", module = "dask_planner", subclass)] +#[pyclass(name = "UseSchema", module = "dask_sql", subclass)] pub struct PyUseSchema { pub(crate) use_schema: UseSchemaPlanNode, } diff --git a/src/sql/logical/window.rs b/src/sql/logical/window.rs index e104ccdb3..3dd9d8c0d 100644 --- a/src/sql/logical/window.rs +++ b/src/sql/logical/window.rs @@ -17,19 +17,19 @@ use crate::{ sql::exceptions::py_type_err, }; -#[pyclass(name = "Window", module = "dask_planner", subclass)] +#[pyclass(name = "Window", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyWindow { window: Window, } -#[pyclass(name = "WindowFrame", module = "dask_planner", subclass)] +#[pyclass(name = "WindowFrame", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyWindowFrame { window_frame: WindowFrame, } -#[pyclass(name = "WindowFrameBound", module = "dask_planner", subclass)] +#[pyclass(name = "WindowFrameBound", module = "dask_sql", subclass)] #[derive(Clone)] pub struct PyWindowFrameBound { frame_bound: WindowFrameBound, diff --git a/src/sql/schema.rs b/src/sql/schema.rs index 0975391f4..804db700f 100644 --- a/src/sql/schema.rs +++ b/src/sql/schema.rs @@ -6,7 +6,7 @@ use pyo3::prelude::*; use super::types::PyDataType; use crate::sql::{function::DaskFunction, table}; -#[pyclass(name = "DaskSchema", module = "dask_planner", subclass)] +#[pyclass(name = "DaskSchema", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskSchema { #[pyo3(get, set)] diff --git a/src/sql/statement.rs b/src/sql/statement.rs index f8fabc109..40fc9f268 100644 --- a/src/sql/statement.rs +++ b/src/sql/statement.rs @@ -2,7 +2,7 @@ use pyo3::prelude::*; use crate::parser::DaskStatement; -#[pyclass(name = "Statement", module = "dask_planner", subclass)] +#[pyclass(name = "Statement", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyStatement { pub statement: DaskStatement, diff --git a/src/sql/table.rs b/src/sql/table.rs index 47d1b6403..1c2585bef 100644 --- a/src/sql/table.rs +++ b/src/sql/table.rs @@ -90,7 +90,7 @@ fn is_supported_push_down_expr(_expr: &Expr) -> bool { true } -#[pyclass(name = "DaskStatistics", module = "dask_planner", subclass)] +#[pyclass(name = "DaskStatistics", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskStatistics { row_count: f64, @@ -109,7 +109,7 @@ impl DaskStatistics { } } -#[pyclass(name = "DaskTable", module = "dask_planner", subclass)] +#[pyclass(name = "DaskTable", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskTable { pub(crate) schema_name: Option, diff --git a/src/sql/types/rel_data_type.rs b/src/sql/types/rel_data_type.rs index 1ae3646b0..59cb0fb7c 100644 --- a/src/sql/types/rel_data_type.rs +++ b/src/sql/types/rel_data_type.rs @@ -8,7 +8,7 @@ const PRECISION_NOT_SPECIFIED: i32 = i32::MIN; const SCALE_NOT_SPECIFIED: i32 = -1; /// RelDataType represents the type of a scalar expression or entire row returned from a relational expression. -#[pyclass(name = "RelDataType", module = "dask_planner", subclass)] +#[pyclass(name = "RelDataType", module = "dask_sql", subclass)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct RelDataType { nullable: bool, diff --git a/src/sql/types/rel_data_type_field.rs b/src/sql/types/rel_data_type_field.rs index 13f036d0e..3694d0bce 100644 --- a/src/sql/types/rel_data_type_field.rs +++ b/src/sql/types/rel_data_type_field.rs @@ -12,7 +12,7 @@ use crate::{ }; /// RelDataTypeField represents the definition of a field in a structured RelDataType. -#[pyclass(name = "RelDataTypeField", module = "dask_planner", subclass)] +#[pyclass(name = "RelDataTypeField", module = "dask_sql", subclass)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct RelDataTypeField { qualifier: Option, diff --git a/tests/unit/test_mapping.py b/tests/unit/test_mapping.py index 952bcb10e..8cb155db7 100644 --- a/tests/unit/test_mapping.py +++ b/tests/unit/test_mapping.py @@ -3,10 +3,11 @@ import numpy as np import pandas as pd import pytest -from dask_planner import SqlTypeName from dask_sql.mappings import python_to_sql_type, similar_type, sql_to_python_value +from ._internal import SqlTypeName + def test_python_to_sql(): assert str(python_to_sql_type(np.dtype("int32"))) == "INTEGER" From 7d1be9218263ea67d31068b6e9877cebf06f9270 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 17 Jul 2023 13:43:24 -0400 Subject: [PATCH 17/89] Adjust native library name from _.internal to dask_planner --- Cargo.toml | 1 + dask_sql/context.py | 3 +-- dask_sql/input_utils/hive.py | 3 +-- dask_sql/mappings.py | 3 +-- dask_sql/physical/rel/base.py | 4 ++-- dask_sql/physical/rel/convert.py | 4 ++-- dask_sql/physical/rel/custom/alter.py | 4 ++-- dask_sql/physical/rel/custom/analyze_table.py | 4 ++-- dask_sql/physical/rel/custom/create_catalog_schema.py | 4 ++-- dask_sql/physical/rel/custom/create_memory_table.py | 4 ++-- dask_sql/physical/rel/custom/create_table.py | 4 ++-- dask_sql/physical/rel/custom/describe_model.py | 4 ++-- dask_sql/physical/rel/custom/distributeby.py | 4 ++-- dask_sql/physical/rel/custom/drop_schema.py | 4 ++-- dask_sql/physical/rel/custom/export_model.py | 4 ++-- dask_sql/physical/rel/custom/predict_model.py | 4 ++-- dask_sql/physical/rel/custom/show_columns.py | 4 ++-- dask_sql/physical/rel/custom/show_models.py | 4 ++-- dask_sql/physical/rel/custom/show_schemas.py | 4 ++-- dask_sql/physical/rel/custom/show_tables.py | 4 ++-- dask_sql/physical/rel/custom/use_schema.py | 4 ++-- dask_sql/physical/rel/logical/aggregate.py | 4 ++-- dask_sql/physical/rel/logical/cross_join.py | 4 ++-- dask_sql/physical/rel/logical/empty.py | 4 ++-- dask_sql/physical/rel/logical/explain.py | 4 ++-- dask_sql/physical/rel/logical/filter.py | 4 ++-- dask_sql/physical/rel/logical/join.py | 4 ++-- dask_sql/physical/rel/logical/limit.py | 4 ++-- dask_sql/physical/rel/logical/project.py | 8 ++++---- dask_sql/physical/rel/logical/sort.py | 4 ++-- dask_sql/physical/rel/logical/subquery_alias.py | 4 ++-- dask_sql/physical/rel/logical/table_scan.py | 4 ++-- dask_sql/physical/rel/logical/union.py | 4 ++-- dask_sql/physical/rel/logical/window.py | 4 ++-- dask_sql/physical/rex/base.py | 2 +- dask_sql/physical/rex/convert.py | 4 ++-- dask_sql/physical/rex/core/alias.py | 4 ++-- dask_sql/physical/rex/core/call.py | 7 +++---- dask_sql/physical/rex/core/input_ref.py | 4 ++-- dask_sql/physical/rex/core/literal.py | 7 +++---- dask_sql/physical/rex/core/subquery.py | 4 ++-- dask_sql/utils.py | 3 +-- pyproject.toml | 2 +- src/lib.rs | 4 ++-- tests/unit/test_mapping.py | 3 +-- 45 files changed, 86 insertions(+), 92 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 11444b09c..465472c11 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,6 +7,7 @@ readme = "README.md" license = "Apache-2.0" edition = "2021" rust-version = "1.65" +include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"] [lib] name = "dask_sql" diff --git a/dask_sql/context.py b/dask_sql/context.py index fb97ad47c..03947fa76 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -9,8 +9,7 @@ from dask import config as dask_config from dask.base import optimize from dask.utils_test import hlg_layer - -from ._internal import ( +from dask_planner import ( DaskSchema, DaskSQLContext, DaskTable, diff --git a/dask_sql/input_utils/hive.py b/dask_sql/input_utils/hive.py index fb1117289..5d500180d 100644 --- a/dask_sql/input_utils/hive.py +++ b/dask_sql/input_utils/hive.py @@ -5,8 +5,7 @@ from typing import Any, Union import dask.dataframe as dd - -from ._internal import SqlTypeName +from dask_planner import SqlTypeName try: from pyhive import hive diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index f0d9c74d1..3d39ee392 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -7,8 +7,7 @@ import dask.dataframe as dd import numpy as np import pandas as pd - -from ._internal import DaskTypeMap, SqlTypeName +from dask_planner import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index ce28aeb28..f4463fe62 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -7,9 +7,9 @@ from dask_sql.mappings import cast_column_type, sql_to_python_type if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan, RelDataType - from ._internal import LogicalPlan, RelDataType + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index 6a17ac94d..24b06c337 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -7,9 +7,9 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/alter.py b/dask_sql/physical/rel/custom/alter.py index f8e92671d..16ed9e9bb 100644 --- a/dask_sql/physical/rel/custom/alter.py +++ b/dask_sql/physical/rel/custom/alter.py @@ -6,9 +6,9 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class AlterSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/analyze_table.py b/dask_sql/physical/rel/custom/analyze_table.py index e42c0c229..77edfff4b 100644 --- a/dask_sql/physical/rel/custom/analyze_table.py +++ b/dask_sql/physical/rel/custom/analyze_table.py @@ -8,9 +8,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class AnalyzeTablePlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/create_catalog_schema.py b/dask_sql/physical/rel/custom/create_catalog_schema.py index 1a28edd8c..74f964621 100644 --- a/dask_sql/physical/rel/custom/create_catalog_schema.py +++ b/dask_sql/physical/rel/custom/create_catalog_schema.py @@ -4,9 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_memory_table.py b/dask_sql/physical/rel/custom/create_memory_table.py index 32d4d1d8b..8c8c945ff 100644 --- a/dask_sql/physical/rel/custom/create_memory_table.py +++ b/dask_sql/physical/rel/custom/create_memory_table.py @@ -5,9 +5,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_table.py b/dask_sql/physical/rel/custom/create_table.py index 0c4807d91..526ec9728 100644 --- a/dask_sql/physical/rel/custom/create_table.py +++ b/dask_sql/physical/rel/custom/create_table.py @@ -6,9 +6,9 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/describe_model.py b/dask_sql/physical/rel/custom/describe_model.py index 931930b1b..8b2e144ff 100644 --- a/dask_sql/physical/rel/custom/describe_model.py +++ b/dask_sql/physical/rel/custom/describe_model.py @@ -7,9 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class DescribeModelPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/distributeby.py b/dask_sql/physical/rel/custom/distributeby.py index e45623038..6b6dba0b8 100644 --- a/dask_sql/physical/rel/custom/distributeby.py +++ b/dask_sql/physical/rel/custom/distributeby.py @@ -6,9 +6,9 @@ from dask_sql.utils import LoggableDataFrame if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/drop_schema.py b/dask_sql/physical/rel/custom/drop_schema.py index 9df844398..455b27fa4 100644 --- a/dask_sql/physical/rel/custom/drop_schema.py +++ b/dask_sql/physical/rel/custom/drop_schema.py @@ -4,9 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/export_model.py b/dask_sql/physical/rel/custom/export_model.py index e3743406b..c96d19786 100644 --- a/dask_sql/physical/rel/custom/export_model.py +++ b/dask_sql/physical/rel/custom/export_model.py @@ -6,9 +6,9 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/predict_model.py b/dask_sql/physical/rel/custom/predict_model.py index e5866948b..c0339b1d7 100644 --- a/dask_sql/physical/rel/custom/predict_model.py +++ b/dask_sql/physical/rel/custom/predict_model.py @@ -9,9 +9,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/show_columns.py b/dask_sql/physical/rel/custom/show_columns.py index 8a2ee0306..a11d05c94 100644 --- a/dask_sql/physical/rel/custom/show_columns.py +++ b/dask_sql/physical/rel/custom/show_columns.py @@ -8,9 +8,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class ShowColumnsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_models.py b/dask_sql/physical/rel/custom/show_models.py index 64c656ad8..ecc81e82a 100644 --- a/dask_sql/physical/rel/custom/show_models.py +++ b/dask_sql/physical/rel/custom/show_models.py @@ -7,9 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class ShowModelsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_schemas.py b/dask_sql/physical/rel/custom/show_schemas.py index 0f3bfdf7e..d49d3708b 100644 --- a/dask_sql/physical/rel/custom/show_schemas.py +++ b/dask_sql/physical/rel/custom/show_schemas.py @@ -7,9 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class ShowSchemasPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_tables.py b/dask_sql/physical/rel/custom/show_tables.py index 05b899949..85dc3687d 100644 --- a/dask_sql/physical/rel/custom/show_tables.py +++ b/dask_sql/physical/rel/custom/show_tables.py @@ -7,9 +7,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class ShowTablesPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/use_schema.py b/dask_sql/physical/rel/custom/use_schema.py index 9186049f9..563415c2d 100644 --- a/dask_sql/physical/rel/custom/use_schema.py +++ b/dask_sql/physical/rel/custom/use_schema.py @@ -4,9 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class UseSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index f228bd16c..27f5c102c 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -15,9 +15,9 @@ from dask_sql.utils import is_cudf_type, new_temporary_column if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/cross_join.py b/dask_sql/physical/rel/logical/cross_join.py index 94690e0bc..dfa8cdf3c 100644 --- a/dask_sql/physical/rel/logical/cross_join.py +++ b/dask_sql/physical/rel/logical/cross_join.py @@ -6,9 +6,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/empty.py b/dask_sql/physical/rel/logical/empty.py index 202743a7b..b50699b79 100644 --- a/dask_sql/physical/rel/logical/empty.py +++ b/dask_sql/physical/rel/logical/empty.py @@ -8,9 +8,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/explain.py b/dask_sql/physical/rel/logical/explain.py index 4afd6870b..abf1d814c 100644 --- a/dask_sql/physical/rel/logical/explain.py +++ b/dask_sql/physical/rel/logical/explain.py @@ -3,9 +3,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class ExplainPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index 58704ae5a..a37e390ec 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -11,9 +11,9 @@ from dask_sql.physical.utils.filter import attempt_predicate_pushdown if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index ea5cfd4c2..cec7df4d9 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -17,9 +17,9 @@ from dask_sql.utils import is_cudf_type if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index efb07a073..00ba37fa2 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -11,9 +11,9 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class DaskLimitPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/project.py b/dask_sql/physical/rel/logical/project.py index d4b41c046..4630b5d6b 100644 --- a/dask_sql/physical/rel/logical/project.py +++ b/dask_sql/physical/rel/logical/project.py @@ -1,17 +1,17 @@ import logging from typing import TYPE_CHECKING +from dask_planner import RexType + from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter from dask_sql.utils import new_temporary_column -from ._internal import RexType - if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index 453c8895a..6dc57211c 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -5,9 +5,9 @@ from dask_sql.physical.utils.sort import apply_sort if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class DaskSortPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/subquery_alias.py b/dask_sql/physical/rel/logical/subquery_alias.py index ba82391f0..e82d9b105 100644 --- a/dask_sql/physical/rel/logical/subquery_alias.py +++ b/dask_sql/physical/rel/logical/subquery_alias.py @@ -4,9 +4,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql class SubqueryAlias(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index fa0e6b5bd..b3b5cab0a 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -11,9 +11,9 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/union.py b/dask_sql/physical/rel/logical/union.py index 04ca0d150..1fbc5b5ae 100644 --- a/dask_sql/physical/rel/logical/union.py +++ b/dask_sql/physical/rel/logical/union.py @@ -6,9 +6,9 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql def _extract_df(obj_cc, obj_df, output_field_names): diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index 793b71903..bbcdae740 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -16,9 +16,9 @@ from dask_sql.utils import LoggableDataFrame, new_temporary_column if TYPE_CHECKING: - import dask_sql + from dask_planner import LogicalPlan - from ._internal import LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/base.py b/dask_sql/physical/rex/base.py index 97692284b..7f97a70d9 100644 --- a/dask_sql/physical/rex/base.py +++ b/dask_sql/physical/rex/base.py @@ -7,7 +7,7 @@ from dask_sql.datacontainer import DataContainer if TYPE_CHECKING: - from ._internal import Expression, LogicalPlan + from dask_planner import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/convert.py b/dask_sql/physical/rex/convert.py index 6cba4db8c..fce64be30 100644 --- a/dask_sql/physical/rex/convert.py +++ b/dask_sql/physical/rex/convert.py @@ -8,9 +8,9 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/alias.py b/dask_sql/physical/rex/core/alias.py index 7821e8d74..d6ae20698 100644 --- a/dask_sql/physical/rex/core/alias.py +++ b/dask_sql/physical/rex/core/alias.py @@ -7,9 +7,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql class RexAliasPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index e06050823..56d01d006 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -13,6 +13,7 @@ from dask.dataframe.core import Series from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data +from dask_planner import SqlTypeName from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200 from dask_sql.datacontainer import DataContainer @@ -32,12 +33,10 @@ is_frame, ) -from ._internal import SqlTypeName - if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql logger = logging.getLogger(__name__) SeriesOrScalar = Union[dd.Series, Any] diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 57cb1bd1d..01bf871c7 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -6,9 +6,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql class RexInputRefPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 952c157aa..7fe59b383 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -4,17 +4,16 @@ import dask.dataframe as dd import numpy as np +from dask_planner import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin -from ._internal import SqlTypeName - if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/subquery.py b/dask_sql/physical/rex/core/subquery.py index 7afb74c3e..1253f257d 100644 --- a/dask_sql/physical/rex/core/subquery.py +++ b/dask_sql/physical/rex/core/subquery.py @@ -7,9 +7,9 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - import dask_sql + from dask_planner import Expression, LogicalPlan - from ._internal import Expression, LogicalPlan + import dask_sql class RexScalarSubqueryPlugin(BaseRexPlugin): diff --git a/dask_sql/utils.py b/dask_sql/utils.py index 6eed1ed29..c2cfe45ab 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -8,12 +8,11 @@ import dask.dataframe as dd import numpy as np import pandas as pd +from dask_planner import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value -from ._internal import SqlTypeName - logger = logging.getLogger(__name__) diff --git a/pyproject.toml b/pyproject.toml index 236b63350..0e7ff0578 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ license-files = ["LICENSE.txt"] find = {namespaces = false} [tool.maturin] -module-name = "dask_sql" +module-name = "dask_planner" include = [ { path = "Cargo.lock", format = "sdist" } ] diff --git a/src/lib.rs b/src/lib.rs index 1ced3e9d7..63879e2fb 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,7 @@ mod sql; /// The higher-level public API is defined in pure python files under the /// dask_planner directory. #[pymodule] -fn _internal(py: Python, m: &PyModule) -> PyResult<()> { +fn dask_planner(py: Python, m: &PyModule) -> PyResult<()> { // Initialize the global Python logger instance pyo3_log::init(); @@ -40,7 +40,7 @@ fn _internal(py: Python, m: &PyModule) -> PyResult<()> { py.get_type::(), )?; - debug!("dask_planner Python module loaded"); + debug!("dask_sql native library loaded"); Ok(()) } diff --git a/tests/unit/test_mapping.py b/tests/unit/test_mapping.py index 8cb155db7..952bcb10e 100644 --- a/tests/unit/test_mapping.py +++ b/tests/unit/test_mapping.py @@ -3,11 +3,10 @@ import numpy as np import pandas as pd import pytest +from dask_planner import SqlTypeName from dask_sql.mappings import python_to_sql_type, similar_type, sql_to_python_value -from ._internal import SqlTypeName - def test_python_to_sql(): assert str(python_to_sql_type(np.dtype("int32"))) == "INTEGER" From 83fb5c39fe9cdc51d589b109ae06bdf9311118ad Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:25:57 -0700 Subject: [PATCH 18/89] Resolve initial conda build issues --- continuous_integration/recipe/conda_build_config.yaml | 4 ++-- continuous_integration/recipe/meta.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/continuous_integration/recipe/conda_build_config.yaml b/continuous_integration/recipe/conda_build_config.yaml index b1c3c40cc..142300f28 100644 --- a/continuous_integration/recipe/conda_build_config.yaml +++ b/continuous_integration/recipe/conda_build_config.yaml @@ -4,5 +4,5 @@ rust_compiler_version: - 1.69 libprotobuf: - 3 -setuptools_rust: - - 1.5.2 +maturin: + - 0.15.3 diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 02e58d1fb..954825e1e 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -31,7 +31,7 @@ requirements: host: - pip - python - - setuptools-rust + - maturin - libprotobuf - zlib run: From c7bbbd7a62c51c8321b25a24d95ff7ec96d5cbf1 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:29:47 -0700 Subject: [PATCH 19/89] Replace setuptools-rust with maturin in CI --- .github/workflows/release.yml | 6 +++--- .github/workflows/test-upstream.yml | 1 - .github/workflows/test.yml | 1 - docker/conda.txt | 2 +- docker/main.dockerfile | 2 +- docs/environment.yml | 1 - docs/requirements-docs.txt | 2 +- 7 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 1ee1e6397..0cb3fccb8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -60,14 +60,14 @@ jobs: CARGO_NET_GIT_FETCH_WITH_CLI="true" PATH="$HOME/.cargo/bin:$HOME/.local/bin:$PATH" CIBW_ENVIRONMENT_WINDOWS: 'PATH="$UserProfile\.cargo\bin;$PATH"' - CIBW_BEFORE_BUILD: 'pip install -U setuptools-rust' + CIBW_BEFORE_BUILD: 'pip install -U "maturin>=0.15,<0.16"' CIBW_BEFORE_BUILD_LINUX: > ARCH=$([ $(uname -m) == x86_64 ] && echo x86_64 || echo aarch_64) && DOWNLOAD_URL=$(curl --retry 6 --retry-delay 10 -s https://api.github.com/repos/protocolbuffers/protobuf/releases/latest | grep -o '"browser_download_url": "[^"]*' | cut -d'"' -f4 | grep "\linux-${ARCH}.zip$") && curl --retry 6 --retry-delay 10 -LO $DOWNLOAD_URL && unzip protoc-*-linux-$ARCH.zip -d $HOME/.local && protoc --version && - pip install -U setuptools-rust && + pip install -U "maturin>=0.15,<0.16" && pip list && curl --retry 6 --retry-delay 10 https://sh.rustup.rs -sSf | sh -s -- --default-toolchain=stable --profile=minimal -y && rustup show @@ -127,7 +127,7 @@ jobs: channel-priority: strict - name: Build source distribution run: | - mamba install setuptools-rust twine + mamba install "maturin>=0.15,<0.16" twine python setup.py sdist - name: Check dist files diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index ff0296b15..10eb032ad 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -126,7 +126,6 @@ jobs: bash update-dependencies.sh - name: Install dependencies and nothing else run: | - mamba install setuptools-rust pip install -e . -vv which python diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index be2d98126..2bd043b34 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -116,7 +116,6 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Install dependencies and nothing else run: | - mamba install "setuptools-rust>=1.5.2" pip install -e . -vv which python diff --git a/docker/conda.txt b/docker/conda.txt index d24d217aa..c0f185948 100644 --- a/docker/conda.txt +++ b/docker/conda.txt @@ -21,4 +21,4 @@ intake>=0.6.0 pre-commit>=2.11.1 black=22.10.0 isort=5.12.0 -setuptools-rust>=1.5.2 +maturin>=0.15,<0.16 diff --git a/docker/main.dockerfile b/docker/main.dockerfile index da965a53c..ee0ab8c30 100644 --- a/docker/main.dockerfile +++ b/docker/main.dockerfile @@ -14,7 +14,7 @@ ENV PATH="/root/.cargo/bin:${PATH}" COPY docker/conda.txt /opt/dask_sql/ RUN mamba install -y \ # build requirements - "setuptools-rust>=1.5.2" \ + "maturin>=0.15,<0.16" \ # core dependencies "dask>=2022.3.0" \ "pandas>=1.4.0" \ diff --git a/docs/environment.yml b/docs/environment.yml index 96a727465..8d6f0714f 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -17,6 +17,5 @@ dependencies: - prompt_toolkit>=3.0.8 - pygments>=2.7.1 - tabulate - - setuptools-rust>=1.5.2 - ucx-proc=*=cpu - rust>=1.65.0 diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index c9d8c6b0e..6ddeb3028 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -11,4 +11,4 @@ tzlocal>=2.1 prompt_toolkit>=3.0.8 pygments>=2.7.1 tabulate -setuptools-rust>=1.5.2 +maturin>=0.15,<0.16 From 6dc634758da45a701cc84836881e3a593de935e6 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:40:58 -0700 Subject: [PATCH 20/89] Constrain maturin, remove setuptools-rust from CI envs --- continuous_integration/environment-3.10-dev.yaml | 3 +-- continuous_integration/environment-3.8-dev.yaml | 3 +-- continuous_integration/environment-3.9-dev.yaml | 3 +-- continuous_integration/gpuci/environment-3.10.yaml | 3 +-- continuous_integration/gpuci/environment-3.9.yaml | 3 +-- 5 files changed, 5 insertions(+), 10 deletions(-) diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index a867996d1..cf35db316 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -11,7 +11,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.12.8 +- maturin>=0.15,<0.16 - mlflow - mock - numpy>=1.21.6 @@ -28,7 +28,6 @@ dependencies: - pytest - python=3.10 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 - tpot>=0.12.0 diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.8-dev.yaml index 18b478472..4d737591b 100644 --- a/continuous_integration/environment-3.8-dev.yaml +++ b/continuous_integration/environment-3.8-dev.yaml @@ -10,7 +10,7 @@ dependencies: - intake=0.6.0 - jsonschema - lightgbm -- maturin=0.12.8 +- maturin=0.15 - mlflow - mock - numpy=1.21.6 @@ -27,7 +27,6 @@ dependencies: - pytest - python=3.8 - scikit-learn=1.0.0 -- setuptools-rust=1.5.2 - sphinx - sqlalchemy<2 - tpot>=0.12.0 diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index 7424529d6..ace64cb75 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -11,7 +11,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.12.8 +- maturin>=0.15,<0.16 - mlflow - mock - numpy>=1.21.6 @@ -28,7 +28,6 @@ dependencies: - pytest - python=3.9 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 - tpot>=0.12.0 diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 2467e144a..b0332dc4e 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -14,7 +14,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.12.8 +- maturin>=0.15,<0.16 - mlflow - mock - numpy>=1.21.6 @@ -31,7 +31,6 @@ dependencies: - pytest - python=3.10 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 - tpot>=0.12.0 diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 917892f24..7b12c8cbe 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -14,7 +14,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.12.8 +- maturin>=0.15,<0.16 - mlflow - mock - numpy>=1.21.6 @@ -31,7 +31,6 @@ dependencies: - pytest - python=3.9 - scikit-learn>=1.0.0 -- setuptools-rust>=1.5.2 - sphinx - sqlalchemy<2 - tpot>=0.12.0 From 6dcf5e055cd0919f6ce3128d32cd4369062d43fe Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:44:41 -0700 Subject: [PATCH 21/89] Update docs and Rust CI --- .github/CODEOWNERS | 5 ++++- .github/workflows/conda.yml | 7 +++---- .github/workflows/release.yml | 2 +- .github/workflows/test-upstream.yml | 2 -- CONTRIBUTING.md | 20 ++++++++++---------- README.md | 2 +- docs/source/how_does_it_work.rst | 2 +- 7 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 527d01fa2..1ff63a673 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,4 +2,7 @@ * @ayushdg @charlesbluca @galipremsagar # rust codeowners -dask_planner/ @ayushdg @charlesbluca @galipremsagar @jdye64 +.cargo/ @ayushdg @charlesbluca @galipremsagar @jdye64 +src/ @ayushdg @charlesbluca @galipremsagar @jdye64 +Cargo.toml @ayushdg @charlesbluca @galipremsagar @jdye64 +Cargo.lock @ayushdg @charlesbluca @galipremsagar @jdye64 diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index 78253db6b..d67798646 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -6,10 +6,9 @@ on: pull_request: paths: - setup.py - - dask_planner/Cargo.toml - - dask_planner/Cargo.lock - - dask_planner/pyproject.toml - - dask_planner/rust-toolchain.toml + - Cargo.toml + - Cargo.lock + - pyproject.toml - continuous_integration/recipe/** - .github/workflows/conda.yml schedule: diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 0cb3fccb8..7a837af3b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -74,7 +74,7 @@ jobs: with: package-dir: . output-dir: dist - config-file: "dask_planner/pyproject.toml" + config-file: "pyproject.toml" - name: Set up Python uses: conda-incubator/setup-miniconda@v2.2.0 with: diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index 10eb032ad..df361bb49 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -68,7 +68,6 @@ jobs: - name: Optionally update upstream cargo dependencies if: env.which_upstream == 'DataFusion' run: | - cd dask_planner bash update-dependencies.sh - name: Build the Rust DataFusion bindings run: | @@ -122,7 +121,6 @@ jobs: env: UPDATE_ALL_CARGO_DEPS: false run: | - cd dask_planner bash update-dependencies.sh - name: Install dependencies and nothing else run: | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 9ab31230f..a6cd56c59 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -49,7 +49,7 @@ Note that while `setuptools-rust` is used by CI and should be used during your d Building Dask-SQL is straightforward with Python. To build run ```python setup.py install```. This will build both the Rust and Python codebase and install it into your locally activated conda environment. While not required, if you have updated dependencies for Rust you might prefer a clean build. To clean your setup run ```python setup.py clean``` and then run ```python setup.py install``` #### DataFusion Modules -DataFusion is broken down into a few modules. We consume those modules in our [Cargo.toml](dask_planner/Cargo.toml). The modules that we use currently are +DataFusion is broken down into a few modules. We consume those modules in our [Cargo.toml](Cargo.toml). The modules that we use currently are - `datafusion-common` - Datastructures and core logic - `datafusion-expr` - Expression based logic and operators @@ -57,7 +57,7 @@ DataFusion is broken down into a few modules. We consume those modules in our [C - `datafusion-optimizer` - Optimization logic and datastructures for modifying current plans into more efficient ones. #### Retrieving Upstream Dependencies -During development you might find yourself needing some upstream DataFusion changes not present in the projects current version. Luckily this can easily be achieved by updating [Cargo.toml](dask_planner/Cargo.toml) and changing the `rev` to the SHA of the version you need. Note that the same SHA should be used for all DataFusion modules. +During development you might find yourself needing some upstream DataFusion changes not present in the projects current version. Luckily this can easily be achieved by updating [Cargo.toml](Cargo.toml) and changing the `rev` to the SHA of the version you need. Note that the same SHA should be used for all DataFusion modules. After updating the `Cargo.toml` file the codebase can be re-built to reflect those changes by running `python setup.py install` @@ -72,40 +72,40 @@ Sometimes when building against the latest Github commits for DataFusion you may ### Datastructures While working in the Rust codebase there are a few datastructures that you should make yourself familiar with. This section does not aim to verbosely list out all of the datastructure with in the project but rather just the key datastructures that you are likely to encounter while working on almost any feature/issue. The aim is to give you a better overview of the codebase without having to manually dig through the all the source code. -- [`PyLogicalPlan`](dask_planner/src/sql/logical.rs) -> [DataFusion LogicalPlan](https://docs.rs/datafusion/latest/datafusion/logical_plan/enum.LogicalPlan.html) +- [`PyLogicalPlan`](src/sql/logical.rs) -> [DataFusion LogicalPlan](https://docs.rs/datafusion/latest/datafusion/logical_plan/enum.LogicalPlan.html) - Often encountered in Python code with variable name `rel` - Python serializable umbrella representation of the entire LogicalPlan that was generated by DataFusion - Provides access to `DaskTable` instances and type information for each table - Access to individual nodes in the logical plan tree. Ex: `TableScan` -- [`DaskSQLContext`](dask_planner/src/sql.rs) +- [`DaskSQLContext`](src/sql.rs) - Analogous to Python `Context` - Contains metadata about the tables, schemas, functions, operators, and configurations that are persent within the current execution context - When adding custom functions/UDFs this is the location that you would register them - Entry point for parsing SQL strings to sql node trees. This is the location Python will begin its interactions with Rust -- [`PyExpr`](dask_planner/src/expression.rs) -> [DataFusion Expr](https://docs.rs/datafusion/latest/datafusion/prelude/enum.Expr.html) +- [`PyExpr`](src/expression.rs) -> [DataFusion Expr](https://docs.rs/datafusion/latest/datafusion/prelude/enum.Expr.html) - Arguably where most of your time will be spent - Represents a single node in sql tree. Ex: `avg(age)` from `SELECT avg(age) FROM people` - Is associate with a single `RexType` - Can contain literal values or represent function calls, `avg()` for example - The expressions "index" in the tree can be retrieved by calling `PyExpr.index()` on an instance. This is useful when mapping frontend column names in Dask code to backend Dataframe columns - Certain `PyExpr`s contain operands. Ex: `2 + 2` would contain 3 operands. 1) A literal `PyExpr` instance with value 2 2) Another literal `PyExpr` instance with a value of 2. 3) A `+` `PyExpr` representing the addition of the 2 literals. -- [`DaskSqlOptimizer`](dask_planner/src/sql/optimizer.rs) +- [`DaskSqlOptimizer`](src/sql/optimizer.rs) - Registering location for all Dask-SQL specific logical plan optimizations - Optimizations that are written either custom or use from another source, DataFusion, are registered here in the order they are wished to be executed - Represents functions that modify/convert an original `PyLogicalPlan` into another `PyLogicalPlan` that would be more efficient when running in the underlying Dask framework -- [`RelDataType`](dask_planner/src/sql/types/rel_data_type.rs) +- [`RelDataType`](src/sql/types/rel_data_type.rs) - Not a fan of this name, was chosen to match existing Calcite logic - Represents a "row" in a table - Contains a list of "columns" that are present in that row - - [RelDataTypeField](dask_planner/src/sql/types/rel_data_type_field.rs) -- [RelDataTypeField](dask_planner/src/sql/types/rel_data_type_field.rs) + - [RelDataTypeField](src/sql/types/rel_data_type_field.rs) +- [RelDataTypeField](src/sql/types/rel_data_type_field.rs) - Represents an individual column in a table - Contains: - `qualifier` - schema the field belongs to - `name` - name of the column/field - `data_type` - `DaskTypeMap` instance containing information about the SQL type and underlying Arrow DataType - `index` - location of the field in the LogicalPlan -- [DaskTypeMap](dask_planner/src/sql/types.rs) +- [DaskTypeMap](src/sql/types.rs) - Maps a conventional SQL type to an underlying Arrow DataType diff --git a/README.md b/README.md index e978fadf8..ac27aea33 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ After that, you can install the package in development mode pip install -e ".[dev]" The Rust DataFusion bindings are built as part of the `pip install`. -If changes are made to the Rust source in `dask_planner/`, another build/install must be run to recompile the bindings: +If changes are made to the Rust source in `src/`, another build/install must be run to recompile the bindings: python setup.py build install diff --git a/docs/source/how_does_it_work.rst b/docs/source/how_does_it_work.rst index 32c736431..67d2eab01 100644 --- a/docs/source/how_does_it_work.rst +++ b/docs/source/how_does_it_work.rst @@ -22,7 +22,7 @@ No matter of via the Python API (:ref:`api`), the command line client (:ref:`cmd This function will first give the SQL string to the dask_planner Rust crate via the ``PyO3`` library. Inside this crate, Apache Arrow DataFusion is used to first parse the SQL string and then turn it into a relational algebra. For this, DataFusion uses the SQL language description specified in the `sqlparser-rs library `_ -We also include `SQL extensions specific to Dask-SQL `_. They specify custom language features, such as the ``CREATE MODEL`` statement. +We also include `SQL extensions specific to Dask-SQL `_. They specify custom language features, such as the ``CREATE MODEL`` statement. 3. SQL is (maybe) optimized --------------------------- From b7c02c91333dc02c7e8d6becafd72b30e4fa1e02 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:53:47 -0700 Subject: [PATCH 22/89] Remove more dask_planner appearances --- .github/workflows/rust.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 7e983172b..a9eeab1ab 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -51,7 +51,6 @@ jobs: - name: Optionally update upstream dependencies if: needs.detect-ci-trigger.outputs.triggered == 'true' run: | - cd dask_planner bash update-dependencies.sh - name: Install Protoc uses: arduino/setup-protoc@v1 @@ -60,11 +59,9 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Check workspace in debug mode run: | - cd dask_planner cargo check - name: Check workspace in release mode run: | - cd dask_planner cargo check --release # test the crate @@ -84,7 +81,6 @@ jobs: - name: Optionally update upstream dependencies if: needs.detect-ci-trigger.outputs.triggered == 'true' run: | - cd dask_planner bash update-dependencies.sh - name: Install Protoc uses: arduino/setup-protoc@v1 @@ -93,5 +89,4 @@ jobs: repo-token: ${{ secrets.GITHUB_TOKEN }} - name: Run tests run: | - cd dask_planner cargo test From a3e1a6838b71ecf2b1e9b317cf725c6aeb8ae748 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 12:33:07 -0700 Subject: [PATCH 23/89] Bump pyarrow min version to resolve 3.8 conflicts --- continuous_integration/environment-3.10-dev.yaml | 2 +- continuous_integration/environment-3.8-dev.yaml | 2 +- continuous_integration/environment-3.9-dev.yaml | 2 +- continuous_integration/gpuci/environment-3.10.yaml | 2 +- continuous_integration/gpuci/environment-3.9.yaml | 2 +- continuous_integration/recipe/meta.yaml | 1 + docker/conda.txt | 4 ++-- docker/main.dockerfile | 2 +- pyproject.toml | 2 +- 9 files changed, 10 insertions(+), 9 deletions(-) diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index cf35db316..8d0710ec2 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -19,7 +19,7 @@ dependencies: - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.8-dev.yaml index 4d737591b..2fd4ddad3 100644 --- a/continuous_integration/environment-3.8-dev.yaml +++ b/continuous_integration/environment-3.8-dev.yaml @@ -18,7 +18,7 @@ dependencies: - pre-commit - prompt_toolkit=3.0.8 - psycopg2 -- pyarrow=6.0.1 +- pyarrow=6.0.2 - pygments=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index ace64cb75..67cf0277d 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -19,7 +19,7 @@ dependencies: - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index b0332dc4e..297c7572a 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -22,7 +22,7 @@ dependencies: - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 7b12c8cbe..c8600fcfb 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -22,7 +22,7 @@ dependencies: - pre-commit - prompt_toolkit>=3.0.8 - psycopg2 -- pyarrow>=6.0.1 +- pyarrow>=6.0.2 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 954825e1e..625a071c4 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -24,6 +24,7 @@ requirements: build: - python # [build_platform != target_platform] - cross-python_{{ target_platform }} # [build_platform != target_platform] + - maturin # [build_platform != target_platform] - libprotobuf # [build_platform != target_platform] - zlib # [build_platform != target_platform] - {{ compiler('c') }} diff --git a/docker/conda.txt b/docker/conda.txt index c0f185948..7f0e8d91a 100644 --- a/docker/conda.txt +++ b/docker/conda.txt @@ -4,7 +4,7 @@ pandas>=1.4.0 jpype1>=1.0.2 openjdk>=8 maven>=3.6.0 -pytest>=6.0.1 +pytest>=6.0.2 pytest-cov>=2.10.1 pytest-xdist mock>=4.0.3 @@ -13,7 +13,7 @@ tzlocal>=2.1 # FIXME: handling is needed for httpx-based fastapi>=0.87.0 fastapi>=0.69.0,<0.87.0 uvicorn>=0.13.4 -pyarrow>=6.0.1 +pyarrow>=6.0.2 prompt_toolkit>=3.0.8 pygments>=2.7.1 scikit-learn>=1.0.0 diff --git a/docker/main.dockerfile b/docker/main.dockerfile index ee0ab8c30..2a252e1f5 100644 --- a/docker/main.dockerfile +++ b/docker/main.dockerfile @@ -26,7 +26,7 @@ RUN mamba install -y \ "pygments>=2.7.1" \ tabulate \ # additional dependencies - "pyarrow>=6.0.1" \ + "pyarrow>=6.0.2" \ "scikit-learn>=1.0.0" \ "intake>=0.6.0" \ && conda clean -ay diff --git a/pyproject.toml b/pyproject.toml index 0e7ff0578..464c61585 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dev = [ "pytest-cov>=2.10.1", "mock>=4.0.3", "sphinx>=3.2.1", - "pyarrow>=6.0.1", + "pyarrow>=6.0.2", "scikit-learn>=1.0.0", "intake>=0.6.0", "pre-commit", From 3ff8240f59bae3a1c844a8af9df125951cbb87cc Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 17 Jul 2023 17:28:25 -0400 Subject: [PATCH 24/89] test commit seeing how CI will respond without cmd_loop import --- dask_sql/__init__.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dask_sql/__init__.py b/dask_sql/__init__.py index 756486b74..5752c3d6d 100644 --- a/dask_sql/__init__.py +++ b/dask_sql/__init__.py @@ -1,9 +1,11 @@ from . import _version, config -from .cmd import cmd_loop + +# from .cmd import cmd_loop from .context import Context from .datacontainer import Statistics from .server.app import run_server __version__ = _version.get_versions()["version"] -__all__ = [__version__, cmd_loop, Context, run_server, Statistics] +# __all__ = [__version__, cmd_loop, Context, run_server, Statistics] +__all__ = [__version__, Context, run_server, Statistics] From ae7a3d6d9fedb20a466ed6db66eab5aaf419948d Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 14:42:13 -0700 Subject: [PATCH 25/89] Rename module to _datafusion_lib --- dask_sql/__init__.py | 6 ++---- dask_sql/context.py | 3 ++- dask_sql/input_utils/hive.py | 3 ++- dask_sql/mappings.py | 3 ++- dask_sql/physical/rel/base.py | 3 +-- dask_sql/physical/rel/convert.py | 3 +-- dask_sql/physical/rel/custom/alter.py | 3 +-- dask_sql/physical/rel/custom/analyze_table.py | 3 +-- dask_sql/physical/rel/custom/create_catalog_schema.py | 3 +-- dask_sql/physical/rel/custom/create_memory_table.py | 3 +-- dask_sql/physical/rel/custom/create_table.py | 3 +-- dask_sql/physical/rel/custom/describe_model.py | 3 +-- dask_sql/physical/rel/custom/distributeby.py | 3 +-- dask_sql/physical/rel/custom/drop_schema.py | 3 +-- dask_sql/physical/rel/custom/export_model.py | 3 +-- dask_sql/physical/rel/custom/predict_model.py | 3 +-- dask_sql/physical/rel/custom/show_columns.py | 3 +-- dask_sql/physical/rel/custom/show_models.py | 3 +-- dask_sql/physical/rel/custom/show_schemas.py | 3 +-- dask_sql/physical/rel/custom/show_tables.py | 3 +-- dask_sql/physical/rel/custom/use_schema.py | 3 +-- dask_sql/physical/rel/logical/aggregate.py | 3 +-- dask_sql/physical/rel/logical/cross_join.py | 3 +-- dask_sql/physical/rel/logical/empty.py | 3 +-- dask_sql/physical/rel/logical/explain.py | 3 +-- dask_sql/physical/rel/logical/filter.py | 3 +-- dask_sql/physical/rel/logical/join.py | 3 +-- dask_sql/physical/rel/logical/limit.py | 3 +-- dask_sql/physical/rel/logical/project.py | 6 ++---- dask_sql/physical/rel/logical/sort.py | 3 +-- dask_sql/physical/rel/logical/subquery_alias.py | 3 +-- dask_sql/physical/rel/logical/table_scan.py | 3 +-- dask_sql/physical/rel/logical/union.py | 3 +-- dask_sql/physical/rel/logical/window.py | 3 +-- dask_sql/physical/rex/base.py | 2 +- dask_sql/physical/rex/convert.py | 3 +-- dask_sql/physical/rex/core/alias.py | 3 +-- dask_sql/physical/rex/core/call.py | 5 ++--- dask_sql/physical/rex/core/input_ref.py | 3 +-- dask_sql/physical/rex/core/literal.py | 5 ++--- dask_sql/physical/rex/core/subquery.py | 3 +-- dask_sql/utils.py | 2 +- pyproject.toml | 2 +- src/lib.rs | 2 +- tests/unit/test_mapping.py | 2 +- 45 files changed, 52 insertions(+), 88 deletions(-) diff --git a/dask_sql/__init__.py b/dask_sql/__init__.py index 5752c3d6d..756486b74 100644 --- a/dask_sql/__init__.py +++ b/dask_sql/__init__.py @@ -1,11 +1,9 @@ from . import _version, config - -# from .cmd import cmd_loop +from .cmd import cmd_loop from .context import Context from .datacontainer import Statistics from .server.app import run_server __version__ = _version.get_versions()["version"] -# __all__ = [__version__, cmd_loop, Context, run_server, Statistics] -__all__ = [__version__, Context, run_server, Statistics] +__all__ = [__version__, cmd_loop, Context, run_server, Statistics] diff --git a/dask_sql/context.py b/dask_sql/context.py index 03947fa76..ab0c2ae71 100644 --- a/dask_sql/context.py +++ b/dask_sql/context.py @@ -9,7 +9,8 @@ from dask import config as dask_config from dask.base import optimize from dask.utils_test import hlg_layer -from dask_planner import ( + +from dask_sql._datafusion_lib import ( DaskSchema, DaskSQLContext, DaskTable, diff --git a/dask_sql/input_utils/hive.py b/dask_sql/input_utils/hive.py index 5d500180d..14bc547f0 100644 --- a/dask_sql/input_utils/hive.py +++ b/dask_sql/input_utils/hive.py @@ -5,7 +5,8 @@ from typing import Any, Union import dask.dataframe as dd -from dask_planner import SqlTypeName + +from dask_sql._datafusion_lib import SqlTypeName try: from pyhive import hive diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index 3d39ee392..ca0e23691 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -7,7 +7,8 @@ import dask.dataframe as dd import numpy as np import pandas as pd -from dask_planner import DaskTypeMap, SqlTypeName + +from dask_sql._datafusion_lib import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/base.py b/dask_sql/physical/rel/base.py index f4463fe62..5f70cde4e 100644 --- a/dask_sql/physical/rel/base.py +++ b/dask_sql/physical/rel/base.py @@ -7,9 +7,8 @@ from dask_sql.mappings import cast_column_type, sql_to_python_type if TYPE_CHECKING: - from dask_planner import LogicalPlan, RelDataType - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan, RelDataType logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/convert.py b/dask_sql/physical/rel/convert.py index 24b06c337..6d2beceff 100644 --- a/dask_sql/physical/rel/convert.py +++ b/dask_sql/physical/rel/convert.py @@ -7,9 +7,8 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/alter.py b/dask_sql/physical/rel/custom/alter.py index 16ed9e9bb..b29eb7737 100644 --- a/dask_sql/physical/rel/custom/alter.py +++ b/dask_sql/physical/rel/custom/alter.py @@ -6,9 +6,8 @@ logger = logging.getLogger(__name__) if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class AlterSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/analyze_table.py b/dask_sql/physical/rel/custom/analyze_table.py index 77edfff4b..49308cf3a 100644 --- a/dask_sql/physical/rel/custom/analyze_table.py +++ b/dask_sql/physical/rel/custom/analyze_table.py @@ -8,9 +8,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class AnalyzeTablePlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/create_catalog_schema.py b/dask_sql/physical/rel/custom/create_catalog_schema.py index 74f964621..e55d31a90 100644 --- a/dask_sql/physical/rel/custom/create_catalog_schema.py +++ b/dask_sql/physical/rel/custom/create_catalog_schema.py @@ -4,9 +4,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_memory_table.py b/dask_sql/physical/rel/custom/create_memory_table.py index 8c8c945ff..3c829fb42 100644 --- a/dask_sql/physical/rel/custom/create_memory_table.py +++ b/dask_sql/physical/rel/custom/create_memory_table.py @@ -5,9 +5,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/create_table.py b/dask_sql/physical/rel/custom/create_table.py index 526ec9728..cbe61abf7 100644 --- a/dask_sql/physical/rel/custom/create_table.py +++ b/dask_sql/physical/rel/custom/create_table.py @@ -6,9 +6,8 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/describe_model.py b/dask_sql/physical/rel/custom/describe_model.py index 8b2e144ff..422ac7c3b 100644 --- a/dask_sql/physical/rel/custom/describe_model.py +++ b/dask_sql/physical/rel/custom/describe_model.py @@ -7,9 +7,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class DescribeModelPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/distributeby.py b/dask_sql/physical/rel/custom/distributeby.py index 6b6dba0b8..71ac114f2 100644 --- a/dask_sql/physical/rel/custom/distributeby.py +++ b/dask_sql/physical/rel/custom/distributeby.py @@ -6,9 +6,8 @@ from dask_sql.utils import LoggableDataFrame if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/drop_schema.py b/dask_sql/physical/rel/custom/drop_schema.py index 455b27fa4..5491fcaa4 100644 --- a/dask_sql/physical/rel/custom/drop_schema.py +++ b/dask_sql/physical/rel/custom/drop_schema.py @@ -4,9 +4,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/export_model.py b/dask_sql/physical/rel/custom/export_model.py index c96d19786..08446c43c 100644 --- a/dask_sql/physical/rel/custom/export_model.py +++ b/dask_sql/physical/rel/custom/export_model.py @@ -6,9 +6,8 @@ from dask_sql.utils import convert_sql_kwargs if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/predict_model.py b/dask_sql/physical/rel/custom/predict_model.py index c0339b1d7..0bb5c79b4 100644 --- a/dask_sql/physical/rel/custom/predict_model.py +++ b/dask_sql/physical/rel/custom/predict_model.py @@ -9,9 +9,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/custom/show_columns.py b/dask_sql/physical/rel/custom/show_columns.py index a11d05c94..2da4f4535 100644 --- a/dask_sql/physical/rel/custom/show_columns.py +++ b/dask_sql/physical/rel/custom/show_columns.py @@ -8,9 +8,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class ShowColumnsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_models.py b/dask_sql/physical/rel/custom/show_models.py index ecc81e82a..28e495810 100644 --- a/dask_sql/physical/rel/custom/show_models.py +++ b/dask_sql/physical/rel/custom/show_models.py @@ -7,9 +7,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class ShowModelsPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_schemas.py b/dask_sql/physical/rel/custom/show_schemas.py index d49d3708b..fb69c5359 100644 --- a/dask_sql/physical/rel/custom/show_schemas.py +++ b/dask_sql/physical/rel/custom/show_schemas.py @@ -7,9 +7,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class ShowSchemasPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/show_tables.py b/dask_sql/physical/rel/custom/show_tables.py index 85dc3687d..05fb8a66c 100644 --- a/dask_sql/physical/rel/custom/show_tables.py +++ b/dask_sql/physical/rel/custom/show_tables.py @@ -7,9 +7,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class ShowTablesPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/custom/use_schema.py b/dask_sql/physical/rel/custom/use_schema.py index 563415c2d..f5fc65b7d 100644 --- a/dask_sql/physical/rel/custom/use_schema.py +++ b/dask_sql/physical/rel/custom/use_schema.py @@ -4,9 +4,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class UseSchemaPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/aggregate.py b/dask_sql/physical/rel/logical/aggregate.py index 27f5c102c..dd2f9f41d 100644 --- a/dask_sql/physical/rel/logical/aggregate.py +++ b/dask_sql/physical/rel/logical/aggregate.py @@ -15,9 +15,8 @@ from dask_sql.utils import is_cudf_type, new_temporary_column if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/cross_join.py b/dask_sql/physical/rel/logical/cross_join.py index dfa8cdf3c..d1c74c8cc 100644 --- a/dask_sql/physical/rel/logical/cross_join.py +++ b/dask_sql/physical/rel/logical/cross_join.py @@ -6,9 +6,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/empty.py b/dask_sql/physical/rel/logical/empty.py index b50699b79..453f63de5 100644 --- a/dask_sql/physical/rel/logical/empty.py +++ b/dask_sql/physical/rel/logical/empty.py @@ -8,9 +8,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/explain.py b/dask_sql/physical/rel/logical/explain.py index abf1d814c..0e4875d0c 100644 --- a/dask_sql/physical/rel/logical/explain.py +++ b/dask_sql/physical/rel/logical/explain.py @@ -3,9 +3,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class ExplainPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/filter.py b/dask_sql/physical/rel/logical/filter.py index a37e390ec..af3685a11 100644 --- a/dask_sql/physical/rel/logical/filter.py +++ b/dask_sql/physical/rel/logical/filter.py @@ -11,9 +11,8 @@ from dask_sql.physical.utils.filter import attempt_predicate_pushdown if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/join.py b/dask_sql/physical/rel/logical/join.py index cec7df4d9..1657d2bf4 100644 --- a/dask_sql/physical/rel/logical/join.py +++ b/dask_sql/physical/rel/logical/join.py @@ -17,9 +17,8 @@ from dask_sql.utils import is_cudf_type if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/limit.py b/dask_sql/physical/rel/logical/limit.py index 00ba37fa2..9bd2be562 100644 --- a/dask_sql/physical/rel/logical/limit.py +++ b/dask_sql/physical/rel/logical/limit.py @@ -11,9 +11,8 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class DaskLimitPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/project.py b/dask_sql/physical/rel/logical/project.py index 4630b5d6b..0a7637f59 100644 --- a/dask_sql/physical/rel/logical/project.py +++ b/dask_sql/physical/rel/logical/project.py @@ -1,17 +1,15 @@ import logging from typing import TYPE_CHECKING -from dask_planner import RexType - +from dask_sql._datafusion_lib import RexType from dask_sql.datacontainer import DataContainer from dask_sql.physical.rel.base import BaseRelPlugin from dask_sql.physical.rex import RexConverter from dask_sql.utils import new_temporary_column if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/sort.py b/dask_sql/physical/rel/logical/sort.py index 6dc57211c..9dfccdc49 100644 --- a/dask_sql/physical/rel/logical/sort.py +++ b/dask_sql/physical/rel/logical/sort.py @@ -5,9 +5,8 @@ from dask_sql.physical.utils.sort import apply_sort if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class DaskSortPlugin(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/subquery_alias.py b/dask_sql/physical/rel/logical/subquery_alias.py index e82d9b105..14be8928f 100644 --- a/dask_sql/physical/rel/logical/subquery_alias.py +++ b/dask_sql/physical/rel/logical/subquery_alias.py @@ -4,9 +4,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan class SubqueryAlias(BaseRelPlugin): diff --git a/dask_sql/physical/rel/logical/table_scan.py b/dask_sql/physical/rel/logical/table_scan.py index b3b5cab0a..53e1d29be 100644 --- a/dask_sql/physical/rel/logical/table_scan.py +++ b/dask_sql/physical/rel/logical/table_scan.py @@ -11,9 +11,8 @@ from dask_sql.physical.rex import RexConverter if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rel/logical/union.py b/dask_sql/physical/rel/logical/union.py index 1fbc5b5ae..f31ced797 100644 --- a/dask_sql/physical/rel/logical/union.py +++ b/dask_sql/physical/rel/logical/union.py @@ -6,9 +6,8 @@ from dask_sql.physical.rel.base import BaseRelPlugin if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan def _extract_df(obj_cc, obj_df, output_field_names): diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index bbcdae740..aba788bc3 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -16,9 +16,8 @@ from dask_sql.utils import LoggableDataFrame, new_temporary_column if TYPE_CHECKING: - from dask_planner import LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/base.py b/dask_sql/physical/rex/base.py index 7f97a70d9..d74ad6309 100644 --- a/dask_sql/physical/rex/base.py +++ b/dask_sql/physical/rex/base.py @@ -7,7 +7,7 @@ from dask_sql.datacontainer import DataContainer if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/convert.py b/dask_sql/physical/rex/convert.py index fce64be30..1713e496d 100644 --- a/dask_sql/physical/rex/convert.py +++ b/dask_sql/physical/rex/convert.py @@ -8,9 +8,8 @@ from dask_sql.utils import LoggableDataFrame, Pluggable if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/alias.py b/dask_sql/physical/rex/core/alias.py index d6ae20698..7486bc9c5 100644 --- a/dask_sql/physical/rex/core/alias.py +++ b/dask_sql/physical/rex/core/alias.py @@ -7,9 +7,8 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan class RexAliasPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 56d01d006..e513556d0 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -13,9 +13,9 @@ from dask.dataframe.core import Series from dask.highlevelgraph import HighLevelGraph from dask.utils import random_state_data -from dask_planner import SqlTypeName from dask_sql._compat import DASK_CUDF_TODATETIME_SUPPORT, PANDAS_GT_200 +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import ( cast_column_to_type, @@ -34,9 +34,8 @@ ) if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) SeriesOrScalar = Union[dd.Series, Any] diff --git a/dask_sql/physical/rex/core/input_ref.py b/dask_sql/physical/rex/core/input_ref.py index 01bf871c7..4d2c0f929 100644 --- a/dask_sql/physical/rex/core/input_ref.py +++ b/dask_sql/physical/rex/core/input_ref.py @@ -6,9 +6,8 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan class RexInputRefPlugin(BaseRexPlugin): diff --git a/dask_sql/physical/rex/core/literal.py b/dask_sql/physical/rex/core/literal.py index 7fe59b383..da0eeb128 100644 --- a/dask_sql/physical/rex/core/literal.py +++ b/dask_sql/physical/rex/core/literal.py @@ -4,16 +4,15 @@ import dask.dataframe as dd import numpy as np -from dask_planner import SqlTypeName +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan logger = logging.getLogger(__name__) diff --git a/dask_sql/physical/rex/core/subquery.py b/dask_sql/physical/rex/core/subquery.py index 1253f257d..60a07c0b9 100644 --- a/dask_sql/physical/rex/core/subquery.py +++ b/dask_sql/physical/rex/core/subquery.py @@ -7,9 +7,8 @@ from dask_sql.physical.rex.base import BaseRexPlugin if TYPE_CHECKING: - from dask_planner import Expression, LogicalPlan - import dask_sql + from dask_sql._datafusion_lib import Expression, LogicalPlan class RexScalarSubqueryPlugin(BaseRexPlugin): diff --git a/dask_sql/utils.py b/dask_sql/utils.py index c2cfe45ab..454eecb7f 100644 --- a/dask_sql/utils.py +++ b/dask_sql/utils.py @@ -8,8 +8,8 @@ import dask.dataframe as dd import numpy as np import pandas as pd -from dask_planner import SqlTypeName +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.datacontainer import DataContainer from dask_sql.mappings import sql_to_python_value diff --git a/pyproject.toml b/pyproject.toml index 464c61585..75404e3e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ license-files = ["LICENSE.txt"] find = {namespaces = false} [tool.maturin] -module-name = "dask_planner" +module-name = "dask_sql._datafusion_lib" include = [ { path = "Cargo.lock", format = "sdist" } ] diff --git a/src/lib.rs b/src/lib.rs index 63879e2fb..921478973 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -12,7 +12,7 @@ mod sql; /// The higher-level public API is defined in pure python files under the /// dask_planner directory. #[pymodule] -fn dask_planner(py: Python, m: &PyModule) -> PyResult<()> { +fn _datafusion_lib(py: Python, m: &PyModule) -> PyResult<()> { // Initialize the global Python logger instance pyo3_log::init(); diff --git a/tests/unit/test_mapping.py b/tests/unit/test_mapping.py index 952bcb10e..98f065bf8 100644 --- a/tests/unit/test_mapping.py +++ b/tests/unit/test_mapping.py @@ -3,8 +3,8 @@ import numpy as np import pandas as pd import pytest -from dask_planner import SqlTypeName +from dask_sql._datafusion_lib import SqlTypeName from dask_sql.mappings import python_to_sql_type, similar_type, sql_to_python_value From 0c2908ce8fc2f453eb993196f378fea584e7bf2e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 17 Jul 2023 15:10:10 -0700 Subject: [PATCH 26/89] Switch to maturin develop for CI installs --- .github/workflows/test-upstream.yml | 2 +- .github/workflows/test.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-upstream.yml b/.github/workflows/test-upstream.yml index df361bb49..7c231c929 100644 --- a/.github/workflows/test-upstream.yml +++ b/.github/workflows/test-upstream.yml @@ -71,7 +71,7 @@ jobs: bash update-dependencies.sh - name: Build the Rust DataFusion bindings run: | - python setup.py build install + maturin develop - name: Install hive testing dependencies if: matrix.os == 'ubuntu-latest' run: | diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2bd043b34..b3ec34a76 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -72,7 +72,7 @@ jobs: shared-key: test - name: Build the Rust DataFusion bindings run: | - python setup.py build install + maturin develop - name: Install hive testing dependencies if: matrix.os == 'ubuntu-latest' run: | From 849dc42a4e7382fff6aa253355dafcf138afb350 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 18 Jul 2023 10:46:27 -0400 Subject: [PATCH 27/89] Fix failing cargo tests, changed output, from datafusion version bump --- .cargo/config.toml | 5 +++++ Cargo.toml | 14 +++++++++----- src/parser.rs | 27 +++++---------------------- src/sql/optimizer.rs | 13 ++----------- 4 files changed, 21 insertions(+), 38 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index d47f983e4..3bbaccf35 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -9,3 +9,8 @@ rustflags = [ "-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup", ] + +[target.x86_64-unknown-linux-gnu] +rustflags = [ + "-C", "link-arg=-undefined" +] diff --git a/Cargo.toml b/Cargo.toml index 465472c11..d80f261bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,17 +9,21 @@ edition = "2021" rust-version = "1.65" include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", "Cargo.lock"] -[lib] -name = "dask_sql" -crate-type = ["cdylib", "rlib"] - [dependencies] async-trait = "0.1.71" datafusion-python = "27.0.0" env_logger = "0.10" log = "^0.4" -pyo3 = { version = "0.19.0", features = ["extension-module", "abi3", "abi3-py38"] } +pyo3 = { version = "0.19.1", features = ["extension-module", "abi3", "abi3-py38"] } pyo3-log = "0.8.2" [build-dependencies] pyo3-build-config = "0.19.1" + +[lib] +name = "dask_sql" +crate-type = ["cdylib", "rlib"] + +[profile.release] +lto = true +codegen-units = 1 diff --git a/src/parser.rs b/src/parser.rs index 3147e6309..a051454bb 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -1374,14 +1374,7 @@ mod test { let statements = DaskParser::parse_sql(sql).unwrap(); assert_eq!(1, statements.len()); let actual = format!("{:?}", statements[0]); - let expected = "projection: [\ - UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"timestampadd\", quote_style: None }]), \ - args: [\ - Unnamed(Expr(Value(SingleQuotedString(\"YEAR\")))), \ - Unnamed(Expr(Value(Number(\"2\", false)))), \ - Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None })))\ - ], over: None, distinct: false, special: false }))\ - ]"; + let expected = "Statement(Query(Query { with: None, body: Select(Select { distinct: None, top: None, projection: [UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"timestampadd\", quote_style: None }]), args: [Unnamed(Expr(Value(SingleQuotedString(\"YEAR\")))), Unnamed(Expr(Value(Number(\"2\", false)))), Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None })))], over: None, distinct: false, special: false, order_by: [] }))], into: None, from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: \"t\", quote_style: None }]), alias: None, args: None, with_hints: [] }, joins: [] }], lateral_views: [], selection: None, group_by: [], cluster_by: [], distribute_by: [], sort_by: [], having: None, named_window: [], qualify: None }), order_by: [], limit: None, offset: None, fetch: None, locks: [] }))"; assert!(actual.contains(expected)); } @@ -1391,26 +1384,16 @@ mod test { let statements1 = DaskParser::parse_sql(sql1).unwrap(); assert_eq!(1, statements1.len()); let actual1 = format!("{:?}", statements1[0]); - let expected1 = "projection: [\ - UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), \ - args: [\ - Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), \ - Unnamed(Expr(Value(SingleQuotedString(\"%Y-%m-%d %H:%M:%S\"))))\ - ], over: None, distinct: false, special: false }))\ - ]"; + let expected1 = "Statement(Query(Query { with: None, body: Select(Select { distinct: None, top: None, projection: [UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), args: [Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), Unnamed(Expr(Value(SingleQuotedString(\"%Y-%m-%d %H:%M:%S\"))))], over: None, distinct: false, special: false, order_by: [] }))], into: None, from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: \"t\", quote_style: None }]), alias: None, args: None, with_hints: [] }, joins: [] }], lateral_views: [], selection: None, group_by: [], cluster_by: [], distribute_by: [], sort_by: [], having: None, named_window: [], qualify: None }), order_by: [], limit: None, offset: None, fetch: None, locks: [] }))"; + assert!(actual1.contains(expected1)); let sql2 = "SELECT TO_TIMESTAMP(d, \"%d/%m/%Y\") FROM t"; let statements2 = DaskParser::parse_sql(sql2).unwrap(); assert_eq!(1, statements2.len()); let actual2 = format!("{:?}", statements2[0]); - let expected2 = "projection: [\ - UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), \ - args: [\ - Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), \ - Unnamed(Expr(Value(SingleQuotedString(\"\\\"%d/%m/%Y\\\"\"))))\ - ], over: None, distinct: false, special: false }))\ - ]"; + let expected2 = "Statement(Query(Query { with: None, body: Select(Select { distinct: None, top: None, projection: [UnnamedExpr(Function(Function { name: ObjectName([Ident { value: \"dsql_totimestamp\", quote_style: None }]), args: [Unnamed(Expr(Identifier(Ident { value: \"d\", quote_style: None }))), Unnamed(Expr(Value(SingleQuotedString(\"\\\"%d/%m/%Y\\\"\"))))], over: None, distinct: false, special: false, order_by: [] }))], into: None, from: [TableWithJoins { relation: Table { name: ObjectName([Ident { value: \"t\", quote_style: None }]), alias: None, args: None, with_hints: [] }, joins: [] }], lateral_views: [], selection: None, group_by: [], cluster_by: [], distribute_by: [], sort_by: [], having: None, named_window: [], qualify: None }), order_by: [], limit: None, offset: None, fetch: None, locks: [] }))"; + assert!(actual2.contains(expected2)); } diff --git a/src/sql/optimizer.rs b/src/sql/optimizer.rs index a5957ac98..484ee7dd6 100644 --- a/src/sql/optimizer.rs +++ b/src/sql/optimizer.rs @@ -147,17 +147,8 @@ mod tests { AND (cast('2002-05-08' as date) + interval '5 days')\ )"; let plan = test_sql(sql)?; - let expected = r#"Projection: test.col_int32 - Filter: CAST(test.col_int32 AS Float64) > __scalar_sq_1.__value - CrossJoin: - TableScan: test projection=[col_int32] - SubqueryAlias: __scalar_sq_1 - Projection: AVG(test.col_int32) AS __value - Aggregate: groupBy=[[]], aggr=[[AVG(test.col_int32)]] - Projection: test.col_int32 - Filter: test.col_utf8 >= Utf8("2002-05-08") AND test.col_utf8 <= Utf8("2002-05-13") - TableScan: test projection=[col_int32, col_utf8]"#; - assert_eq!(expected, format!("{:?}", plan)); + + assert!(expected.contains(r#"<= Date32("11820")"#)); Ok(()) } From 1f73b567311528a986d43df0bbfb3c2000342d98 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 18 Jul 2023 11:19:37 -0400 Subject: [PATCH 28/89] Fix cargo test syntax issue --- src/sql/optimizer.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/sql/optimizer.rs b/src/sql/optimizer.rs index 484ee7dd6..b9fdaca06 100644 --- a/src/sql/optimizer.rs +++ b/src/sql/optimizer.rs @@ -147,8 +147,7 @@ mod tests { AND (cast('2002-05-08' as date) + interval '5 days')\ )"; let plan = test_sql(sql)?; - - assert!(expected.contains(r#"<= Date32("11820")"#)); + assert!(format!("{:?}", plan).contains(r#"<= Date32("11820")"#)); Ok(()) } From 79b6eacc600f255ed0c513d2fda0d73e8cc2378c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 18 Jul 2023 09:49:10 -0700 Subject: [PATCH 29/89] Fix failing Rust tests --- src/expression.rs | 4 ++-- src/parser.rs | 2 +- src/sql.rs | 14 -------------- src/sql/types.rs | 8 ++++---- 4 files changed, 7 insertions(+), 21 deletions(-) diff --git a/src/expression.rs b/src/expression.rs index d13f66e89..53d4f1c84 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -43,7 +43,7 @@ use crate::{ }; /// An PyExpr that can be used on a DataFrame -#[pyclass(name = "Expression", module = "datafusion", subclass)] +#[pyclass(name = "Expression", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyExpr { pub expr: Expr, @@ -57,7 +57,7 @@ impl From for Expr { } } -#[pyclass(name = "ScalarValue", module = "datafusion", subclass)] +#[pyclass(name = "ScalarValue", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct PyScalarValue { pub scalar_value: ScalarValue, diff --git a/src/parser.rs b/src/parser.rs index a051454bb..100f9c137 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -30,7 +30,7 @@ pub enum CustomExpr { Nested(Vec<(String, PySqlArg)>), } -#[pyclass(name = "SqlArg", module = "datafusion")] +#[pyclass(name = "SqlArg", module = "dask_sql")] #[derive(Debug, Clone, PartialEq, Eq)] pub struct PySqlArg { expr: Option, diff --git a/src/sql.rs b/src/sql.rs index 585fcad4d..c9a600225 100644 --- a/src/sql.rs +++ b/src/sql.rs @@ -78,20 +78,6 @@ use crate::{ /// /// The following example demonstrates how to generate an optimized LogicalPlan /// from SQL using DaskSQLContext. -/// -/// ``` -/// use datafusion_python::datafusion::prelude::*; -/// -/// # use datafusion_python::datafusion_common::Result; -/// # #[tokio::main] -/// # async fn main() -> Result<()> { -/// let mut ctx = DaskSQLContext::new(); -/// let parsed_sql = ctx.parse_sql("SELECT COUNT(*) FROM test_table"); -/// let nonOptimizedRelAlgebra = ctx.logical_relational_algebra(parsed_sql); -/// let optmizedRelAlg = ctx.optimizeRelationalAlgebra(nonOptimizedRelAlgebra); -/// # Ok(()) -/// # } -/// ``` #[pyclass(name = "DaskSQLContext", module = "dask_sql", subclass)] #[derive(Debug, Clone)] pub struct DaskSQLContext { diff --git a/src/sql/types.rs b/src/sql/types.rs index 4642a4eb0..34af22342 100644 --- a/src/sql/types.rs +++ b/src/sql/types.rs @@ -12,7 +12,7 @@ use pyo3::{prelude::*, types::PyDict}; use crate::{dialect::DaskDialect, error::DaskPlannerError, sql::exceptions::py_type_err}; #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "RexType", module = "datafusion")] +#[pyclass(name = "RexType", module = "dask_sql")] pub enum RexType { Alias, Literal, @@ -23,7 +23,7 @@ pub enum RexType { } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "DaskTypeMap", module = "datafusion", subclass)] +#[pyclass(name = "DaskTypeMap", module = "dask_sql", subclass)] /// Represents a Python Data Type. This is needed instead of simple /// Enum instances because PyO3 can only support unit variants as /// of version 0.16 which means Enums like `DataType::TIMESTAMP_WITH_LOCAL_TIME_ZONE` @@ -167,7 +167,7 @@ impl DaskTypeMap { } #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "PyDataType", module = "datafusion", subclass)] +#[pyclass(name = "PyDataType", module = "dask_sql", subclass)] pub struct PyDataType { data_type: DataType, } @@ -210,7 +210,7 @@ impl From for PyDataType { #[allow(non_camel_case_types)] #[allow(clippy::upper_case_acronyms)] #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] -#[pyclass(name = "SqlTypeName", module = "datafusion")] +#[pyclass(name = "SqlTypeName", module = "dask_sql")] pub enum SqlTypeName { ANY, ARRAY, From 405470f168b453cd4e615972e9f85f4e6823fe1c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 18 Jul 2023 14:47:28 -0400 Subject: [PATCH 30/89] Remove linux config.toml options --- .cargo/config.toml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.cargo/config.toml b/.cargo/config.toml index 3bbaccf35..d47f983e4 100644 --- a/.cargo/config.toml +++ b/.cargo/config.toml @@ -9,8 +9,3 @@ rustflags = [ "-C", "link-arg=-undefined", "-C", "link-arg=dynamic_lookup", ] - -[target.x86_64-unknown-linux-gnu] -rustflags = [ - "-C", "link-arg=-undefined" -] From 7870c96c4ff7d807b6fb502690151d1d79167624 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 20 Jul 2023 11:53:03 -0700 Subject: [PATCH 31/89] Fix Rust object import --- dask_sql/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_sql/__init__.py b/dask_sql/__init__.py index d923876b8..fd8339b5a 100644 --- a/dask_sql/__init__.py +++ b/dask_sql/__init__.py @@ -1,6 +1,6 @@ # FIXME: can we modify TLS model of Rust object to avoid aarch64 glibc bug? # https://github.com/dask-contrib/dask-sql/issues/1169 -import dask_planner.rust +from . import _datafusion_lib # isort:skip from . import _version, config from .cmd import cmd_loop From 2961cfa81e08e51676a62c607631146feed5da8b Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 20 Jul 2023 11:53:48 -0700 Subject: [PATCH 32/89] Apply code suggestions --- src/sql/optimizer/dynamic_partition_pruning.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/sql/optimizer/dynamic_partition_pruning.rs b/src/sql/optimizer/dynamic_partition_pruning.rs index ac931b560..8cde26233 100644 --- a/src/sql/optimizer/dynamic_partition_pruning.rs +++ b/src/sql/optimizer/dynamic_partition_pruning.rs @@ -782,6 +782,7 @@ fn satisfies_int64(long_value: Option, filter: Expr) -> bool { Expr::Literal(ScalarValue::Int32(i)) => i64::from(i.unwrap()), Expr::Literal(ScalarValue::Float64(i)) => i.unwrap() as i64, Expr::Literal(ScalarValue::TimestampNanosecond(i, None)) => i.unwrap(), + Expr::Literal(ScalarValue::Date32(i)) => i64::from(i.unwrap()), _ => { panic!("Unknown ScalarValue type {filter_value}"); } From 9983700435b0d3be99bc899d14174b49445d7c09 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 25 Jul 2023 07:38:40 -0700 Subject: [PATCH 33/89] Bump to recent ADP commit --- Cargo.lock | 162 +++++++++--------- Cargo.toml | 2 +- src/expression.rs | 37 ++-- src/sql/logical/aggregate.rs | 8 +- src/sql/logical/projection.rs | 13 +- src/sql/logical/table_scan.rs | 11 +- .../optimizer/dynamic_partition_pruning.rs | 2 +- 7 files changed, 122 insertions(+), 113 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 685bc7de2..7ad7d07f5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -125,9 +125,9 @@ checksum = "8868f09ff8cea88b079da74ae569d9b8c62a23c68c746240b704ee6f7525c89c" [[package]] name = "arrow" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "773d18d72cd290f3f9e2149a714c8ac404b6c3fd614c684f0015449940fca899" +checksum = "2feeebd77b34b0bc88f224e06d01c27da4733997cc4789a4e056196656cdc59a" dependencies = [ "ahash", "arrow-arith", @@ -148,9 +148,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93bc0da4b22ba63807fa2a74998e21209179c93c67856ae65d9218b81f3ef918" +checksum = "7173f5dc49c0ecb5135f52565af33afd3fdc9a12d13bd6f9973e8b96305e4b2e" dependencies = [ "arrow-array", "arrow-buffer", @@ -163,9 +163,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea9a0fd21121304cad96f307c938d861cb1e7f0c151b93047462cd9817d760fb" +checksum = "63d7ea725f7d1f8bb2cffc53ef538557e95fc802e217d5be25122d402e22f3d0" dependencies = [ "ahash", "arrow-buffer", @@ -180,9 +180,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30ce342ecf5971004e23cef8b5fb3bacd2bbc48a381464144925074e1472e9eb" +checksum = "bdbe439e077f484e5000b9e1d47b5e4c0d15f2b311a8f5bcc682553d5d67a722" dependencies = [ "half", "num", @@ -190,9 +190,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b94a0ce7d27abbb02e2ee4db770f593127610f57b32625b0bc6a1a90d65f085" +checksum = "93913cc14875770aa1eef5e310765e855effa352c094cb1c7c00607d0f37b4e1" dependencies = [ "arrow-array", "arrow-buffer", @@ -208,9 +208,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f3be10a00a43c4bf0d243c070754ebdde17c5d576b4928d9c3efbe3005a3853" +checksum = "ef55b67c55ed877e6fe7b923121c19dae5e31ca70249ea2779a17b58fb0fbd9a" dependencies = [ "arrow-array", "arrow-buffer", @@ -227,9 +227,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d9a83dad6a53d6907765106d3bc61d6d9d313cfe1751701b3ef0948e7283dc2" +checksum = "d4f4f4a3c54614126a71ab91f6631c9743eb4643d6e9318b74191da9dc6e028b" dependencies = [ "arrow-buffer", "arrow-schema", @@ -239,9 +239,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a46da5e438a854e0386b38774da88a98782c0973c6dbc5c949ca4e02faf9b016" +checksum = "d41a3659f984a524ef1c2981d43747b24d8eec78e2425267fcd0ef34ce71cd18" dependencies = [ "arrow-array", "arrow-buffer", @@ -253,9 +253,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5f27a1fbc76553ad92dc1a9583e56b7058d8c418c4089b0b689f5b87e2da5e1" +checksum = "10b95faa95a378f56ef32d84cc0104ea998c39ef7cd1faaa6b4cebf8ea92846d" dependencies = [ "arrow-array", "arrow-buffer", @@ -264,7 +264,7 @@ dependencies = [ "arrow-schema", "chrono", "half", - "indexmap", + "indexmap 2.0.0", "lexical-core", "num", "serde", @@ -273,9 +273,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2373661f6c2233e18f6fa69c40999a9440231d1e8899be8bbbe73c7e24aa3b4" +checksum = "c68549a4284d9f8b39586afb8d5ff8158b8f0286353a4844deb1d11cf1ba1f26" dependencies = [ "arrow-array", "arrow-buffer", @@ -288,9 +288,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "377cd5158b7de4034a175e296726c40c3236e65d71d90a5dab2fb4fab526a8f4" +checksum = "0a75a4a757afc301ce010adadff54d79d66140c4282ed3de565f6ccb716a5cf3" dependencies = [ "ahash", "arrow-array", @@ -303,18 +303,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba9ed245bd2d7d97ad1457cb281d4296e8b593588758b8fec6d67b2b2b0f2265" +checksum = "2bebcb57eef570b15afbcf2d07d813eb476fde9f6dd69c81004d6476c197e87e" dependencies = [ "bitflags 2.3.2", ] [[package]] name = "arrow-select" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0dc9bd6aebc565b1d04bae64a0f4dda3abc677190eb7d960471b1b20e1cebed0" +checksum = "f6e2943fa433a48921e914417173816af64eef61c0a3d448280e6c40a62df221" dependencies = [ "arrow-array", "arrow-buffer", @@ -325,15 +325,16 @@ dependencies = [ [[package]] name = "arrow-string" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "23cf2baea2ef53787332050decf7d71aca836a352e188c8ad062892405955d2b" +checksum = "bbc92ed638851774f6d7af1ad900b92bc1486746497511868b4298fcbcfa35af" dependencies = [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", + "num", "regex", "regex-syntax", ] @@ -672,9 +673,8 @@ dependencies = [ [[package]] name = "datafusion" -version = "27.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e96f6e4eb10bd3e6b709686858246466983e8c5354a928ff77ee34919aa60d00" +version = "28.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" dependencies = [ "ahash", "apache-avro", @@ -692,13 +692,13 @@ dependencies = [ "datafusion-expr", "datafusion-optimizer", "datafusion-physical-expr", - "datafusion-row", "datafusion-sql", "flate2", "futures", "glob", + "half", "hashbrown 0.14.0", - "indexmap", + "indexmap 2.0.0", "itertools 0.11.0", "lazy_static", "log", @@ -723,9 +723,8 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "27.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00e5fddcc0dd49bbe199e43aa406f39c46c790bb2a43c7b36a478e5f3f971235" +version = "28.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" dependencies = [ "apache-avro", "arrow", @@ -740,9 +739,8 @@ dependencies = [ [[package]] name = "datafusion-execution" -version = "27.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfd50b6cb17acc78d2473c0d28014b8fd4e2e0a2c067c07645d6547b33b0aeeb" +version = "28.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" dependencies = [ "dashmap", "datafusion-common", @@ -758,9 +756,8 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "27.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1a35dc2cd9eac18063d636f7ddf4f090fe1f34284d80192ac7ade38cc3c6991" +version = "28.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" dependencies = [ "ahash", "arrow", @@ -773,9 +770,8 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "27.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5043afeb45ec1c0f45519e1eed6a477f2d30732e8f975d9cf9a75fba0ca716" +version = "28.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" dependencies = [ "arrow", "async-trait", @@ -791,27 +787,28 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "27.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6cc892a24f4b829ee7718ad3950884c0346dbdf1517f3df153af4bcf54d8ca4d" +version = "28.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" dependencies = [ "ahash", "arrow", "arrow-array", "arrow-buffer", "arrow-schema", + "base64", "blake2", "blake3", "chrono", "datafusion-common", "datafusion-expr", - "datafusion-row", "half", "hashbrown 0.14.0", - "indexmap", + "hex", + "indexmap 2.0.0", "itertools 0.11.0", "lazy_static", "libc", + "log", "md-5", "paste", "petgraph", @@ -824,9 +821,8 @@ dependencies = [ [[package]] name = "datafusion-python" -version = "27.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3311b157d1afe2a363d37a5ccb675384aa76e6033572ef9246f8af1579e6f0b2" +version = "28.0.0" +source = "git+https://github.com/apache/arrow-datafusion-python.git?rev=309fc486c47d86776aeec07d86cd04b5d70d97a1#309fc486c47d86776aeec07d86cd04b5d70d97a1" dependencies = [ "async-trait", "datafusion", @@ -851,23 +847,10 @@ dependencies = [ "uuid", ] -[[package]] -name = "datafusion-row" -version = "27.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce75c660bbddfdd254109e668e5b5bd69df31ea26e3768e15cef0c68015e650e" -dependencies = [ - "arrow", - "datafusion-common", - "paste", - "rand", -] - [[package]] name = "datafusion-sql" -version = "27.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49cab87e4933a452e0b7b3f0cbd0e760daf7d33fb54d09d70d3ffba229eaa652" +version = "28.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" dependencies = [ "arrow", "arrow-schema", @@ -879,9 +862,8 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "27.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ba77d22232053f6cdd98bd6f5328940850844450253f25b8c50bfc5199c505d4" +version = "28.0.0" +source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" dependencies = [ "async-recursion", "chrono", @@ -945,6 +927,12 @@ dependencies = [ "termcolor", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.1" @@ -1157,7 +1145,7 @@ dependencies = [ "futures-sink", "futures-util", "http", - "indexmap", + "indexmap 1.9.3", "slab", "tokio", "tokio-util", @@ -1220,6 +1208,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fed44880c466736ef9a5c5b5facefb5ed0785676d0c02d612db14e54f0d84286" +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + [[package]] name = "http" version = "0.2.9" @@ -1340,6 +1334,16 @@ dependencies = [ "hashbrown 0.12.3", ] +[[package]] +name = "indexmap" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5477fe2230a79769d8dc68e0eabf5437907c0457a5614a9e8dddb67f65eb65d" +dependencies = [ + "equivalent", + "hashbrown 0.14.0", +] + [[package]] name = "indoc" version = "1.0.9" @@ -1843,9 +1847,9 @@ dependencies = [ [[package]] name = "parquet" -version = "42.0.0" +version = "43.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "baab9c36b1c8300b81b4d577d306a0a733f9d34021363098d3548e37757ed6c8" +checksum = "ec7267a9607c3f955d4d0ac41b88a67cecc0d8d009173ad3da390699a6cb3750" dependencies = [ "ahash", "arrow-array", @@ -1903,7 +1907,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4" dependencies = [ "fixedbitset", - "indexmap", + "indexmap 1.9.3", ] [[package]] @@ -2468,7 +2472,7 @@ version = "0.9.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9d684e3ec7de3bf5466b32bd75303ac16f0736426e5a4e0d6e489559ce1249c" dependencies = [ - "indexmap", + "indexmap 1.9.3", "itoa", "ryu", "serde", @@ -2621,9 +2625,9 @@ dependencies = [ [[package]] name = "substrait" -version = "0.11.0" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d3b77ddddd080d1bb5ebfe6b62d1c4e2f33c9f6a4586d5eac5306a08f3d4585" +checksum = "2ac1ce8315086b127ca0abf162c62279550942bb26ebf7946fe17fe114446472" dependencies = [ "git2", "heck", diff --git a/Cargo.toml b/Cargo.toml index f15249c90..6da783b80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", [dependencies] async-trait = "0.1.71" -datafusion-python = "27.0.0" +datafusion-python = { git = "https://github.com/apache/arrow-datafusion-python.git", rev = "309fc486c47d86776aeec07d86cd04b5d70d97a1" } env_logger = "0.10" log = "^0.4" pyo3 = { version = "0.19.1", features = ["extension-module", "abi3", "abi3-py38"] } diff --git a/src/expression.rs b/src/expression.rs index 53d4f1c84..7f5e2ef35 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -7,6 +7,7 @@ use datafusion_python::{ expr::{ AggregateFunction, AggregateUDF, + Alias, BinaryExpr, Cast, Exists, @@ -115,7 +116,6 @@ impl PyExpr { | Expr::Negative(..) | Expr::IsNull(..) | Expr::Like { .. } - | Expr::ILike { .. } | Expr::SimilarTo { .. } | Expr::Between { .. } | Expr::Case { .. } @@ -289,7 +289,6 @@ impl PyExpr { | Expr::IsNotTrue(_) | Expr::IsNotFalse(_) | Expr::Like { .. } - | Expr::ILike { .. } | Expr::SimilarTo { .. } | Expr::IsNotUnknown(_) | Expr::Case { .. } @@ -333,8 +332,7 @@ impl PyExpr { } // Expr(s) that house the Expr instance to return in their bounded params - Expr::Alias(expr, ..) - | Expr::Not(expr) + Expr::Not(expr) | Expr::IsNull(expr) | Expr::IsNotNull(expr) | Expr::IsTrue(expr) @@ -395,6 +393,9 @@ impl PyExpr { Ok(operands) } + Expr::Alias(Alias { expr, .. }) => { + Ok(vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]) + } Expr::InList(InList { expr, list, .. }) => { let mut operands: Vec = vec![PyExpr::from(*expr.clone(), self.input_plan.clone())]; @@ -412,10 +413,6 @@ impl PyExpr { PyExpr::from(*expr.clone(), self.input_plan.clone()), PyExpr::from(*pattern.clone(), self.input_plan.clone()), ]), - Expr::ILike(Like { expr, pattern, .. }) => Ok(vec![ - PyExpr::from(*expr.clone(), self.input_plan.clone()), - PyExpr::from(*pattern.clone(), self.input_plan.clone()), - ]), Expr::SimilarTo(Like { expr, pattern, .. }) => Ok(vec![ PyExpr::from(*expr.clone(), self.input_plan.clone()), PyExpr::from(*pattern.clone(), self.input_plan.clone()), @@ -479,13 +476,6 @@ impl PyExpr { "like".to_string() } } - Expr::ILike(Like { negated, .. }) => { - if *negated { - "not ilike".to_string() - } else { - "ilike".to_string() - } - } Expr::SimilarTo(Like { negated, .. }) => { if *negated { "not similar to".to_string() @@ -577,6 +567,11 @@ impl PyExpr { ScalarValue::List(..) => "List", ScalarValue::Struct(..) => "Struct", ScalarValue::FixedSizeBinary(_, _) => "FixedSizeBinary", + ScalarValue::Fixedsizelist(..) => "Fixedsizelist", + ScalarValue::DurationSecond(..) => "DurationSecond", + ScalarValue::DurationMillisecond(..) => "DurationMillisecond", + ScalarValue::DurationMicrosecond(..) => "DurationMicrosecond", + ScalarValue::DurationNanosecond(..) => "DurationNanosecond", }, Expr::ScalarFunction(ScalarFunction { fun, args: _ }) => match fun { BuiltinScalarFunction::Abs => "Abs", @@ -658,7 +653,7 @@ impl PyExpr { pub fn get_filter_expr(&self) -> PyResult> { // TODO refactor to avoid duplication match &self.expr { - Expr::Alias(expr, _) => match expr.as_ref() { + Expr::Alias(Alias { expr, .. }) => match expr.as_ref() { Expr::AggregateFunction(AggregateFunction { filter, .. }) | Expr::AggregateUDF(AggregateUDF { filter, .. }) => match filter { Some(filter) => { @@ -830,7 +825,7 @@ impl PyExpr { match &self.expr { Expr::AggregateFunction(funct) => Ok(funct.distinct), Expr::AggregateUDF { .. } => Ok(false), - Expr::Alias(expr, _) => match expr.as_ref() { + Expr::Alias(Alias { expr, .. }) => match expr.as_ref() { Expr::AggregateFunction(funct) => Ok(funct.distinct), Expr::AggregateUDF { .. } => Ok(false), _ => Err(py_type_err( @@ -871,9 +866,9 @@ impl PyExpr { #[pyo3(name = "getEscapeChar")] pub fn get_escape_char(&self) -> PyResult> { match &self.expr { - Expr::Like(Like { escape_char, .. }) - | Expr::ILike(Like { escape_char, .. }) - | Expr::SimilarTo(Like { escape_char, .. }) => Ok(*escape_char), + Expr::Like(Like { escape_char, .. }) | Expr::SimilarTo(Like { escape_char, .. }) => { + Ok(*escape_char) + } _ => Err(py_type_err(format!( "Provided Expr {:?} not one of Like/ILike/SimilarTo", &self.expr @@ -901,7 +896,7 @@ fn unexpected_literal_value(value: &ScalarValue) -> PyErr { fn get_expr_name(expr: &Expr) -> Result { match expr { - Expr::Alias(expr, _) => get_expr_name(expr), + Expr::Alias(Alias { expr, .. }) => get_expr_name(expr), Expr::Wildcard => { // 'Wildcard' means any and all columns. We get the first valid column name here Ok("*".to_owned()) diff --git a/src/sql/logical/aggregate.rs b/src/sql/logical/aggregate.rs index a36750dba..1c4074239 100644 --- a/src/sql/logical/aggregate.rs +++ b/src/sql/logical/aggregate.rs @@ -1,5 +1,5 @@ use datafusion_python::datafusion_expr::{ - expr::{AggregateFunction, AggregateUDF}, + expr::{AggregateFunction, AggregateUDF, Alias}, logical_plan::{Aggregate, Distinct}, Expr, LogicalPlan, @@ -73,7 +73,7 @@ impl PyAggregate { impl PyAggregate { fn _aggregation_arguments(&self, expr: &Expr) -> PyResult> { match expr { - Expr::Alias(expr, _) => self._aggregation_arguments(expr.as_ref()), + Expr::Alias(Alias { expr, .. }) => self._aggregation_arguments(expr.as_ref()), Expr::AggregateFunction(AggregateFunction { fun: _, args, .. }) | Expr::AggregateUDF(AggregateUDF { fun: _, args, .. }) => match &self.aggregate { Some(e) => py_expr_list(&e.input, args), @@ -88,7 +88,7 @@ impl PyAggregate { fn _agg_func_name(expr: &Expr) -> PyResult { match expr { - Expr::Alias(expr, _) => _agg_func_name(expr.as_ref()), + Expr::Alias(Alias { expr, .. }) => _agg_func_name(expr.as_ref()), Expr::AggregateFunction(AggregateFunction { fun, .. }) => Ok(fun.to_string()), Expr::AggregateUDF(AggregateUDF { fun, .. }) => Ok(fun.name.clone()), _ => Err(py_type_err( @@ -99,7 +99,7 @@ fn _agg_func_name(expr: &Expr) -> PyResult { fn _distinct_agg_expr(expr: &Expr) -> PyResult { match expr { - Expr::Alias(expr, _) => _distinct_agg_expr(expr.as_ref()), + Expr::Alias(Alias { expr, .. }) => _distinct_agg_expr(expr.as_ref()), Expr::AggregateFunction(AggregateFunction { distinct, .. }) => Ok(*distinct), Expr::AggregateUDF { .. } => { // DataFusion does not support DISTINCT in UDAFs diff --git a/src/sql/logical/projection.rs b/src/sql/logical/projection.rs index b954d3b71..56e5e28d8 100644 --- a/src/sql/logical/projection.rs +++ b/src/sql/logical/projection.rs @@ -1,4 +1,9 @@ -use datafusion_python::datafusion_expr::{logical_plan::Projection, Expr, LogicalPlan}; +use datafusion_python::datafusion_expr::{ + expr::Alias, + logical_plan::Projection, + Expr, + LogicalPlan, +}; use pyo3::prelude::*; use crate::{expression::PyExpr, sql::exceptions::py_type_err}; @@ -14,7 +19,7 @@ impl PyProjection { fn projected_expressions(&mut self, local_expr: &PyExpr) -> Vec { let mut projs: Vec = Vec::new(); match &local_expr.expr { - Expr::Alias(expr, _name) => { + Expr::Alias(Alias { expr, .. }) => { let py_expr: PyExpr = PyExpr::from(*expr.clone(), Some(vec![self.projection.input.clone()])); projs.extend_from_slice(self.projected_expressions(&py_expr).as_slice()); @@ -35,9 +40,9 @@ impl PyProjection { PyExpr::from(expression, Some(vec![self.projection.input.clone()])); for expr in self.projected_expressions(&py_expr) { match expr.expr { - Expr::Alias(ex, name) => named.push(( + Expr::Alias(Alias { expr, name }) => named.push(( name.to_string(), - PyExpr::from(*ex, Some(vec![self.projection.input.clone()])), + PyExpr::from(*expr, Some(vec![self.projection.input.clone()])), )), _ => { if let Ok(name) = expr._column_name(&self.projection.input) { diff --git a/src/sql/logical/table_scan.rs b/src/sql/logical/table_scan.rs index 1303f6474..c9cb92ebd 100644 --- a/src/sql/logical/table_scan.rs +++ b/src/sql/logical/table_scan.rs @@ -2,7 +2,12 @@ use std::{sync::Arc, vec}; use datafusion_python::{ datafusion_common::{DFSchema, ScalarValue}, - datafusion_expr::{expr::InList, logical_plan::TableScan, Expr, LogicalPlan}, + datafusion_expr::{ + expr::{Alias, InList}, + logical_plan::TableScan, + Expr, + LogicalPlan, + }, }; use pyo3::prelude::*; @@ -64,7 +69,7 @@ impl PyTableScan { // IF it is something else it is returned to Dask to handle let ident = match *expr.clone() { Expr::Column(col) => Ok(col.name), - Expr::Alias(_, name) => Ok(name), + Expr::Alias(Alias { name, .. }) => Ok(name), Expr::Literal(val) => Ok(format!("{}", val)), _ => Err(DaskPlannerError::InvalidIOFilter(format!( "Invalid InList Expr type `{}`. using in Dask instead", @@ -77,7 +82,7 @@ impl PyTableScan { .iter() .map(|f| match f { Expr::Column(col) => Ok(col.name.clone().into_py(py)), - Expr::Alias(_, name) => Ok(name.clone().into_py(py)), + Expr::Alias(Alias { name, ..}) => Ok(name.clone().into_py(py)), Expr::Literal(val) => match val { ScalarValue::Boolean(val) => Ok(val.unwrap().into_py(py)), ScalarValue::Float32(val) => Ok(val.unwrap().into_py(py)), diff --git a/src/sql/optimizer/dynamic_partition_pruning.rs b/src/sql/optimizer/dynamic_partition_pruning.rs index 8cde26233..bce064182 100644 --- a/src/sql/optimizer/dynamic_partition_pruning.rs +++ b/src/sql/optimizer/dynamic_partition_pruning.rs @@ -537,7 +537,7 @@ fn read_table( .project(projection.clone()) .ok(); if let Some(row_iter) = row_iter_result { - rows.extend(row_iter); + rows.extend(row_iter.map(|r| r.expect("Parquet error encountered"))); } else { // TODO: Investigate cases when this would happen rows.clear(); From 9fd4770eb094230f8301514388ce2c2b18f9aba9 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 25 Jul 2023 09:56:06 -0700 Subject: [PATCH 34/89] Initial unblocker for pyarrow string handling --- dask_sql/mappings.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index ca0e23691..0fac21e9e 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -41,7 +41,9 @@ pd.BooleanDtype(): SqlTypeName.BOOLEAN, str: SqlTypeName.VARCHAR, np.object_: SqlTypeName.VARCHAR, + # TODO: can we identify a case where we should always be using pyarrow strings? pd.StringDtype(): SqlTypeName.VARCHAR, + pd.StringDtype(storage="pyarrow"): SqlTypeName.VARCHAR, np.datetime64: SqlTypeName.TIMESTAMP, } From 36c58ab48569d39e259461f7d82753db6e9d8160 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 25 Jul 2023 10:30:09 -0700 Subject: [PATCH 35/89] Compatibility code for old or no pyarrow installation --- dask_sql/_compat.py | 8 ++++++++ dask_sql/mappings.py | 7 +++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/dask_sql/_compat.py b/dask_sql/_compat.py index be8cfbae5..dc6227f8c 100644 --- a/dask_sql/_compat.py +++ b/dask_sql/_compat.py @@ -3,7 +3,13 @@ import prompt_toolkit from packaging.version import parse as parseVersion +try: + import pyarrow as pa +except ImportError: + pa = None + _pandas_version = parseVersion(pd.__version__) +_pyarrow_version = parseVersion(pa.__version__) if pa else parseVersion("0.0.0") _prompt_toolkit_version = parseVersion(prompt_toolkit.__version__) _dask_version = parseVersion(dask.__version__) @@ -21,3 +27,5 @@ PQ_IS_SUPPORT = parseVersion(dask.__version__) >= parseVersion("2023.3.1") DASK_CUDF_TODATETIME_SUPPORT = _dask_version >= parseVersion("2023.5.1") + +PA_GT_700 = _pyarrow_version >= parseVersion("7.0.0") diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index 0fac21e9e..2f78f957d 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd +from dask_sql._compat import PA_GT_700 from dask_sql._datafusion_lib import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) @@ -41,12 +42,14 @@ pd.BooleanDtype(): SqlTypeName.BOOLEAN, str: SqlTypeName.VARCHAR, np.object_: SqlTypeName.VARCHAR, - # TODO: can we identify a case where we should always be using pyarrow strings? pd.StringDtype(): SqlTypeName.VARCHAR, - pd.StringDtype(storage="pyarrow"): SqlTypeName.VARCHAR, np.datetime64: SqlTypeName.TIMESTAMP, } +# TODO: can we identify a case where we should always be using pyarrow strings? +if PA_GT_700: + _PYTHON_TO_SQL[pd.StringDtype(storage="pyarrow")] = SqlTypeName.VARCHAR + # Default mapping between SQL types and python types # for values _SQL_TO_PYTHON_SCALARS = { From b4b2cdb64a0eb2fae3beb34f3ed4b13edb85b25e Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 25 Jul 2023 16:18:53 -0400 Subject: [PATCH 36/89] Added RexCall Operation to handle InSubquery Expr and also adjusted column_name function to examine InSubquery nested Expr instance for name --- dask_sql/physical/rex/core/call.py | 49 +++++++++++++++++++++++++++++- src/expression.rs | 5 +++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index e513556d0..1c3d2ef56 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -22,6 +22,7 @@ sql_to_python_type, sql_to_python_value, ) +from dask_sql.physical.rel import RelConverter from dask_sql.physical.rex import RexConverter from dask_sql.physical.rex.base import BaseRexPlugin from dask_sql.physical.rex.core.literal import SargPythonImplementation @@ -61,6 +62,12 @@ class Operation: # True, if the operation should also get the REX needs_rex = False + # True, if the operation should also needs the Context, possible subquery Relation expansion + needs_context = False + + # True, if the operation needs the original relation algebra + needs_rel = False + @staticmethod def op_needs_dc(op): return hasattr(op, "needs_dc") and op.needs_dc @@ -69,6 +76,14 @@ def op_needs_dc(op): def op_needs_rex(op): return hasattr(op, "needs_rex") and op.needs_rex + @staticmethod + def op_needs_context(op): + return hasattr(op, "needs_context") and op.needs_context + + @staticmethod + def op_needs_rel(op): + return hasattr(op, "needs_rel") and op.needs_rel + def __init__(self, f: Callable): """Init with the given function""" self.f = f @@ -82,6 +97,8 @@ def of(self, op: "Operation") -> "Operation": new_op = Operation(lambda *x, **kwargs: self(op(*x, **kwargs))) new_op.needs_dc = Operation.op_needs_dc(op) new_op.needs_rex = Operation.op_needs_rex(op) + new_op.needs_context = Operation.op_needs_context(op) + new_op.needs_rel = Operation.op_needs_rel(op) return new_op @@ -987,6 +1004,29 @@ def inList(self, series: dd.Series, *operands, rex=None): return ~result if rex.isNegated() else result +class InSubqueryOperation(Operation): + """ + Returns a boolean of whether an expression is/isn't in a Subquery Expression result + """ + + needs_rex = True + needs_context = True + needs_rel = True + + def __init__(self): + super().__init__(self.inSubquery) + + def inSubquery( + self, series: dd.Series, *operands, rel=None, rex=None, context=None + ): + sub_rel = rex.getSubqueryLogicalPlan() + dc = RelConverter.convert(sub_rel, context=context) + + # Extract the specified column/Series from the Dataframe + fq_column_name = rex.column_name(rel).split(".") + return dc.df[fq_column_name[len(fq_column_name) - 1]] + + class RexCallPlugin(BaseRexPlugin): """ RexCall is used for expressions, which calculate something. @@ -1036,6 +1076,7 @@ class RexCallPlugin(BaseRexPlugin): "negative": NegativeOperation(), "not": NotOperation(), "in list": InListOperation(), + "in subquery": InSubqueryOperation(), "is null": IsNullOperation(), "is not null": NotOperation().of(IsNullOperation()), "is true": IsTrueOperation(), @@ -1139,7 +1180,9 @@ def convert( try: operation = context.schema[schema_name].functions[operator_name] except KeyError: # pragma: no cover - raise NotImplementedError(f"{operator_name} not (yet) implemented") + raise NotImplementedError( + f"RexCall operator '{operator_name}' not (yet) implemented" + ) logger.debug( f"Executing {operator_name} on {[str(LoggableDataFrame(df)) for df in operands]}" @@ -1151,6 +1194,10 @@ def convert( kwargs["dc"] = dc if Operation.op_needs_rex(operation): kwargs["rex"] = expr + if Operation.op_needs_context(operation): + kwargs["context"] = context + if Operation.op_needs_rel(operation): + kwargs["rel"] = rel return operation(*operands, **kwargs) # TODO: We have information on the typing here - we should use it diff --git a/src/expression.rs b/src/expression.rs index 7f5e2ef35..9688d4026 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -166,6 +166,9 @@ impl PyExpr { pub fn subquery_plan(&self) -> PyResult { match &self.expr { Expr::ScalarSubquery(subquery) => Ok(subquery.subquery.as_ref().clone().into()), + Expr::InSubquery(insubquery) => { + Ok(insubquery.subquery.subquery.as_ref().clone().into()) + } _ => Err(py_type_err(format!( "Attempted to extract a LogicalPlan instance from invalid Expr {:?}. Only Subquery and related variants are supported for this operation.", @@ -467,6 +470,7 @@ impl PyExpr { Expr::IsNotFalse(_) => "is not false".to_string(), Expr::IsNotUnknown(_) => "is not unknown".to_string(), Expr::InList { .. } => "in list".to_string(), + Expr::InSubquery(..) => "in subquery".to_string(), Expr::Negative(..) => "negative".to_string(), Expr::Not(..) => "not".to_string(), Expr::Like(Like { negated, .. }) => { @@ -917,6 +921,7 @@ pub fn expr_to_field(expr: &Expr, input_plan: &LogicalPlan) -> Result { // Any column will do. We use the first column to keep things consistent Ok(input_plan.schema().field(0).clone()) } + Expr::InSubquery(insubquery) => expr_to_field(&insubquery.expr, input_plan), _ => { let fields = exprlist_to_fields(&[expr.clone()], input_plan).map_err(DaskPlannerError::from)?; From 336b8eafe3c1443ccd398b088fdbe0dcdf8f9672 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 26 Jul 2023 09:50:05 -0400 Subject: [PATCH 37/89] Add Sarah's fix for datetime.time error --- dask_sql/physical/utils/filter.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dask_sql/physical/utils/filter.py b/dask_sql/physical/utils/filter.py index f99934c07..5e0ef5a65 100644 --- a/dask_sql/physical/utils/filter.py +++ b/dask_sql/physical/utils/filter.py @@ -6,6 +6,7 @@ import dask.dataframe as dd import numpy as np +import pandas as pd from dask.blockwise import Blockwise from dask.highlevelgraph import HighLevelGraph, MaterializedLayer from dask.layers import DataFrameIOLayer @@ -388,6 +389,18 @@ def _regenerate_collection( regen_kwargs = self.creation_info.get("kwargs", {}).copy() regen_kwargs = {k: v for k, v in self.creation_info.get("kwargs", {}).items()} regen_kwargs.update((new_kwargs or {}).get(self.layer.output, {})) + + if "read_parquet" in str(func): + for i in range(len(regen_kwargs["filters"])): + new_filters = [] + for f in regen_kwargs["filters"][i]: + if isinstance(f[2], np.datetime64): + dt = pd.Timestamp(f[2]) + new_filters.append((f[0], f[1], dt)) + else: + new_filters.append(f) + regen_kwargs["filters"][i] = new_filters + result = func(*inputs, *regen_args, **regen_kwargs) _regen_cache[self.layer.output] = result return result From 465e9df1817675acbbb295c779d20a7f08d7fdc0 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 26 Jul 2023 10:18:27 -0400 Subject: [PATCH 38/89] Add condition to guard against complex function names that contain a '.' in their column name --- src/expression.rs | 80 ++++++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 36 deletions(-) diff --git a/src/expression.rs b/src/expression.rs index 9688d4026..12f827fba 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -207,42 +207,49 @@ impl PyExpr { // Handles cases when from_qualified_name doesn't format the Column correctly. // "name" will always contain the name of the column. Anything in addition to // that will be separated by a '.' and should be further referenced. - let parts = name.split('.').collect::>(); - let tbl_reference = match parts.len() { - // Single element means name contains just the column name so no TableReference - 1 => None, - // Tablename.column_name - 2 => Some( - TableReference::Bare { - table: Cow::Borrowed(parts[0]), - } - .to_owned_reference(), - ), - // Schema_name.table_name.column_name - 3 => Some( - TableReference::Partial { - schema: Cow::Borrowed(parts[0]), - table: Cow::Borrowed(parts[1]), - } - .to_owned_reference(), - ), - // catalog_name.schema_name.table_name.column_name - 4 => Some( - TableReference::Full { - catalog: Cow::Borrowed(parts[0]), - schema: Cow::Borrowed(parts[1]), - table: Cow::Borrowed(parts[2]), - } - .to_owned_reference(), - ), - _ => None, - }; - - let col = Column { - relation: tbl_reference.clone(), - name: parts[parts.len() - 1].to_string(), - }; - schema.index_of_column(&col).map_err(py_runtime_err) + match &self.expr { + Expr::Column(col) => { + schema.index_of_column(col).map_err(py_runtime_err) + } + _ => { + let parts = name.split('.').collect::>(); + let tbl_reference = match parts.len() { + // Single element means name contains just the column name so no TableReference + 1 => None, + // Tablename.column_name + 2 => Some( + TableReference::Bare { + table: Cow::Borrowed(parts[0]), + } + .to_owned_reference(), + ), + // Schema_name.table_name.column_name + 3 => Some( + TableReference::Partial { + schema: Cow::Borrowed(parts[0]), + table: Cow::Borrowed(parts[1]), + } + .to_owned_reference(), + ), + // catalog_name.schema_name.table_name.column_name + 4 => Some( + TableReference::Full { + catalog: Cow::Borrowed(parts[0]), + schema: Cow::Borrowed(parts[1]), + table: Cow::Borrowed(parts[2]), + } + .to_owned_reference(), + ), + _ => None, + }; + + let col = Column { + relation: tbl_reference.clone(), + name: parts[parts.len() - 1].to_string(), + }; + schema.index_of_column(&col).map_err(py_runtime_err) + } + } }) } else { // Since this is wildcard any Column will do, just use first one @@ -899,6 +906,7 @@ fn unexpected_literal_value(value: &ScalarValue) -> PyErr { } fn get_expr_name(expr: &Expr) -> Result { + println!("get_expr_name: {:?}", expr); match expr { Expr::Alias(Alias { expr, .. }) => get_expr_name(expr), Expr::Wildcard => { From 483bab579b49d261ca7b437d205651d6919cbac4 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 26 Jul 2023 11:03:30 -0400 Subject: [PATCH 39/89] unmarked xfail for queries 6, 9, & 54 --- tests/unit/test_queries.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/unit/test_queries.py b/tests/unit/test_queries.py index 67120df82..9e19ada4c 100644 --- a/tests/unit/test_queries.py +++ b/tests/unit/test_queries.py @@ -4,9 +4,7 @@ XFAIL_QUERIES = ( 5, - 6, 8, - 9, 10, 14, 16, @@ -25,7 +23,6 @@ 47, 49, 51, - 54, 57, 58, 62, From 955bf4d66c99bdd963ebbc58ab484e2e35c03d4e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 26 Jul 2023 08:52:13 -0700 Subject: [PATCH 40/89] Quick fix for pydantic upstream breakage --- continuous_integration/environment-3.10-dev.yaml | 2 ++ continuous_integration/environment-3.8-dev.yaml | 2 ++ continuous_integration/environment-3.9-dev.yaml | 2 ++ continuous_integration/gpuci/environment-3.10.yaml | 2 ++ continuous_integration/gpuci/environment-3.9.yaml | 2 ++ 5 files changed, 10 insertions(+) diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index 8d0710ec2..39a66d797 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -20,6 +20,8 @@ dependencies: - prompt_toolkit>=3.0.8 - psycopg2 - pyarrow>=6.0.2 +# TODO: remove once fastapi has resolved downstream breakage +- pydantic<1.10.12 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.8-dev.yaml index 2fd4ddad3..1e715659a 100644 --- a/continuous_integration/environment-3.8-dev.yaml +++ b/continuous_integration/environment-3.8-dev.yaml @@ -19,6 +19,8 @@ dependencies: - prompt_toolkit=3.0.8 - psycopg2 - pyarrow=6.0.2 +# TODO: remove once fastapi has resolved downstream breakage +- pydantic<1.10.12 - pygments=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index 67cf0277d..50f858f5c 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -20,6 +20,8 @@ dependencies: - prompt_toolkit>=3.0.8 - psycopg2 - pyarrow>=6.0.2 +# TODO: remove once fastapi has resolved downstream breakage +- pydantic<1.10.12 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 297c7572a..39423b6b2 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -23,6 +23,8 @@ dependencies: - prompt_toolkit>=3.0.8 - psycopg2 - pyarrow>=6.0.2 +# TODO: remove once fastapi has resolved downstream breakage +- pydantic<1.10.12 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index c8600fcfb..2a9f1b906 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -23,6 +23,8 @@ dependencies: - prompt_toolkit>=3.0.8 - psycopg2 - pyarrow>=6.0.2 +# TODO: remove once fastapi has resolved downstream breakage +- pydantic<1.10.12 - pygments>=2.7.1 - pyhive - pytest-cov From 8757515f67d8207b247f87d2f64789f735d0f0b3 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 26 Jul 2023 13:28:15 -0400 Subject: [PATCH 41/89] Update dask_sql/physical/utils/filter.py Co-authored-by: Sarah Yurick <53962159+sarahyurick@users.noreply.github.com> --- dask_sql/physical/utils/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_sql/physical/utils/filter.py b/dask_sql/physical/utils/filter.py index 5e0ef5a65..e6b7c38de 100644 --- a/dask_sql/physical/utils/filter.py +++ b/dask_sql/physical/utils/filter.py @@ -394,7 +394,7 @@ def _regenerate_collection( for i in range(len(regen_kwargs["filters"])): new_filters = [] for f in regen_kwargs["filters"][i]: - if isinstance(f[2], np.datetime64): + if len(f) == 3 and isinstance(f[2], np.datetime64): dt = pd.Timestamp(f[2]) new_filters.append((f[0], f[1], dt)) else: From 5271eea983fc6aa46d3c58351afb00b934683e06 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 27 Jul 2023 09:35:14 -0700 Subject: [PATCH 42/89] Apply Sarah's suggestions --- dask_sql/physical/rex/core/call.py | 6 +++++- dask_sql/physical/utils/filter.py | 6 ++++++ src/sql/optimizer/dynamic_partition_pruning.rs | 2 ++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 1c3d2ef56..1959bb40e 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -46,7 +46,11 @@ def as_timelike(op): if isinstance(op, np.int64): return np.timedelta64(op, "D") elif isinstance(op, str): - return np.datetime64(op) + try: + return np.datetime64(op) + except ValueError: + op = datetime.strptime(op, "%Y-%m-%d") + return np.datetime64(op.strftime("%Y-%m-%d")) elif pd.api.types.is_datetime64_dtype(op) or isinstance(op, np.timedelta64): return op else: diff --git a/dask_sql/physical/utils/filter.py b/dask_sql/physical/utils/filter.py index e6b7c38de..404bfd032 100644 --- a/dask_sql/physical/utils/filter.py +++ b/dask_sql/physical/utils/filter.py @@ -13,6 +13,7 @@ from dask.utils import M, apply, is_arraylike from dask_sql._compat import PQ_IS_SUPPORT, PQ_NOT_IN_SUPPORT +from dask_sql.mappings import parse_datetime logger = logging.getLogger(__name__) @@ -397,6 +398,11 @@ def _regenerate_collection( if len(f) == 3 and isinstance(f[2], np.datetime64): dt = pd.Timestamp(f[2]) new_filters.append((f[0], f[1], dt)) + elif len(f) == 3 and f[1] == "in": + new_tuple = [] + for dt in f[2]: + new_tuple.append(parse_datetime(dt)) + new_filters.append((f[0], f[1], tuple(new_tuple))) else: new_filters.append(f) regen_kwargs["filters"][i] = new_filters diff --git a/src/sql/optimizer/dynamic_partition_pruning.rs b/src/sql/optimizer/dynamic_partition_pruning.rs index bce064182..d7e1a8be5 100644 --- a/src/sql/optimizer/dynamic_partition_pruning.rs +++ b/src/sql/optimizer/dynamic_partition_pruning.rs @@ -783,6 +783,8 @@ fn satisfies_int64(long_value: Option, filter: Expr) -> bool { Expr::Literal(ScalarValue::Float64(i)) => i.unwrap() as i64, Expr::Literal(ScalarValue::TimestampNanosecond(i, None)) => i.unwrap(), Expr::Literal(ScalarValue::Date32(i)) => i64::from(i.unwrap()), + // TODO: Add logic to check if the string can be converted to a timestamp + Expr::Literal(ScalarValue::Utf8(_)) => return false, _ => { panic!("Unknown ScalarValue type {filter_value}"); } From 69441fc516494f39836a1e8da44c827831ec08f5 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 27 Jul 2023 11:14:33 -0700 Subject: [PATCH 43/89] Attempt to unblock failures at parse_datetime --- dask_sql/physical/utils/filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dask_sql/physical/utils/filter.py b/dask_sql/physical/utils/filter.py index 404bfd032..0415d9c96 100644 --- a/dask_sql/physical/utils/filter.py +++ b/dask_sql/physical/utils/filter.py @@ -398,7 +398,7 @@ def _regenerate_collection( if len(f) == 3 and isinstance(f[2], np.datetime64): dt = pd.Timestamp(f[2]) new_filters.append((f[0], f[1], dt)) - elif len(f) == 3 and f[1] == "in": + elif len(f) == 3 and f[1] == "in" and isinstance(f[2][0], str): new_tuple = [] for dt in f[2]: new_tuple.append(parse_datetime(dt)) From 7688f8b85b3d9646e3b7248e1d765d5e801029e6 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 28 Jul 2023 10:43:02 -0700 Subject: [PATCH 44/89] Disable pyarrow strings for now --- conftest.py | 2 ++ dask_sql/_compat.py | 8 -------- dask_sql/mappings.py | 5 ----- dask_sql/physical/rel/logical/window.py | 3 +++ 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/conftest.py b/conftest.py index 0042c9ca7..4d959bb59 100644 --- a/conftest.py +++ b/conftest.py @@ -12,6 +12,8 @@ def pytest_addoption(parser): def pytest_runtest_setup(item): + # TODO: work on adding support for pyarrow strings + dask.config.set({"dataframe.convert-string": False}) if "gpu" in item.keywords: if not item.config.getoption("--rungpu"): pytest.skip("need --rungpu option to run") diff --git a/dask_sql/_compat.py b/dask_sql/_compat.py index dc6227f8c..be8cfbae5 100644 --- a/dask_sql/_compat.py +++ b/dask_sql/_compat.py @@ -3,13 +3,7 @@ import prompt_toolkit from packaging.version import parse as parseVersion -try: - import pyarrow as pa -except ImportError: - pa = None - _pandas_version = parseVersion(pd.__version__) -_pyarrow_version = parseVersion(pa.__version__) if pa else parseVersion("0.0.0") _prompt_toolkit_version = parseVersion(prompt_toolkit.__version__) _dask_version = parseVersion(dask.__version__) @@ -27,5 +21,3 @@ PQ_IS_SUPPORT = parseVersion(dask.__version__) >= parseVersion("2023.3.1") DASK_CUDF_TODATETIME_SUPPORT = _dask_version >= parseVersion("2023.5.1") - -PA_GT_700 = _pyarrow_version >= parseVersion("7.0.0") diff --git a/dask_sql/mappings.py b/dask_sql/mappings.py index 2f78f957d..ca0e23691 100644 --- a/dask_sql/mappings.py +++ b/dask_sql/mappings.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd -from dask_sql._compat import PA_GT_700 from dask_sql._datafusion_lib import DaskTypeMap, SqlTypeName logger = logging.getLogger(__name__) @@ -46,10 +45,6 @@ np.datetime64: SqlTypeName.TIMESTAMP, } -# TODO: can we identify a case where we should always be using pyarrow strings? -if PA_GT_700: - _PYTHON_TO_SQL[pd.StringDtype(storage="pyarrow")] = SqlTypeName.VARCHAR - # Default mapping between SQL types and python types # for values _SQL_TO_PYTHON_SCALARS = { diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index aba788bc3..3b077ea20 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -252,6 +252,9 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai cc = dc.column_container cc = self.fix_column_to_row_type(cc, rel.getRowType()) dc = DataContainer(df, cc) + + breakpoint() + dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) return dc From 19aed3fb0d53370d929c6d0862b76433f52109af Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 28 Jul 2023 11:10:10 -0700 Subject: [PATCH 45/89] Remove breakpoint --- dask_sql/physical/rel/logical/window.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/dask_sql/physical/rel/logical/window.py b/dask_sql/physical/rel/logical/window.py index 3b077ea20..aba788bc3 100644 --- a/dask_sql/physical/rel/logical/window.py +++ b/dask_sql/physical/rel/logical/window.py @@ -252,9 +252,6 @@ def convert(self, rel: "LogicalPlan", context: "dask_sql.Context") -> DataContai cc = dc.column_container cc = self.fix_column_to_row_type(cc, rel.getRowType()) dc = DataContainer(df, cc) - - breakpoint() - dc = self.fix_dtype_to_row_type(dc, rel.getRowType()) return dc From 62ba03b52f529bc84cb778f7ee8fbd4e598ad359 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 8 Aug 2023 11:04:29 -0700 Subject: [PATCH 46/89] Remove pydantic constraint now that fastapi is bumped --- continuous_integration/environment-3.10-dev.yaml | 2 -- continuous_integration/environment-3.8-dev.yaml | 2 -- continuous_integration/environment-3.9-dev.yaml | 2 -- continuous_integration/gpuci/environment-3.10.yaml | 5 ++--- continuous_integration/gpuci/environment-3.9.yaml | 2 -- 5 files changed, 2 insertions(+), 11 deletions(-) diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index b4b878e45..b042f9da0 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -20,8 +20,6 @@ dependencies: - prompt_toolkit>=3.0.8 - psycopg2 - pyarrow>=6.0.2 -# TODO: remove once fastapi has resolved downstream breakage -- pydantic<1.10.12 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.8-dev.yaml index 88b816888..b9ac79f15 100644 --- a/continuous_integration/environment-3.8-dev.yaml +++ b/continuous_integration/environment-3.8-dev.yaml @@ -20,8 +20,6 @@ dependencies: - prompt_toolkit=3.0.8 - psycopg2 - pyarrow=6.0.2 -# TODO: remove once fastapi has resolved downstream breakage -- pydantic<1.10.12 - pygments=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index 09bff2815..95ef02163 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -20,8 +20,6 @@ dependencies: - prompt_toolkit>=3.0.8 - psycopg2 - pyarrow>=6.0.2 -# TODO: remove once fastapi has resolved downstream breakage -- pydantic<1.10.12 - pygments>=2.7.1 - pyhive - pytest-cov diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index cfde8dd99..8e590e4b2 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -23,8 +23,6 @@ dependencies: - prompt_toolkit>=3.0.8 - psycopg2 - pyarrow>=6.0.2 -# TODO: remove once fastapi has resolved downstream breakage -- pydantic<1.10.12 - pygments>=2.7.1 - pyhive - pytest-cov @@ -46,4 +44,5 @@ dependencies: - dask-cuda=23.08 - ucx-proc=*=gpu - ucx-py=0.33 -- xgboost=*rapidsai23.08 +- xgboost=*=rapidsai_py* +- libxgboost=*=rapidsai_h* diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index f16298c7b..2763d8de6 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -23,8 +23,6 @@ dependencies: - prompt_toolkit>=3.0.8 - psycopg2 - pyarrow>=6.0.2 -# TODO: remove once fastapi has resolved downstream breakage -- pydantic<1.10.12 - pygments>=2.7.1 - pyhive - pytest-cov From bbb0dc56ae9d32e27733d5cc706e49b73b8c4918 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 8 Aug 2023 11:07:51 -0700 Subject: [PATCH 47/89] Apply pyproject suggestions --- pyproject.toml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d82b88a7c..f145cbef5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ classifiers = [ "Development Status :: 5 - Production/Stable", "Intended Audience :: Developers", "Intended Audience :: Science/Research", - "License :: OSI Approved :: BSD License", + "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", "Programming Language :: Rust", "Programming Language :: Python", @@ -24,7 +24,6 @@ classifiers = [ "Topic :: System :: Distributed Computing", ] readme = "README.md" -urls = {Homepage = "https://github.com/dask-contrib/dask-sql/"} requires-python = ">=3.8" dependencies = [ "dask[dataframe]>=2022.3.0", @@ -40,6 +39,11 @@ dependencies = [ ] dynamic = ["version"] +[project.urls] +Homepage = "https://github.com/dask-contrib/dask-sql" +Documentation = "https://dask-sql.readthedocs.io" +Source = "https://github.com/dask-contrib/dask-sql" + [project.optional-dependencies] dev = [ "pytest>=6.0.1", From 3706115543b0224eb0713c641f3d196cc88d3645 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 8 Aug 2023 11:27:13 -0700 Subject: [PATCH 48/89] Bump build system to maturin 1.1 --- .github/workflows/release.yml | 6 +++--- continuous_integration/environment-3.10-dev.yaml | 2 +- continuous_integration/environment-3.8-dev.yaml | 2 +- continuous_integration/environment-3.9-dev.yaml | 2 +- continuous_integration/gpuci/environment-3.10.yaml | 2 +- continuous_integration/gpuci/environment-3.9.yaml | 5 +++-- continuous_integration/recipe/conda_build_config.yaml | 2 +- docker/conda.txt | 2 +- docker/main.dockerfile | 2 +- docs/requirements-docs.txt | 2 +- pyproject.toml | 2 +- 11 files changed, 15 insertions(+), 14 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d03d18d3d..66db9f92c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -60,14 +60,14 @@ jobs: CARGO_NET_GIT_FETCH_WITH_CLI="true" PATH="$HOME/.cargo/bin:$HOME/.local/bin:$PATH" CIBW_ENVIRONMENT_WINDOWS: 'PATH="$UserProfile\.cargo\bin;$PATH"' - CIBW_BEFORE_BUILD: 'pip install -U "maturin>=0.15,<0.16"' + CIBW_BEFORE_BUILD: 'pip install -U "maturin>=1.1,<1.2"' CIBW_BEFORE_BUILD_LINUX: > ARCH=$([ $(uname -m) == x86_64 ] && echo x86_64 || echo aarch_64) && DOWNLOAD_URL=$(curl --retry 6 --retry-delay 10 -s https://api.github.com/repos/protocolbuffers/protobuf/releases/latest | grep -o '"browser_download_url": "[^"]*' | cut -d'"' -f4 | grep "\linux-${ARCH}.zip$") && curl --retry 6 --retry-delay 10 -LO $DOWNLOAD_URL && unzip protoc-*-linux-$ARCH.zip -d $HOME/.local && protoc --version && - pip install -U "maturin>=0.15,<0.16" && + pip install -U "maturin>=1.1,<1.2" && pip list && curl --retry 6 --retry-delay 10 https://sh.rustup.rs -sSf | sh -s -- --default-toolchain=stable --profile=minimal -y && rustup show @@ -115,7 +115,7 @@ jobs: channel-priority: strict - name: Build source distribution run: | - mamba install "maturin>=0.15,<0.16" twine + mamba install "maturin>=1.1,<1.2" twine python setup.py sdist - name: Check dist files diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index b042f9da0..b2f66081e 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -11,7 +11,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.15,<0.16 +- maturin>=1.1,<1.2 - mlflow - mock - numpy>=1.21.6 diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.8-dev.yaml index b9ac79f15..5bf6aa424 100644 --- a/continuous_integration/environment-3.8-dev.yaml +++ b/continuous_integration/environment-3.8-dev.yaml @@ -11,7 +11,7 @@ dependencies: - intake=0.6.0 - jsonschema - lightgbm -- maturin=0.15 +- maturin=1.1 - mlflow - mock - numpy=1.21.6 diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index 95ef02163..5df723d45 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -11,7 +11,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.15,<0.16 +- maturin>=1.1,<1.2 - mlflow - mock - numpy>=1.21.6 diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 8e590e4b2..93479ebb1 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -14,7 +14,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.15,<0.16 +- maturin>=1.1,<1.2 - mlflow - mock - numpy>=1.21.6 diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 2763d8de6..985483adc 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -14,7 +14,7 @@ dependencies: - intake>=0.6.0 - jsonschema - lightgbm -- maturin>=0.15,<0.16 +- maturin>=1.1,<1.2 - mlflow - mock - numpy>=1.21.6 @@ -44,4 +44,5 @@ dependencies: - dask-cuda=23.08 - ucx-proc=*=gpu - ucx-py=0.33 -- xgboost=*rapidsai23.08 +- xgboost=*=rapidsai_py* +- libxgboost=*=rapidsai_h* diff --git a/continuous_integration/recipe/conda_build_config.yaml b/continuous_integration/recipe/conda_build_config.yaml index 142300f28..df3cde4e1 100644 --- a/continuous_integration/recipe/conda_build_config.yaml +++ b/continuous_integration/recipe/conda_build_config.yaml @@ -5,4 +5,4 @@ rust_compiler_version: libprotobuf: - 3 maturin: - - 0.15.3 + - 1.1 diff --git a/docker/conda.txt b/docker/conda.txt index 5a3a0740a..5083ebd85 100644 --- a/docker/conda.txt +++ b/docker/conda.txt @@ -21,4 +21,4 @@ intake>=0.6.0 pre-commit>=2.11.1 black=22.10.0 isort=5.12.0 -maturin>=0.15,<0.16 +maturin>=1.1,<1.2 diff --git a/docker/main.dockerfile b/docker/main.dockerfile index 7ebb820fc..9f16958d7 100644 --- a/docker/main.dockerfile +++ b/docker/main.dockerfile @@ -14,7 +14,7 @@ ENV PATH="/root/.cargo/bin:${PATH}" COPY docker/conda.txt /opt/dask_sql/ RUN mamba install -y \ # build requirements - "maturin>=0.15,<0.16" \ + "maturin>=1.1,<1.2" \ # core dependencies "dask>=2022.3.0" \ "pandas>=1.4.0" \ diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index e54284821..cce9cb599 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -11,4 +11,4 @@ tzlocal>=2.1 prompt_toolkit>=3.0.8 pygments>=2.7.1 tabulate -maturin>=0.15,<0.16 +maturin>=1.1,<1.2 diff --git a/pyproject.toml b/pyproject.toml index f145cbef5..3caa92ddb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["maturin>=0.15,<0.16"] +requires = ["maturin>=1.1,<1.2"] build-backend = "maturin" [project] From 1fc8849d8d78d82c549bec497d754aed6b76ea2b Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 9 Aug 2023 12:03:43 -0700 Subject: [PATCH 49/89] Move filter datetime handling, remove string datetime handling for now --- dask_sql/physical/utils/filter.py | 34 +++++++++++++++---------------- 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/dask_sql/physical/utils/filter.py b/dask_sql/physical/utils/filter.py index 0415d9c96..ae564244d 100644 --- a/dask_sql/physical/utils/filter.py +++ b/dask_sql/physical/utils/filter.py @@ -6,14 +6,12 @@ import dask.dataframe as dd import numpy as np -import pandas as pd from dask.blockwise import Blockwise from dask.highlevelgraph import HighLevelGraph, MaterializedLayer from dask.layers import DataFrameIOLayer from dask.utils import M, apply, is_arraylike from dask_sql._compat import PQ_IS_SUPPORT, PQ_NOT_IN_SUPPORT -from dask_sql.mappings import parse_datetime logger = logging.getLogger(__name__) @@ -129,6 +127,22 @@ def attempt_predicate_pushdown( return ddf filters = filters.to_list_tuple() + # FIXME: pyarrow doesn't seem to like converting datetime64[D] to scalars + # so we must convert any we encounter to datetime64[ns] + filters = [ + [ + ( + col, + op, + val.astype("datetime64[ns]") + if isinstance(val, np.datetime64) and val.dtype == "datetime64[D]" + else val, + ) + for col, op, val in sublist + ] + for sublist in filters + ] + # Regenerate collection with filtered IO layer try: _regen_cache = {} @@ -391,22 +405,6 @@ def _regenerate_collection( regen_kwargs = {k: v for k, v in self.creation_info.get("kwargs", {}).items()} regen_kwargs.update((new_kwargs or {}).get(self.layer.output, {})) - if "read_parquet" in str(func): - for i in range(len(regen_kwargs["filters"])): - new_filters = [] - for f in regen_kwargs["filters"][i]: - if len(f) == 3 and isinstance(f[2], np.datetime64): - dt = pd.Timestamp(f[2]) - new_filters.append((f[0], f[1], dt)) - elif len(f) == 3 and f[1] == "in" and isinstance(f[2][0], str): - new_tuple = [] - for dt in f[2]: - new_tuple.append(parse_datetime(dt)) - new_filters.append((f[0], f[1], tuple(new_tuple))) - else: - new_filters.append(f) - regen_kwargs["filters"][i] = new_filters - result = func(*inputs, *regen_args, **regen_kwargs) _regen_cache[self.layer.output] = result return result From 510b0638da4308e8f072ca1c9b1120e67c35d984 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Wed, 9 Aug 2023 13:24:54 -0700 Subject: [PATCH 50/89] Actually check containment in InSubquery call --- dask_sql/physical/rex/core/call.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index 1959bb40e..bce6e11c1 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -1,6 +1,7 @@ import logging import operator import re +import warnings from datetime import datetime from functools import partial, reduce from typing import TYPE_CHECKING, Any, Callable, Union @@ -1028,7 +1029,17 @@ def inSubquery( # Extract the specified column/Series from the Dataframe fq_column_name = rex.column_name(rel).split(".") - return dc.df[fq_column_name[len(fq_column_name) - 1]] + + # FIXME: dask's isin doesn't support dask frames as arguments + # so we need to compute here + col = dc.df[fq_column_name[-1]].compute() + + warnings.warn( + "Dask doesn't support Dask frames as input for .isin, so we must force an early computation", + ResourceWarning, + ) + + return series.isin(col) class RexCallPlugin(BaseRexPlugin): From 3d4d9487f48c05950fda5ec7bb26ab5a7f8cb202 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 9 Aug 2023 18:07:55 -0400 Subject: [PATCH 51/89] bring back decorrelated_where_exists and decorrelate_when_in --- src/sql/optimizer.rs | 17 +- src/sql/optimizer/decorrelate_where_exists.rs | 783 +++++++++++ src/sql/optimizer/decorrelate_where_in.rs | 1191 +++++++++++++++++ src/sql/optimizer/utils.rs | 512 +++++++ 4 files changed, 2498 insertions(+), 5 deletions(-) create mode 100644 src/sql/optimizer/decorrelate_where_exists.rs create mode 100644 src/sql/optimizer/decorrelate_where_in.rs create mode 100644 src/sql/optimizer/utils.rs diff --git a/src/sql/optimizer.rs b/src/sql/optimizer.rs index b9fdaca06..85f335572 100644 --- a/src/sql/optimizer.rs +++ b/src/sql/optimizer.rs @@ -1,3 +1,10 @@ +// Declare optimizer modules +pub mod decorrelate_where_exists; +pub mod decorrelate_where_in; +pub mod dynamic_partition_pruning; +pub mod join_reorder; +pub mod utils; + use std::sync::Arc; use datafusion_python::{ @@ -20,13 +27,11 @@ use datafusion_python::{ OptimizerContext, }, }; -use log::{debug, trace}; - -mod dynamic_partition_pruning; +use decorrelate_where_exists::DecorrelateWhereExists; +use decorrelate_where_in::DecorrelateWhereIn; use dynamic_partition_pruning::DynamicPartitionPruning; - -mod join_reorder; use join_reorder::JoinReorder; +use log::{debug, trace}; /// Houses the optimization logic for Dask-SQL. This optimization controls the optimizations /// and their ordering in regards to their impact on the underlying `LogicalPlan` instance @@ -44,6 +49,8 @@ impl DaskSqlOptimizer { Arc::new(SimplifyExpressions::new()), Arc::new(UnwrapCastInComparison::new()), // Arc::new(ReplaceDistinctWithAggregate::new()), + Arc::new(DecorrelateWhereExists::new()), + Arc::new(DecorrelateWhereIn::new()), Arc::new(ScalarSubqueryToJoin::new()), //Arc::new(ExtractEquijoinPredicate::new()), diff --git a/src/sql/optimizer/decorrelate_where_exists.rs b/src/sql/optimizer/decorrelate_where_exists.rs new file mode 100644 index 000000000..bd2d3348f --- /dev/null +++ b/src/sql/optimizer/decorrelate_where_exists.rs @@ -0,0 +1,783 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion_python::{ + datafusion_common::{Column, DataFusionError, Result}, + datafusion_expr::{ + expr::Exists, + logical_plan::{Distinct, Filter, JoinType, Subquery}, + Expr, + LogicalPlan, + LogicalPlanBuilder, + }, + datafusion_optimizer::optimizer::{ApplyOrder, OptimizerConfig, OptimizerRule}, +}; + +use crate::sql::optimizer::utils::{ + collect_subquery_cols, + conjunction, + extract_join_filters, + split_conjunction, +}; + +/// Optimizer rule for rewriting subquery filters to joins +#[derive(Default)] +pub struct DecorrelateWhereExists {} + +impl DecorrelateWhereExists { + #[allow(missing_docs)] + pub fn new() -> Self { + Self {} + } + + /// Finds expressions that have a where in subquery (and recurse when found) + /// + /// # Arguments + /// + /// * `predicate` - A conjunction to split and search + /// * `optimizer_config` - For generating unique subquery aliases + /// + /// Returns a tuple (subqueries, non-subquery expressions) + fn extract_subquery_exprs( + &self, + predicate: &Expr, + config: &dyn OptimizerConfig, + ) -> Result<(Vec, Vec)> { + let filters = split_conjunction(predicate); + + let mut subqueries = vec![]; + let mut others = vec![]; + for it in filters.iter() { + match it { + Expr::Exists(Exists { subquery, negated }) => { + let subquery_plan = self + .try_optimize(&subquery.subquery, config)? + .map(Arc::new) + .unwrap_or_else(|| subquery.subquery.clone()); + let new_subquery = subquery.with_plan(subquery_plan); + subqueries.push(SubqueryInfo::new(new_subquery, *negated)); + } + _ => others.push((*it).clone()), + } + } + + Ok((subqueries, others)) + } +} + +impl OptimizerRule for DecorrelateWhereExists { + fn try_optimize( + &self, + plan: &LogicalPlan, + config: &dyn OptimizerConfig, + ) -> Result> { + match plan { + LogicalPlan::Filter(filter) => { + let (subqueries, other_exprs) = + self.extract_subquery_exprs(&filter.predicate, config)?; + if subqueries.is_empty() { + // regular filter, no subquery exists clause here + return Ok(None); + } + + // iterate through all exists clauses in predicate, turning each into a join + let mut cur_input = filter.input.as_ref().clone(); + for subquery in subqueries { + if let Some(x) = optimize_exists(&subquery, &cur_input)? { + cur_input = x; + } else { + return Ok(None); + } + } + + let expr = conjunction(other_exprs); + if let Some(expr) = expr { + let new_filter = Filter::try_new(expr, Arc::new(cur_input))?; + cur_input = LogicalPlan::Filter(new_filter); + } + + Ok(Some(cur_input)) + } + _ => Ok(None), + } + } + + fn name(&self) -> &str { + "decorrelate_where_exists" + } + + fn apply_order(&self) -> Option { + Some(ApplyOrder::TopDown) + } +} + +/// Takes a query like: +/// +/// SELECT t1.id +/// FROM t1 +/// WHERE exists +/// ( +/// SELECT t2.id FROM t2 WHERE t1.id = t2.id +/// ) +/// +/// and optimizes it into: +/// +/// SELECT t1.id +/// FROM t1 LEFT SEMI +/// JOIN t2 +/// ON t1.id = t2.id +/// +/// # Arguments +/// +/// * query_info - The subquery and negated(exists/not exists) info. +/// * outer_input - The non-subquery portion (relation t1) +fn optimize_exists( + query_info: &SubqueryInfo, + outer_input: &LogicalPlan, +) -> Result> { + let subquery = query_info.query.subquery.as_ref(); + if let Some((join_filter, optimized_subquery)) = optimize_subquery(subquery)? { + // join our sub query into the main plan + let join_type = match query_info.negated { + true => JoinType::LeftAnti, + false => JoinType::LeftSemi, + }; + + let new_plan = LogicalPlanBuilder::from(outer_input.clone()) + .join( + optimized_subquery, + join_type, + (Vec::::new(), Vec::::new()), + Some(join_filter), + )? + .build()?; + + Ok(Some(new_plan)) + } else { + Ok(None) + } +} +/// Optimize the subquery and extract the possible join filter. +/// This function can't optimize non-correlated subquery, and will return None. +fn optimize_subquery(subquery: &LogicalPlan) -> Result> { + match subquery { + LogicalPlan::Distinct(subqry_distinct) => { + let distinct_input = &subqry_distinct.input; + let optimized_plan = optimize_subquery(distinct_input)?.map(|(filters, right)| { + ( + filters, + LogicalPlan::Distinct(Distinct { + input: Arc::new(right), + }), + ) + }); + Ok(optimized_plan) + } + LogicalPlan::Projection(projection) => { + // extract join filters + let (join_filters, subquery_input) = extract_join_filters(&projection.input)?; + // cannot optimize non-correlated subquery + if join_filters.is_empty() { + return Ok(None); + } + let input_schema = subquery_input.schema(); + let project_exprs: Vec = + collect_subquery_cols(&join_filters, input_schema.clone())? + .into_iter() + .map(Expr::Column) + .collect(); + let right = LogicalPlanBuilder::from(subquery_input) + .project(project_exprs)? + .build()?; + + // join_filters is not empty. + let join_filter = conjunction(join_filters).ok_or_else(|| { + DataFusionError::Internal("join filters should not be empty".to_string()) + })?; + Ok(Some((join_filter, right))) + } + _ => Ok(None), + } +} + +struct SubqueryInfo { + query: Subquery, + negated: bool, +} + +impl SubqueryInfo { + pub fn new(query: Subquery, negated: bool) -> Self { + Self { query, negated } + } +} + +#[cfg(test)] +mod tests { + use std::ops::Add; + + use arrow::datatypes::DataType; + use datafusion_common::Result; + use datafusion_expr::{ + col, + exists, + lit, + logical_plan::LogicalPlanBuilder, + not_exists, + out_ref_col, + }; + + use super::*; + use crate::test::*; + + fn assert_plan_eq(plan: &LogicalPlan, expected: &str) -> Result<()> { + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereExists::new()), + plan, + expected, + ); + Ok(()) + } + + /// Test for multiple exists subqueries in the same filter expression + #[test] + fn multiple_subqueries() -> Result<()> { + let orders = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + col("orders.o_custkey").eq(out_ref_col(DataType::Int64, "customer.c_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(orders.clone()).and(exists(orders)))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: orders.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8]\ + \n LeftSemi Join: Filter: orders.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n Projection: orders.o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n Projection: orders.o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + assert_plan_eq(&plan, expected) + } + + /// Test recursive correlated subqueries + #[test] + fn recursive_subqueries() -> Result<()> { + let lineitem = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("lineitem")) + .filter( + col("lineitem.l_orderkey") + .eq(out_ref_col(DataType::Int64, "orders.o_orderkey")), + )? + .project(vec![col("lineitem.l_orderkey")])? + .build()?, + ); + + let orders = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter(exists(lineitem).and( + col("orders.o_custkey").eq(out_ref_col(DataType::Int64, "customer.c_custkey")), + ))? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(orders))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: orders.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n Projection: orders.o_custkey [o_custkey:Int64]\ + \n LeftSemi Join: Filter: lineitem.l_orderkey = orders.o_orderkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n Projection: lineitem.l_orderkey [l_orderkey:Int64]\ + \n TableScan: lineitem [l_orderkey:Int64, l_partkey:Int64, l_suppkey:Int64, l_linenumber:Int32, l_quantity:Float64, l_extendedprice:Float64]"; + assert_plan_eq(&plan, expected) + } + + /// Test for correlated exists subquery filter with additional subquery filters + #[test] + fn exists_subquery_with_subquery_filters() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey") + .eq(col("orders.o_custkey")) + .and(col("o_orderkey").eq(lit(1))), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = orders.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n Projection: orders.o_custkey [o_custkey:Int64]\ + \n Filter: orders.o_orderkey = Int32(1) [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_plan_eq(&plan, expected) + } + + #[test] + fn exists_subquery_no_cols() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter(out_ref_col(DataType::Int64, "customer.c_custkey").eq(lit(1u32)))? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + // Other rule will pushdown `customer.c_custkey = 1`, + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = UInt32(1) [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n Projection: []\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_plan_eq(&plan, expected) + } + + /// Test for exists subquery with both columns in schema + #[test] + fn exists_subquery_with_no_correlated_cols() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter(col("orders.o_custkey").eq(col("orders.o_custkey")))? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + assert_optimization_skipped(Arc::new(DecorrelateWhereExists::new()), &plan) + } + + /// Test for correlated exists subquery not equal + #[test] + fn exists_subquery_where_not_eq() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey") + .not_eq(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey != orders.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n Projection: orders.o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_plan_eq(&plan, expected) + } + + /// Test for correlated exists subquery less than + #[test] + fn exists_subquery_where_less_than() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey").lt(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey < orders.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n Projection: orders.o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_plan_eq(&plan, expected) + } + + /// Test for correlated exists subquery filter with subquery disjunction + #[test] + fn exists_subquery_with_subquery_disjunction() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey") + .eq(col("orders.o_custkey")) + .or(col("o_orderkey").eq(lit(1))), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = orders.o_custkey OR orders.o_orderkey = Int32(1) [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n Projection: orders.o_custkey, orders.o_orderkey [o_custkey:Int64, o_orderkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_plan_eq(&plan, expected) + } + + /// Test for correlated exists without projection + #[test] + fn exists_subquery_no_projection() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter(col("customer.c_custkey").eq(col("orders.o_custkey")))? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + assert_optimization_skipped(Arc::new(DecorrelateWhereExists::new()), &plan) + } + + /// Test for correlated exists expressions + #[test] + fn exists_subquery_project_expr() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey").add(lit(1))])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = orders.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n Projection: orders.o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_plan_eq(&plan, expected) + } + + /// Test for correlated exists subquery filter with additional filters + #[test] + fn should_support_additional_filters() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(sq).and(col("c_custkey").eq(lit(1))))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n Filter: customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8]\ + \n LeftSemi Join: Filter: customer.c_custkey = orders.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n Projection: orders.o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_plan_eq(&plan, expected) + } + + /// Test for correlated exists subquery filter with disjunctions + #[test] + fn exists_subquery_disjunction() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter(col("customer.c_custkey").eq(col("orders.o_custkey")))? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(exists(sq).or(col("customer.c_custkey").eq(lit(1))))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + // not optimized + let expected = r#"Projection: customer.c_custkey [c_custkey:Int64] + Filter: EXISTS () OR customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8] + Subquery: [o_custkey:Int64] + Projection: orders.o_custkey [o_custkey:Int64] + Filter: customer.c_custkey = orders.o_custkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] + TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] + TableScan: customer [c_custkey:Int64, c_name:Utf8]"#; + + assert_plan_eq(&plan, expected) + } + + /// Test for correlated EXISTS subquery filter + #[test] + fn exists_subquery_correlated() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(test_table_scan_with_name("sq")?) + .filter(out_ref_col(DataType::UInt32, "test.a").eq(col("sq.a")))? + .project(vec![col("c")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(test_table_scan_with_name("test")?) + .filter(exists(sq))? + .project(vec![col("test.c")])? + .build()?; + + let expected = "Projection: test.c [c:UInt32]\ + \n LeftSemi Join: Filter: test.a = sq.a [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n Projection: sq.a [a:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_plan_eq(&plan, expected) + } + + /// Test for single exists subquery filter + #[test] + fn exists_subquery_simple() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(exists(test_subquery_with_name("sq")?))? + .project(vec![col("test.b")])? + .build()?; + + assert_optimization_skipped(Arc::new(DecorrelateWhereExists::new()), &plan) + } + + /// Test for single NOT exists subquery filter + #[test] + fn not_exists_subquery_simple() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(not_exists(test_subquery_with_name("sq")?))? + .project(vec![col("test.b")])? + .build()?; + + assert_optimization_skipped(Arc::new(DecorrelateWhereExists::new()), &plan) + } + + #[test] + fn two_exists_subquery_with_outer_filter() -> Result<()> { + let table_scan = test_table_scan()?; + let subquery_scan1 = test_table_scan_with_name("sq1")?; + let subquery_scan2 = test_table_scan_with_name("sq2")?; + + let subquery1 = LogicalPlanBuilder::from(subquery_scan1) + .filter(out_ref_col(DataType::UInt32, "test.a").eq(col("sq1.a")))? + .project(vec![col("c")])? + .build()?; + + let subquery2 = LogicalPlanBuilder::from(subquery_scan2) + .filter(out_ref_col(DataType::UInt32, "test.a").eq(col("sq2.a")))? + .project(vec![col("c")])? + .build()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .filter( + exists(Arc::new(subquery1)) + .and(exists(Arc::new(subquery2)).and(col("test.c").gt(lit(1u32)))), + )? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n Filter: test.c > UInt32(1) [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.a = sq2.a [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.a = sq1.a [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n Projection: sq1.a [a:UInt32]\ + \n TableScan: sq1 [a:UInt32, b:UInt32, c:UInt32]\ + \n Projection: sq2.a [a:UInt32]\ + \n TableScan: sq2 [a:UInt32, b:UInt32, c:UInt32]"; + + assert_plan_eq(&plan, expected) + } + + #[test] + fn exists_subquery_expr_filter() -> Result<()> { + let table_scan = test_table_scan()?; + let subquery_scan = test_table_scan_with_name("sq")?; + let subquery = LogicalPlanBuilder::from(subquery_scan) + .filter( + (lit(1u32) + col("sq.a")).gt(out_ref_col(DataType::UInt32, "test.a") * lit(2u32)), + )? + .project(vec![lit(1u32)])? + .build()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(exists(Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: UInt32(1) + sq.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n Projection: sq.a [a:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_plan_eq(&plan, expected) + } + + #[test] + fn exists_subquery_with_same_table() -> Result<()> { + let outer_scan = test_table_scan()?; + let subquery_scan = test_table_scan()?; + let subquery = LogicalPlanBuilder::from(subquery_scan) + .filter(col("test.a").gt(col("test.b")))? + .project(vec![col("c")])? + .build()?; + + let plan = LogicalPlanBuilder::from(outer_scan) + .filter(exists(Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + // Subquery and outer query refer to the same table. + let expected = "Projection: test.b [b:UInt32]\ + \n Filter: EXISTS () [a:UInt32, b:UInt32, c:UInt32]\ + \n Subquery: [c:UInt32]\ + \n Projection: test.c [c:UInt32]\ + \n Filter: test.a > test.b [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + + assert_plan_eq(&plan, expected) + } + + #[test] + fn exists_distinct_subquery() -> Result<()> { + let table_scan = test_table_scan()?; + let subquery_scan = test_table_scan_with_name("sq")?; + let subquery = LogicalPlanBuilder::from(subquery_scan) + .filter( + (lit(1u32) + col("sq.a")).gt(out_ref_col(DataType::UInt32, "test.a") * lit(2u32)), + )? + .project(vec![col("sq.c")])? + .distinct()? + .build()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(exists(Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: UInt32(1) + sq.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n Distinct: [a:UInt32]\ + \n Projection: sq.a [a:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_plan_eq(&plan, expected) + } + + #[test] + fn exists_distinct_expr_subquery() -> Result<()> { + let table_scan = test_table_scan()?; + let subquery_scan = test_table_scan_with_name("sq")?; + let subquery = LogicalPlanBuilder::from(subquery_scan) + .filter( + (lit(1u32) + col("sq.a")).gt(out_ref_col(DataType::UInt32, "test.a") * lit(2u32)), + )? + .project(vec![col("sq.b") + col("sq.c")])? + .distinct()? + .build()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(exists(Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: UInt32(1) + sq.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n Distinct: [a:UInt32]\ + \n Projection: sq.a [a:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_plan_eq(&plan, expected) + } + + #[test] + fn exists_distinct_subquery_with_literal() -> Result<()> { + let table_scan = test_table_scan()?; + let subquery_scan = test_table_scan_with_name("sq")?; + let subquery = LogicalPlanBuilder::from(subquery_scan) + .filter( + (lit(1u32) + col("sq.a")).gt(out_ref_col(DataType::UInt32, "test.a") * lit(2u32)), + )? + .project(vec![lit(1u32), col("sq.c")])? + .distinct()? + .build()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(exists(Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: UInt32(1) + sq.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n Distinct: [a:UInt32]\ + \n Projection: sq.a [a:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_plan_eq(&plan, expected) + } +} diff --git a/src/sql/optimizer/decorrelate_where_in.rs b/src/sql/optimizer/decorrelate_where_in.rs new file mode 100644 index 000000000..d3907293a --- /dev/null +++ b/src/sql/optimizer/decorrelate_where_in.rs @@ -0,0 +1,1191 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::sync::Arc; + +use datafusion_python::{ + datafusion_common::{alias::AliasGenerator, context, Column, Result}, + datafusion_expr::{ + expr::InSubquery, + expr_rewriter::unnormalize_col, + logical_plan::{JoinType, Projection, Subquery}, + Expr, + Filter, + LogicalPlan, + LogicalPlanBuilder, + }, + datafusion_optimizer::optimizer::{ApplyOrder, OptimizerConfig, OptimizerRule}, +}; +use log::debug; + +use crate::sql::optimizer::utils::{ + collect_subquery_cols, + conjunction, + extract_join_filters, + only_or_err, + replace_qualified_name, + split_conjunction, +}; + +#[derive(Default)] +pub struct DecorrelateWhereIn { + alias: AliasGenerator, +} + +impl DecorrelateWhereIn { + #[allow(missing_docs)] + pub fn new() -> Self { + Self::default() + } + + /// Finds expressions that have a where in subquery (and recurses when found) + /// + /// # Arguments + /// + /// * `predicate` - A conjunction to split and search + /// * `optimizer_config` - For generating unique subquery aliases + /// + /// Returns a tuple (subqueries, non-subquery expressions) + fn extract_subquery_exprs( + &self, + predicate: &Expr, + config: &dyn OptimizerConfig, + ) -> Result<(Vec, Vec)> { + let filters = split_conjunction(predicate); // TODO: disjunctions + + let mut subqueries = vec![]; + let mut others = vec![]; + for it in filters.iter() { + match it { + Expr::InSubquery(InSubquery { + expr, + subquery, + negated, + }) => { + let subquery_plan = self + .try_optimize(&subquery.subquery, config)? + .map(Arc::new) + .unwrap_or_else(|| subquery.subquery.clone()); + let new_subquery = subquery.with_plan(subquery_plan); + subqueries.push(SubqueryInfo::new(new_subquery, (**expr).clone(), *negated)); + // TODO: if subquery doesn't get optimized, optimized children are lost + } + _ => others.push((*it).clone()), + } + } + + Ok((subqueries, others)) + } +} + +impl OptimizerRule for DecorrelateWhereIn { + fn try_optimize( + &self, + plan: &LogicalPlan, + config: &dyn OptimizerConfig, + ) -> Result> { + match plan { + LogicalPlan::Filter(filter) => { + let (subqueries, other_exprs) = + self.extract_subquery_exprs(&filter.predicate, config)?; + if subqueries.is_empty() { + // regular filter, no subquery exists clause here + return Ok(None); + } + + // iterate through all exists clauses in predicate, turning each into a join + let mut cur_input = filter.input.as_ref().clone(); + for subquery in subqueries { + cur_input = optimize_where_in(&subquery, &cur_input, &self.alias)?; + } + + let expr = conjunction(other_exprs); + if let Some(expr) = expr { + let new_filter = Filter::try_new(expr, Arc::new(cur_input))?; + cur_input = LogicalPlan::Filter(new_filter); + } + + Ok(Some(cur_input)) + } + _ => Ok(None), + } + } + + fn name(&self) -> &str { + "decorrelate_where_in" + } + + fn apply_order(&self) -> Option { + Some(ApplyOrder::TopDown) + } +} + +/// Optimize the where in subquery to left-anti/left-semi join. +/// If the subquery is a correlated subquery, we need extract the join predicate from the subquery. +/// +/// For example, given a query like: +/// `select t1.a, t1.b from t1 where t1 in (select t2.a from t2 where t1.b = t2.b and t1.c > t2.c)` +/// +/// The optimized plan will be: +/// +/// ```text +/// Projection: t1.a, t1.b +/// LeftSemi Join: Filter: t1.a = __correlated_sq_1.a AND t1.b = __correlated_sq_1.b AND t1.c > __correlated_sq_1.c +/// TableScan: t1 +/// SubqueryAlias: __correlated_sq_1 +/// Projection: t2.a AS a, t2.b, t2.c +/// TableScan: t2 +/// ``` +fn optimize_where_in( + query_info: &SubqueryInfo, + left: &LogicalPlan, + alias: &AliasGenerator, +) -> Result { + let projection = Projection::try_from_plan(&query_info.query.subquery) + .map_err(|e| context!("a projection is required", e))?; + let subquery_input = projection.input.clone(); + // TODO add the validate logic to Analyzer + let subquery_expr = only_or_err(projection.expr.as_slice()) + .map_err(|e| context!("single expression projection required", e))?; + + // extract join filters + let (join_filters, subquery_input) = extract_join_filters(subquery_input.as_ref())?; + + // in_predicate may be also include in the join filters, remove it from the join filters. + let in_predicate = Expr::eq(query_info.where_in_expr.clone(), subquery_expr.clone()); + let join_filters = remove_duplicated_filter(join_filters, in_predicate); + + // replace qualified name with subquery alias. + let subquery_alias = alias.next("__correlated_sq"); + let input_schema = subquery_input.schema(); + let mut subquery_cols = collect_subquery_cols(&join_filters, input_schema.clone())?; + let join_filter = conjunction(join_filters).map_or(Ok(None), |filter| { + replace_qualified_name(filter, &subquery_cols, &subquery_alias).map(Option::Some) + })?; + + // add projection + if let Expr::Column(col) = subquery_expr { + subquery_cols.remove(col); + } + let subquery_expr_name = format!("{:?}", unnormalize_col(subquery_expr.clone())); + let first_expr = subquery_expr.clone().alias(subquery_expr_name.clone()); + let projection_exprs: Vec = [first_expr] + .into_iter() + .chain(subquery_cols.into_iter().map(Expr::Column)) + .collect(); + + let right = LogicalPlanBuilder::from(subquery_input) + .project(projection_exprs)? + .alias(subquery_alias.clone())? + .build()?; + + // join our sub query into the main plan + let join_type = match query_info.negated { + true => JoinType::LeftAnti, + false => JoinType::LeftSemi, + }; + let right_join_col = Column::new(Some(subquery_alias), subquery_expr_name); + let in_predicate = Expr::eq( + query_info.where_in_expr.clone(), + Expr::Column(right_join_col), + ); + let join_filter = join_filter + .map(|filter| in_predicate.clone().and(filter)) + .unwrap_or_else(|| in_predicate); + + let new_plan = LogicalPlanBuilder::from(left.clone()) + .join( + right, + join_type, + (Vec::::new(), Vec::::new()), + Some(join_filter), + )? + .build()?; + + debug!("where in optimized:\n{}", new_plan.display_indent()); + Ok(new_plan) +} + +fn remove_duplicated_filter(filters: Vec, in_predicate: Expr) -> Vec { + filters + .into_iter() + .filter(|filter| { + if filter == &in_predicate { + return false; + } + + // ignore the binary order + !match (filter, &in_predicate) { + (Expr::BinaryExpr(a_expr), Expr::BinaryExpr(b_expr)) => { + (a_expr.op == b_expr.op) + && (a_expr.left == b_expr.left && a_expr.right == b_expr.right) + || (a_expr.left == b_expr.right && a_expr.right == b_expr.left) + } + _ => false, + } + }) + .collect::>() +} + +struct SubqueryInfo { + query: Subquery, + where_in_expr: Expr, + negated: bool, +} + +impl SubqueryInfo { + pub fn new(query: Subquery, expr: Expr, negated: bool) -> Self { + Self { + query, + where_in_expr: expr, + negated, + } + } +} + +#[cfg(test)] +mod tests { + use arrow::datatypes::DataType; + use datafusion_common::Result; + use datafusion_expr::{ + and, + binary_expr, + col, + in_subquery, + lit, + logical_plan::LogicalPlanBuilder, + not_in_subquery, + or, + out_ref_col, + Operator, + }; + + use super::*; + use crate::test::*; + + fn assert_optimized_plan_equal(plan: &LogicalPlan, expected: &str) -> Result<()> { + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + plan, + expected, + ); + Ok(()) + } + + fn test_subquery_with_name(name: &str) -> Result> { + let table_scan = test_table_scan_with_name(name)?; + Ok(Arc::new( + LogicalPlanBuilder::from(table_scan) + .project(vec![col("c")])? + .build()?, + )) + } + + /// Test for several IN subquery expressions + #[test] + fn in_subquery_multiple() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(and( + in_subquery(col("c"), test_subquery_with_name("sq_1")?), + in_subquery(col("b"), test_subquery_with_name("sq_2")?), + ))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: test.b = __correlated_sq_2.c [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ + \n Projection: sq_1.c AS c [c:UInt32]\ + \n TableScan: sq_1 [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_2 [c:UInt32]\ + \n Projection: sq_2.c AS c [c:UInt32]\ + \n TableScan: sq_2 [a:UInt32, b:UInt32, c:UInt32]"; + assert_optimized_plan_equal(&plan, expected) + } + + /// Test for IN subquery with additional AND filter + #[test] + fn in_subquery_with_and_filters() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(and( + in_subquery(col("c"), test_subquery_with_name("sq")?), + and( + binary_expr(col("a"), Operator::Eq, lit(1_u32)), + binary_expr(col("b"), Operator::Lt, lit(30_u32)), + ), + ))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n Filter: test.a = UInt32(1) AND test.b < UInt32(30) [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ + \n Projection: sq.c AS c [c:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_equal(&plan, expected) + } + + /// Test for IN subquery with additional OR filter + /// filter expression not modified + #[test] + fn in_subquery_with_or_filters() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(or( + and( + binary_expr(col("a"), Operator::Eq, lit(1_u32)), + binary_expr(col("b"), Operator::Lt, lit(30_u32)), + ), + in_subquery(col("c"), test_subquery_with_name("sq")?), + ))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n Filter: test.a = UInt32(1) AND test.b < UInt32(30) OR test.c IN () [a:UInt32, b:UInt32, c:UInt32]\ + \n Subquery: [c:UInt32]\ + \n Projection: sq.c [c:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_equal(&plan, expected) + } + + #[test] + fn in_subquery_with_and_or_filters() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(and( + or( + binary_expr(col("a"), Operator::Eq, lit(1_u32)), + in_subquery(col("b"), test_subquery_with_name("sq1")?), + ), + in_subquery(col("c"), test_subquery_with_name("sq2")?), + ))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n Filter: test.a = UInt32(1) OR test.b IN () [a:UInt32, b:UInt32, c:UInt32]\ + \n Subquery: [c:UInt32]\ + \n Projection: sq1.c [c:UInt32]\ + \n TableScan: sq1 [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ + \n Projection: sq2.c AS c [c:UInt32]\ + \n TableScan: sq2 [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_equal(&plan, expected) + } + + /// Test for nested IN subqueries + #[test] + fn in_subquery_nested() -> Result<()> { + let table_scan = test_table_scan()?; + + let subquery = LogicalPlanBuilder::from(test_table_scan_with_name("sq")?) + .filter(in_subquery(col("a"), test_subquery_with_name("sq_nested")?))? + .project(vec![col("a")])? + .build()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .filter(in_subquery(col("b"), Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: test.b = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [a:UInt32]\ + \n Projection: sq.a AS a [a:UInt32]\ + \n LeftSemi Join: Filter: sq.a = __correlated_sq_2.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_2 [c:UInt32]\ + \n Projection: sq_nested.c AS c [c:UInt32]\ + \n TableScan: sq_nested [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_equal(&plan, expected) + } + + /// Test for filter input modification in case filter not supported + /// Outer filter expression not modified while inner converted to join + #[test] + fn in_subquery_input_modified() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(in_subquery(col("c"), test_subquery_with_name("sq_inner")?))? + .project(vec![col("b"), col("c")])? + .alias("wrapped")? + .filter(or( + binary_expr(col("b"), Operator::Lt, lit(30_u32)), + in_subquery(col("c"), test_subquery_with_name("sq_outer")?), + ))? + .project(vec![col("b")])? + .build()?; + + let expected = "Projection: wrapped.b [b:UInt32]\ + \n Filter: wrapped.b < UInt32(30) OR wrapped.c IN () [b:UInt32, c:UInt32]\ + \n Subquery: [c:UInt32]\ + \n Projection: sq_outer.c [c:UInt32]\ + \n TableScan: sq_outer [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: wrapped [b:UInt32, c:UInt32]\ + \n Projection: test.b, test.c [b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ + \n Projection: sq_inner.c AS c [c:UInt32]\ + \n TableScan: sq_inner [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_equal(&plan, expected) + } + + #[cfg(test)] + #[ctor::ctor] + fn init() { + let _ = env_logger::try_init(); + } + + /// Test multiple correlated subqueries + /// See subqueries.rs where_in_multiple() + #[test] + fn multiple_subqueries() -> Result<()> { + let orders = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + col("orders.o_custkey").eq(out_ref_col(DataType::Int64, "customer.c_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter( + in_subquery(col("customer.c_custkey"), orders.clone()) + .and(in_subquery(col("customer.c_custkey"), orders)), + )? + .project(vec![col("customer.c_custkey")])? + .build()?; + debug!("plan to optimize:\n{}", plan.display_indent()); + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_2.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n SubqueryAlias: __correlated_sq_2 [o_custkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test recursive correlated subqueries + /// See subqueries.rs where_in_recursive() + #[test] + fn recursive_subqueries() -> Result<()> { + let lineitem = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("lineitem")) + .filter( + col("lineitem.l_orderkey") + .eq(out_ref_col(DataType::Int64, "orders.o_orderkey")), + )? + .project(vec![col("lineitem.l_orderkey")])? + .build()?, + ); + + let orders = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter(in_subquery(col("orders.o_orderkey"), lineitem).and( + col("orders.o_custkey").eq(out_ref_col(DataType::Int64, "customer.c_custkey")), + ))? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), orders))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ + \n LeftSemi Join: Filter: orders.o_orderkey = __correlated_sq_2.l_orderkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n SubqueryAlias: __correlated_sq_2 [l_orderkey:Int64]\ + \n Projection: lineitem.l_orderkey AS l_orderkey [l_orderkey:Int64]\ + \n TableScan: lineitem [l_orderkey:Int64, l_partkey:Int64, l_suppkey:Int64, l_linenumber:Int32, l_quantity:Float64, l_extendedprice:Float64]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for correlated IN subquery filter with additional subquery filters + #[test] + fn in_subquery_with_subquery_filters() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey") + .eq(col("orders.o_custkey")) + .and(col("o_orderkey").eq(lit(1))), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ + \n Filter: orders.o_orderkey = Int32(1) [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for correlated IN subquery with no columns in schema + #[test] + fn in_subquery_no_cols() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey") + .eq(out_ref_col(DataType::Int64, "customer.c_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for IN subquery with both columns in schema + #[test] + fn in_subquery_with_no_correlated_cols() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter(col("orders.o_custkey").eq(col("orders.o_custkey")))? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ + \n Filter: orders.o_custkey = orders.o_custkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for correlated IN subquery not equal + #[test] + fn in_subquery_where_not_eq() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey") + .not_eq(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey AND customer.c_custkey != __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for correlated IN subquery less than + #[test] + fn in_subquery_where_less_than() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey").lt(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey AND customer.c_custkey < __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for correlated IN subquery filter with subquery disjunction + #[test] + fn in_subquery_with_subquery_disjunction() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey") + .eq(col("orders.o_custkey")) + .or(col("o_orderkey").eq(lit(1))), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey AND (customer.c_custkey = __correlated_sq_1.o_custkey OR __correlated_sq_1.o_orderkey = Int32(1)) [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64, o_orderkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey, orders.o_orderkey [o_custkey:Int64, o_orderkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + + Ok(()) + } + + /// Test for correlated IN without projection + #[test] + fn in_subquery_no_projection() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter(col("customer.c_custkey").eq(col("orders.o_custkey")))? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + // Maybe okay if the table only has a single column? + assert_optimizer_err( + Arc::new(DecorrelateWhereIn::new()), + &plan, + "a projection is required", + ); + Ok(()) + } + + /// Test for correlated IN subquery join on expression + #[test] + fn in_subquery_join_expr() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey").add(lit(1)), sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey + Int32(1) = __correlated_sq_1.o_custkey AND customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for correlated IN expressions + #[test] + fn in_subquery_project_expr() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey").add(lit(1))])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), sq))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey + Int32(1) AND customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey + Int32(1):Int64, o_custkey:Int64]\ + \n Projection: orders.o_custkey + Int32(1) AS o_custkey + Int32(1), orders.o_custkey [o_custkey + Int32(1):Int64, o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for correlated IN subquery multiple projected columns + #[test] + fn in_subquery_multi_col() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey"), col("orders.o_orderkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), sq).and(col("c_custkey").eq(lit(1))))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + assert_optimizer_err( + Arc::new(DecorrelateWhereIn::new()), + &plan, + "single expression projection required", + ); + Ok(()) + } + + /// Test for correlated IN subquery filter with additional filters + #[test] + fn should_support_additional_filters() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter(in_subquery(col("customer.c_custkey"), sq).and(col("c_custkey").eq(lit(1))))? + .project(vec![col("customer.c_custkey")])? + .build()?; + + let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ + \n Filter: customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8]\ + \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ + \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ + \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ + \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ + \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for correlated IN subquery filter with disjustions + #[test] + fn in_subquery_disjunction() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(scan_tpch_table("orders")) + .filter( + out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), + )? + .project(vec![col("orders.o_custkey")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) + .filter( + in_subquery(col("customer.c_custkey"), sq).or(col("customer.c_custkey").eq(lit(1))), + )? + .project(vec![col("customer.c_custkey")])? + .build()?; + + // TODO: support disjunction - for now expect unaltered plan + let expected = r#"Projection: customer.c_custkey [c_custkey:Int64] + Filter: customer.c_custkey IN () OR customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8] + Subquery: [o_custkey:Int64] + Projection: orders.o_custkey [o_custkey:Int64] + Filter: outer_ref(customer.c_custkey) = orders.o_custkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] + TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] + TableScan: customer [c_custkey:Int64, c_name:Utf8]"#; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for correlated IN subquery filter + #[test] + fn in_subquery_correlated() -> Result<()> { + let sq = Arc::new( + LogicalPlanBuilder::from(test_table_scan_with_name("sq")?) + .filter(out_ref_col(DataType::UInt32, "test.a").eq(col("sq.a")))? + .project(vec![col("c")])? + .build()?, + ); + + let plan = LogicalPlanBuilder::from(test_table_scan_with_name("test")?) + .filter(in_subquery(col("c"), sq))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c AND test.a = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32, a:UInt32]\ + \n Projection: sq.c AS c, sq.a [c:UInt32, a:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for single IN subquery filter + #[test] + fn in_subquery_simple() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(in_subquery(col("c"), test_subquery_with_name("sq")?))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ + \n Projection: sq.c AS c [c:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + /// Test for single NOT IN subquery filter + #[test] + fn not_in_subquery_simple() -> Result<()> { + let table_scan = test_table_scan()?; + let plan = LogicalPlanBuilder::from(table_scan) + .filter(not_in_subquery(col("c"), test_subquery_with_name("sq")?))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftAnti Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ + \n Projection: sq.c AS c [c:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + #[test] + fn in_subquery_both_side_expr() -> Result<()> { + let table_scan = test_table_scan()?; + let subquery_scan = test_table_scan_with_name("sq")?; + + let subquery = LogicalPlanBuilder::from(subquery_scan) + .project(vec![col("c") * lit(2u32)])? + .build()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .filter(in_subquery(col("c") + lit(1u32), Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32]\ + \n Projection: sq.c * UInt32(2) AS c * UInt32(2) [c * UInt32(2):UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + #[test] + fn in_subquery_join_filter_and_inner_filter() -> Result<()> { + let table_scan = test_table_scan()?; + let subquery_scan = test_table_scan_with_name("sq")?; + + let subquery = LogicalPlanBuilder::from(subquery_scan) + .filter( + out_ref_col(DataType::UInt32, "test.a") + .eq(col("sq.a")) + .and(col("sq.a").add(lit(1u32)).eq(col("sq.b"))), + )? + .project(vec![col("c") * lit(2u32)])? + .build()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .filter(in_subquery(col("c") + lit(1u32), Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, a:UInt32]\ + \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.a [c * UInt32(2):UInt32, a:UInt32]\ + \n Filter: sq.a + UInt32(1) = sq.b [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + #[test] + fn in_subquery_muti_project_subquery_cols() -> Result<()> { + let table_scan = test_table_scan()?; + let subquery_scan = test_table_scan_with_name("sq")?; + + let subquery = LogicalPlanBuilder::from(subquery_scan) + .filter( + out_ref_col(DataType::UInt32, "test.a") + .add(out_ref_col(DataType::UInt32, "test.b")) + .eq(col("sq.a").add(col("sq.b"))) + .and(col("sq.a").add(lit(1u32)).eq(col("sq.b"))), + )? + .project(vec![col("c") * lit(2u32)])? + .build()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .filter(in_subquery(col("c") + lit(1u32), Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a + test.b = __correlated_sq_1.a + __correlated_sq_1.b [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ + \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.a, sq.b [c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ + \n Filter: sq.a + UInt32(1) = sq.b [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + #[test] + fn two_in_subquery_with_outer_filter() -> Result<()> { + let table_scan = test_table_scan()?; + let subquery_scan1 = test_table_scan_with_name("sq1")?; + let subquery_scan2 = test_table_scan_with_name("sq2")?; + + let subquery1 = LogicalPlanBuilder::from(subquery_scan1) + .filter(out_ref_col(DataType::UInt32, "test.a").gt(col("sq1.a")))? + .project(vec![col("c") * lit(2u32)])? + .build()?; + + let subquery2 = LogicalPlanBuilder::from(subquery_scan2) + .filter(out_ref_col(DataType::UInt32, "test.a").gt(col("sq2.a")))? + .project(vec![col("c") * lit(2u32)])? + .build()?; + + let plan = LogicalPlanBuilder::from(table_scan) + .filter( + in_subquery(col("c") + lit(1u32), Arc::new(subquery1)).and( + in_subquery(col("c") * lit(2u32), Arc::new(subquery2)) + .and(col("test.c").gt(lit(1u32))), + ), + )? + .project(vec![col("test.b")])? + .build()?; + + let expected = "Projection: test.b [b:UInt32]\ + \n Filter: test.c > UInt32(1) [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c * UInt32(2) = __correlated_sq_2.c * UInt32(2) AND test.a > __correlated_sq_2.a [a:UInt32, b:UInt32, c:UInt32]\ + \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a > __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, a:UInt32]\ + \n Projection: sq1.c * UInt32(2) AS c * UInt32(2), sq1.a [c * UInt32(2):UInt32, a:UInt32]\ + \n TableScan: sq1 [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_2 [c * UInt32(2):UInt32, a:UInt32]\ + \n Projection: sq2.c * UInt32(2) AS c * UInt32(2), sq2.a [c * UInt32(2):UInt32, a:UInt32]\ + \n TableScan: sq2 [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } + + #[test] + fn in_subquery_with_same_table() -> Result<()> { + let outer_scan = test_table_scan()?; + let subquery_scan = test_table_scan()?; + let subquery = LogicalPlanBuilder::from(subquery_scan) + .filter(col("test.a").gt(col("test.b")))? + .project(vec![col("c")])? + .build()?; + + let plan = LogicalPlanBuilder::from(outer_scan) + .filter(in_subquery(col("test.a"), Arc::new(subquery)))? + .project(vec![col("test.b")])? + .build()?; + + // Subquery and outer query refer to the same table. + let expected = "Projection: test.b [b:UInt32]\ + \n LeftSemi Join: Filter: test.a = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ + \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ + \n Projection: test.c AS c [c:UInt32]\ + \n Filter: test.a > test.b [a:UInt32, b:UInt32, c:UInt32]\ + \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; + + assert_optimized_plan_eq_display_indent( + Arc::new(DecorrelateWhereIn::new()), + &plan, + expected, + ); + Ok(()) + } +} diff --git a/src/sql/optimizer/utils.rs b/src/sql/optimizer/utils.rs new file mode 100644 index 000000000..019d9f853 --- /dev/null +++ b/src/sql/optimizer/utils.rs @@ -0,0 +1,512 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Collection of utility functions that are leveraged by the query optimizer rules + +use std::{ + collections::{BTreeSet, HashMap}, + sync::Arc, +}; + +use datafusion_python::{ + datafusion_common::{Column, DFSchema, DFSchemaRef, Result}, + datafusion_expr::{ + and, + expr::{Alias, BinaryExpr}, + expr_rewriter::{replace_col, strip_outer_reference}, + logical_plan::{Filter, LogicalPlan}, + Expr, + LogicalPlanBuilder, + Operator, + }, + datafusion_optimizer::optimizer::{OptimizerConfig, OptimizerRule}, +}; +use log::{debug, trace}; + +/// Convenience rule for writing optimizers: recursively invoke +/// optimize on plan's children and then return a node of the same +/// type. Useful for optimizer rules which want to leave the type +/// of plan unchanged but still apply to the children. +/// This also handles the case when the `plan` is a [`LogicalPlan::Explain`]. +/// +/// Returning `Ok(None)` indicates that the plan can't be optimized by the `optimizer`. +pub fn optimize_children( + optimizer: &impl OptimizerRule, + plan: &LogicalPlan, + config: &dyn OptimizerConfig, +) -> Result> { + let mut new_inputs = Vec::with_capacity(plan.inputs().len()); + let mut plan_is_changed = false; + for input in plan.inputs() { + let new_input = optimizer.try_optimize(input, config)?; + plan_is_changed = plan_is_changed || new_input.is_some(); + new_inputs.push(new_input.unwrap_or_else(|| input.clone())) + } + if plan_is_changed { + Ok(Some(plan.with_new_inputs(&new_inputs)?)) + } else { + Ok(None) + } +} + +/// Splits a conjunctive [`Expr`] such as `A AND B AND C` => `[A, B, C]` +/// +/// See [`split_conjunction_owned`] for more details and an example. +pub fn split_conjunction(expr: &Expr) -> Vec<&Expr> { + split_conjunction_impl(expr, vec![]) +} + +fn split_conjunction_impl<'a>(expr: &'a Expr, mut exprs: Vec<&'a Expr>) -> Vec<&'a Expr> { + match expr { + Expr::BinaryExpr(BinaryExpr { + right, + op: Operator::And, + left, + }) => { + let exprs = split_conjunction_impl(left, exprs); + split_conjunction_impl(right, exprs) + } + Expr::Alias(Alias { expr, .. }) => split_conjunction_impl(expr, exprs), + other => { + exprs.push(other); + exprs + } + } +} + +/// Extract join predicates from the correclated subquery. +/// The join predicate means that the expression references columns +/// from both the subquery and outer table or only from the outer table. +/// +/// Returns join predicates and subquery(extracted). +pub(crate) fn extract_join_filters(maybe_filter: &LogicalPlan) -> Result<(Vec, LogicalPlan)> { + if let LogicalPlan::Filter(plan_filter) = maybe_filter { + let subquery_filter_exprs = split_conjunction(&plan_filter.predicate); + let (join_filters, subquery_filters) = find_join_exprs(subquery_filter_exprs)?; + // if the subquery still has filter expressions, restore them. + let mut plan = LogicalPlanBuilder::from((*plan_filter.input).clone()); + if let Some(expr) = conjunction(subquery_filters) { + plan = plan.filter(expr)? + } + + Ok((join_filters, plan.build()?)) + } else { + Ok((vec![], maybe_filter.clone())) + } +} + +#[allow(dead_code)] +/// Splits an owned conjunctive [`Expr`] such as `A AND B AND C` => `[A, B, C]` +/// +/// This is often used to "split" filter expressions such as `col1 = 5 +/// AND col2 = 10` into [`col1 = 5`, `col2 = 10`]; +/// +/// # Example +/// ``` +/// # use datafusion_expr::{col, lit}; +/// # use datafusion_optimizer::utils::split_conjunction_owned; +/// // a=1 AND b=2 +/// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2))); +/// +/// // [a=1, b=2] +/// let split = vec![ +/// col("a").eq(lit(1)), +/// col("b").eq(lit(2)), +/// ]; +/// +/// // use split_conjunction_owned to split them +/// assert_eq!(split_conjunction_owned(expr), split); +/// ``` +pub fn split_conjunction_owned(expr: Expr) -> Vec { + split_binary_owned(expr, Operator::And) +} + +#[allow(dead_code)] +/// Splits an owned binary operator tree [`Expr`] such as `A B C` => `[A, B, C]` +/// +/// This is often used to "split" expressions such as `col1 = 5 +/// AND col2 = 10` into [`col1 = 5`, `col2 = 10`]; +/// +/// # Example +/// ``` +/// # use datafusion_expr::{col, lit, Operator}; +/// # use datafusion_optimizer::utils::split_binary_owned; +/// # use std::ops::Add; +/// // a=1 + b=2 +/// let expr = col("a").eq(lit(1)).add(col("b").eq(lit(2))); +/// +/// // [a=1, b=2] +/// let split = vec![ +/// col("a").eq(lit(1)), +/// col("b").eq(lit(2)), +/// ]; +/// +/// // use split_binary_owned to split them +/// assert_eq!(split_binary_owned(expr, Operator::Plus), split); +/// ``` +pub fn split_binary_owned(expr: Expr, op: Operator) -> Vec { + split_binary_owned_impl(expr, op, vec![]) +} + +#[allow(dead_code)] +fn split_binary_owned_impl(expr: Expr, operator: Operator, mut exprs: Vec) -> Vec { + match expr { + Expr::BinaryExpr(BinaryExpr { right, op, left }) if op == operator => { + let exprs = split_binary_owned_impl(*left, operator, exprs); + split_binary_owned_impl(*right, operator, exprs) + } + Expr::Alias(Alias { expr, .. }) => split_binary_owned_impl(*expr, operator, exprs), + other => { + exprs.push(other); + exprs + } + } +} + +#[allow(dead_code)] +/// Splits an binary operator tree [`Expr`] such as `A B C` => `[A, B, C]` +/// +/// See [`split_binary_owned`] for more details and an example. +pub fn split_binary(expr: &Expr, op: Operator) -> Vec<&Expr> { + split_binary_impl(expr, op, vec![]) +} + +#[allow(dead_code)] +fn split_binary_impl<'a>( + expr: &'a Expr, + operator: Operator, + mut exprs: Vec<&'a Expr>, +) -> Vec<&'a Expr> { + match expr { + Expr::BinaryExpr(BinaryExpr { right, op, left }) if *op == operator => { + let exprs = split_binary_impl(left, operator, exprs); + split_binary_impl(right, operator, exprs) + } + Expr::Alias(Alias { expr, .. }) => split_binary_impl(expr, operator, exprs), + other => { + exprs.push(other); + exprs + } + } +} + +/// Combines an array of filter expressions into a single filter +/// expression consisting of the input filter expressions joined with +/// logical AND. +/// +/// Returns None if the filters array is empty. +/// +/// # Example +/// ``` +/// # use datafusion_expr::{col, lit}; +/// # use datafusion_optimizer::utils::conjunction; +/// // a=1 AND b=2 +/// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2))); +/// +/// // [a=1, b=2] +/// let split = vec![ +/// col("a").eq(lit(1)), +/// col("b").eq(lit(2)), +/// ]; +/// +/// // use conjunction to join them together with `AND` +/// assert_eq!(conjunction(split), Some(expr)); +/// ``` +pub fn conjunction(filters: impl IntoIterator) -> Option { + filters.into_iter().reduce(|accum, expr| accum.and(expr)) +} + +#[allow(dead_code)] +/// Combines an array of filter expressions into a single filter +/// expression consisting of the input filter expressions joined with +/// logical OR. +/// +/// Returns None if the filters array is empty. +pub fn disjunction(filters: impl IntoIterator) -> Option { + filters.into_iter().reduce(|accum, expr| accum.or(expr)) +} + +/// returns a new [LogicalPlan] that wraps `plan` in a [LogicalPlan::Filter] with +/// its predicate be all `predicates` ANDed. +#[allow(dead_code)] +pub fn add_filter(plan: LogicalPlan, predicates: &[&Expr]) -> Result { + // reduce filters to a single filter with an AND + let predicate = predicates + .iter() + .skip(1) + .fold(predicates[0].clone(), |acc, predicate| { + and(acc, (*predicate).to_owned()) + }); + + Ok(LogicalPlan::Filter(Filter::try_new( + predicate, + Arc::new(plan), + )?)) +} + +/// Looks for correlating expressions: for example, a binary expression with one field from the subquery, and +/// one not in the subquery (closed upon from outer scope) +/// +/// # Arguments +/// +/// * `exprs` - List of expressions that may or may not be joins +/// +/// # Return value +/// +/// Tuple of (expressions containing joins, remaining non-join expressions) +pub fn find_join_exprs(exprs: Vec<&Expr>) -> Result<(Vec, Vec)> { + let mut joins = vec![]; + let mut others = vec![]; + for filter in exprs.into_iter() { + // If the expression contains correlated predicates, add it to join filters + if filter.contains_outer() { + if !matches!(filter, Expr::BinaryExpr(BinaryExpr{ left, op: Operator::Eq, right }) if left.eq(right)) + { + joins.push(strip_outer_reference((*filter).clone())); + } + } else { + others.push((*filter).clone()); + } + } + + Ok((joins, others)) +} + +/// Returns the first (and only) element in a slice, or an error +/// +/// # Arguments +/// +/// * `slice` - The slice to extract from +/// +/// # Return value +/// +/// The first element, or an error +pub fn only_or_err(slice: &[T]) -> Result<&T> { + match slice { + [it] => Ok(it), + [] => Err(datafusion_python::datafusion_common::DataFusionError::Plan( + "No items found!".to_owned(), + )), + _ => Err(datafusion_python::datafusion_common::DataFusionError::Plan( + "More than one item found!".to_owned(), + )), + } +} + +/// merge inputs schema into a single schema. +#[allow(dead_code)] +pub fn merge_schema(inputs: Vec<&LogicalPlan>) -> DFSchema { + if inputs.len() == 1 { + inputs[0].schema().clone().as_ref().clone() + } else { + inputs + .iter() + .map(|input| input.schema()) + .fold(DFSchema::empty(), |mut lhs, rhs| { + lhs.merge(rhs); + lhs + }) + } +} + +pub(crate) fn collect_subquery_cols( + exprs: &[Expr], + subquery_schema: DFSchemaRef, +) -> Result> { + exprs.iter().try_fold(BTreeSet::new(), |mut cols, expr| { + let mut using_cols: Vec = vec![]; + for col in expr.to_columns()?.into_iter() { + if subquery_schema.has_column(&col) { + using_cols.push(col); + } + } + + cols.extend(using_cols); + Result::<_>::Ok(cols) + }) +} + +pub(crate) fn replace_qualified_name( + expr: Expr, + cols: &BTreeSet, + subquery_alias: &str, +) -> Result { + let alias_cols: Vec = cols + .iter() + .map(|col| Column::from_qualified_name(format!("{}.{}", subquery_alias, col.name))) + .collect(); + let replace_map: HashMap<&Column, &Column> = cols.iter().zip(alias_cols.iter()).collect(); + + replace_col(expr, &replace_map) +} + +/// Log the plan in debug/tracing mode after some part of the optimizer runs +pub fn log_plan(description: &str, plan: &LogicalPlan) { + debug!("{description}:\n{}\n", plan.display_indent()); + trace!("{description}::\n{}\n", plan.display_indent_schema()); +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use arrow::datatypes::DataType; + use datafusion_common::Column; + use datafusion_expr::{col, expr::Cast, lit, utils::expr_to_columns}; + + use super::*; + + #[test] + fn test_split_conjunction() { + let expr = col("a"); + let result = split_conjunction(&expr); + assert_eq!(result, vec![&expr]); + } + + #[test] + fn test_split_conjunction_two() { + let expr = col("a").eq(lit(5)).and(col("b")); + let expr1 = col("a").eq(lit(5)); + let expr2 = col("b"); + + let result = split_conjunction(&expr); + assert_eq!(result, vec![&expr1, &expr2]); + } + + #[test] + fn test_split_conjunction_alias() { + let expr = col("a").eq(lit(5)).and(col("b").alias("the_alias")); + let expr1 = col("a").eq(lit(5)); + let expr2 = col("b"); // has no alias + + let result = split_conjunction(&expr); + assert_eq!(result, vec![&expr1, &expr2]); + } + + #[test] + fn test_split_conjunction_or() { + let expr = col("a").eq(lit(5)).or(col("b")); + let result = split_conjunction(&expr); + assert_eq!(result, vec![&expr]); + } + + #[test] + fn test_split_binary_owned() { + let expr = col("a"); + assert_eq!(split_binary_owned(expr.clone(), Operator::And), vec![expr]); + } + + #[test] + fn test_split_binary_owned_two() { + assert_eq!( + split_binary_owned(col("a").eq(lit(5)).and(col("b")), Operator::And), + vec![col("a").eq(lit(5)), col("b")] + ); + } + + #[test] + fn test_split_binary_owned_different_op() { + let expr = col("a").eq(lit(5)).or(col("b")); + assert_eq!( + // expr is connected by OR, but pass in AND + split_binary_owned(expr.clone(), Operator::And), + vec![expr] + ); + } + + #[test] + fn test_split_conjunction_owned() { + let expr = col("a"); + assert_eq!(split_conjunction_owned(expr.clone()), vec![expr]); + } + + #[test] + fn test_split_conjunction_owned_two() { + assert_eq!( + split_conjunction_owned(col("a").eq(lit(5)).and(col("b"))), + vec![col("a").eq(lit(5)), col("b")] + ); + } + + #[test] + fn test_split_conjunction_owned_alias() { + assert_eq!( + split_conjunction_owned(col("a").eq(lit(5)).and(col("b").alias("the_alias"))), + vec![ + col("a").eq(lit(5)), + // no alias on b + col("b"), + ] + ); + } + + #[test] + fn test_conjunction_empty() { + assert_eq!(conjunction(vec![]), None); + } + + #[test] + fn test_conjunction() { + // `[A, B, C]` + let expr = conjunction(vec![col("a"), col("b"), col("c")]); + + // --> `(A AND B) AND C` + assert_eq!(expr, Some(col("a").and(col("b")).and(col("c")))); + + // which is different than `A AND (B AND C)` + assert_ne!(expr, Some(col("a").and(col("b").and(col("c"))))); + } + + #[test] + fn test_disjunction_empty() { + assert_eq!(disjunction(vec![]), None); + } + + #[test] + fn test_disjunction() { + // `[A, B, C]` + let expr = disjunction(vec![col("a"), col("b"), col("c")]); + + // --> `(A OR B) OR C` + assert_eq!(expr, Some(col("a").or(col("b")).or(col("c")))); + + // which is different than `A OR (B OR C)` + assert_ne!(expr, Some(col("a").or(col("b").or(col("c"))))); + } + + #[test] + fn test_split_conjunction_owned_or() { + let expr = col("a").eq(lit(5)).or(col("b")); + assert_eq!(split_conjunction_owned(expr.clone()), vec![expr]); + } + + #[test] + fn test_collect_expr() -> Result<()> { + let mut accum: HashSet = HashSet::new(); + expr_to_columns( + &Expr::Cast(Cast::new(Box::new(col("a")), DataType::Float64)), + &mut accum, + )?; + expr_to_columns( + &Expr::Cast(Cast::new(Box::new(col("a")), DataType::Float64)), + &mut accum, + )?; + assert_eq!(1, accum.len()); + assert!(accum.contains(&Column::from_name("a"))); + Ok(()) + } +} From 4fa155cd4bb41e9eecaa99b99f608c640d75e8af Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 9 Aug 2023 18:13:33 -0400 Subject: [PATCH 52/89] Checkstyle fixes --- src/sql/optimizer/utils.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/sql/optimizer/utils.rs b/src/sql/optimizer/utils.rs index 019d9f853..13baa5609 100644 --- a/src/sql/optimizer/utils.rs +++ b/src/sql/optimizer/utils.rs @@ -37,6 +37,7 @@ use datafusion_python::{ }; use log::{debug, trace}; +#[allow(dead_code)] /// Convenience rule for writing optimizers: recursively invoke /// optimize on plan's children and then return a node of the same /// type. Useful for optimizer rules which want to leave the type @@ -354,6 +355,7 @@ pub(crate) fn replace_qualified_name( replace_col(expr, &replace_map) } +#[allow(dead_code)] /// Log the plan in debug/tracing mode after some part of the optimizer runs pub fn log_plan(description: &str, plan: &LogicalPlan) { debug!("{description}:\n{}\n", plan.display_indent()); From 3f23aedade0fe9a5ad20ed6822abd22f11180b75 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 9 Aug 2023 18:56:52 -0400 Subject: [PATCH 53/89] Remove xfail for queries 58 and 61 which pass now --- tests/unit/test_queries.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_queries.py b/tests/unit/test_queries.py index 9e19ada4c..dbdf94a86 100644 --- a/tests/unit/test_queries.py +++ b/tests/unit/test_queries.py @@ -19,12 +19,10 @@ 39, 41, 44, - 45, 47, 49, 51, 57, - 58, 62, 67, 69, @@ -40,11 +38,12 @@ 99, ) -QUERIES = [ - pytest.param(f"q{i}.sql", marks=pytest.mark.xfail if i in XFAIL_QUERIES else ()) - for i in range(1, 100) -] +# QUERIES = [ +# pytest.param(f"q{i}.sql", marks=pytest.mark.xfail if i in XFAIL_QUERIES else ()) +# for i in range(1, 100) +# ] +QUERIES = ["q95.sql"] @pytest.fixture(scope="module") def c(data_dir): From bcd1f293f17552a20b3a8c0238dac5067aaafeb0 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 9 Aug 2023 19:03:21 -0400 Subject: [PATCH 54/89] Fix pytest syntax issue --- tests/unit/test_queries.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/unit/test_queries.py b/tests/unit/test_queries.py index dbdf94a86..f657f09df 100644 --- a/tests/unit/test_queries.py +++ b/tests/unit/test_queries.py @@ -38,12 +38,10 @@ 99, ) -# QUERIES = [ -# pytest.param(f"q{i}.sql", marks=pytest.mark.xfail if i in XFAIL_QUERIES else ()) -# for i in range(1, 100) -# ] - -QUERIES = ["q95.sql"] +QUERIES = [ + pytest.param(f"q{i}.sql", marks=pytest.mark.xfail if i in XFAIL_QUERIES else ()) + for i in range(1, 100) +] @pytest.fixture(scope="module") def c(data_dir): From d18397636f58437f03144ca2e2d8e7ac947d0ecd Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Wed, 9 Aug 2023 19:09:12 -0400 Subject: [PATCH 55/89] whatever, have it your way black --- tests/unit/test_queries.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/unit/test_queries.py b/tests/unit/test_queries.py index f657f09df..bfaedfcee 100644 --- a/tests/unit/test_queries.py +++ b/tests/unit/test_queries.py @@ -43,6 +43,7 @@ for i in range(1, 100) ] + @pytest.fixture(scope="module") def c(data_dir): # Lazy import, otherwise the pytest framework has problems From 32f5adffc0cd6385298d12cf0eea81f10a1669a3 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 10 Aug 2023 06:09:35 -0700 Subject: [PATCH 56/89] Remove debugging println --- src/expression.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/expression.rs b/src/expression.rs index 12f827fba..1e63701f4 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -906,7 +906,6 @@ fn unexpected_literal_value(value: &ScalarValue) -> PyErr { } fn get_expr_name(expr: &Expr) -> Result { - println!("get_expr_name: {:?}", expr); match expr { Expr::Alias(Alias { expr, .. }) => get_expr_name(expr), Expr::Wildcard => { From e2d4399f40a490d6c800df746864411e54c48bac Mon Sep 17 00:00:00 2001 From: Ayush Dattagupta Date: Thu, 10 Aug 2023 11:47:25 -0700 Subject: [PATCH 57/89] re-add support for ilike using the case_insensitive member of like --- src/expression.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/expression.rs b/src/expression.rs index 12f827fba..11d1ed60e 100644 --- a/src/expression.rs +++ b/src/expression.rs @@ -480,12 +480,16 @@ impl PyExpr { Expr::InSubquery(..) => "in subquery".to_string(), Expr::Negative(..) => "negative".to_string(), Expr::Not(..) => "not".to_string(), - Expr::Like(Like { negated, .. }) => { - if *negated { - "not like".to_string() - } else { - "like".to_string() - } + Expr::Like(Like { + negated, + case_insensitive, + .. + }) => { + format!( + "{}{}like", + if *negated { "not " } else { "" }, + if *case_insensitive { "i" } else { "" } + ) } Expr::SimilarTo(Like { negated, .. }) => { if *negated { From 67a5d86c33b2316ec4014787137740925f5b9f75 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 11 Aug 2023 07:46:59 -0700 Subject: [PATCH 58/89] Handle non-decimal scalar args for cuDF in RexCall --- dask_sql/physical/rex/core/call.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/dask_sql/physical/rex/core/call.py b/dask_sql/physical/rex/core/call.py index bce6e11c1..8db8ca048 100644 --- a/dask_sql/physical/rex/core/call.py +++ b/dask_sql/physical/rex/core/call.py @@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Any, Callable, Union import dask.array as da +import dask.config as dask_config import dask.dataframe as dd import numpy as np import pandas as pd @@ -1185,6 +1186,21 @@ def convert( for o in expr.getOperands() ] + # FIXME: cuDF doesn't support binops between decimal columns and numpy ints / floats + if dask_config.get("sql.mappings.decimal_support") == "cudf" and any( + str(getattr(o, "dtype", None)) == "decimal128" for o in operands + ): + from decimal import Decimal + + operands = [ + Decimal(str(o)) + if isinstance(o, float) + else o.item() + if np.isscalar(o) and pd.api.types.is_integer_dtype(o) + else o + for o in operands + ] + # Now use the operator name in the mapping schema_name = context.schema_name operator_name = expr.getOperatorName().lower() From d7677260156d07708abf907c4ecd4cbfdd273740 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:07:41 -0700 Subject: [PATCH 59/89] Try using maturin with zig for wheel builds --- .github/workflows/release.yml | 156 +++++++++++++++++++--------------- 1 file changed, 86 insertions(+), 70 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 66db9f92c..b5f69c6a8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,83 +15,103 @@ concurrency: env: upload: ${{ github.event_name == 'release' && github.repository == 'dask-contrib/dask-sql' }} -# Required shell entrypoint to have properly activated conda environments -defaults: - run: - shell: bash -l {0} - jobs: - wheels: - name: Build and publish py3.${{ matrix.python }} wheels on ${{ matrix.os }} - runs-on: ${{ matrix.os }} + linux: + name: Build and publish wheels for linux ${{ matrix.target }} + runs-on: ubuntu-latest strategy: - fail-fast: false matrix: - os: [ubuntu-latest, windows-latest, macos-latest] - python: ["8", "9", "10"] # 3.x + target: [x86_64, aarch64] steps: - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: - fetch-depth: 0 - - name: Install Protoc - if: matrix.os != 'ubuntu-latest' - uses: arduino/setup-protoc@v1 + python-version: '3.10' + - name: Build wheels + uses: PyO3/maturin-action@v1 with: - version: '3.x' - repo-token: ${{ secrets.GITHUB_TOKEN }} - - name: Set up QEMU for linux-aarch64 - if: matrix.os == 'ubuntu-latest' - uses: docker/setup-qemu-action@v2 + target: ${{ matrix.target }} + args: --release --out dist --zig + sccache: 'true' + manylinux: auto + - name: Check dist files + run: | + pip install twine + + twine check dist/* + ls -lh dist/ + - name: Upload binary wheels + uses: actions/upload-artifact@v3 with: - platforms: arm64 - - name: Add rust toolchain target for macos-aarch64 - if: matrix.os == 'macos-latest' - run: rustup target add aarch64-apple-darwin + name: wheels for linux + path: dist/* + - name: Publish package + if: env.upload == 'true' + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: twine upload dist/* + + windows: + name: Build and publish wheels for windows + runs-on: windows-latest + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + architecture: x64 - name: Build wheels - uses: pypa/cibuildwheel@v2.11.3 + uses: PyO3/maturin-action@v1 + with: + target: x64 + args: --release --out dist + sccache: 'true' + - name: Check dist files + run: | + pip install twine + + twine check dist/* + ls -lh dist/ + - name: Upload binary wheels + uses: actions/upload-artifact@v3 + with: + name: wheels for windows + path: dist/* + - name: Publish package + if: env.upload == 'true' env: - CIBW_BUILD: 'cp3${{ matrix.python }}-*' - CIBW_SKIP: '*musllinux*' - CIBW_ARCHS_LINUX: 'aarch64 x86_64' - CIBW_ARCHS_WINDOWS: 'AMD64' - CIBW_ARCHS_MACOS: 'x86_64 arm64' - # Without CARGO_NET_GIT_FETCH_WITH_CLI we oom (https://github.com/rust-lang/cargo/issues/10583) - CIBW_ENVIRONMENT_LINUX: > - CARGO_NET_GIT_FETCH_WITH_CLI="true" - PATH="$HOME/.cargo/bin:$HOME/.local/bin:$PATH" - CIBW_ENVIRONMENT_WINDOWS: 'PATH="$UserProfile\.cargo\bin;$PATH"' - CIBW_BEFORE_BUILD: 'pip install -U "maturin>=1.1,<1.2"' - CIBW_BEFORE_BUILD_LINUX: > - ARCH=$([ $(uname -m) == x86_64 ] && echo x86_64 || echo aarch_64) && - DOWNLOAD_URL=$(curl --retry 6 --retry-delay 10 -s https://api.github.com/repos/protocolbuffers/protobuf/releases/latest | grep -o '"browser_download_url": "[^"]*' | cut -d'"' -f4 | grep "\linux-${ARCH}.zip$") && - curl --retry 6 --retry-delay 10 -LO $DOWNLOAD_URL && - unzip protoc-*-linux-$ARCH.zip -d $HOME/.local && - protoc --version && - pip install -U "maturin>=1.1,<1.2" && - pip list && - curl --retry 6 --retry-delay 10 https://sh.rustup.rs -sSf | sh -s -- --default-toolchain=stable --profile=minimal -y && - rustup show + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: twine upload dist/* + + macos: + name: Build and publish wheels for macos ${{ matrix.target }} + runs-on: macos-latest + strategy: + matrix: + target: [x86_64, aarch64] + steps: + - uses: actions/checkout@v3 + - uses: actions/setup-python@v4 with: - package-dir: . - output-dir: dist - config-file: "pyproject.toml" - - name: Set up Python - uses: conda-incubator/setup-miniconda@v2.2.0 + python-version: '3.10' + - name: Build wheels + uses: PyO3/maturin-action@v1 with: - miniforge-variant: Mambaforge - use-mamba: true - python-version: "3.8" - channel-priority: strict + target: ${{ matrix.target }} + args: --release --out dist + sccache: 'true' - name: Check dist files run: | - mamba install twine + pip install twine twine check dist/* ls -lh dist/ - name: Upload binary wheels uses: actions/upload-artifact@v3 with: - name: wheels for py3.${{ matrix.python }} on ${{ matrix.os }} + name: wheels for macos path: dist/* - name: Publish package if: env.upload == 'true' @@ -99,27 +119,23 @@ jobs: TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: twine upload dist/* + sdist: - name: Build and publish source distribution runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 + - name: Build sdist + uses: PyO3/maturin-action@v1 with: - fetch-depth: 0 - - name: Set up Python - uses: conda-incubator/setup-miniconda@v2.2.0 + command: sdist + args: --out dist + - uses: actions/setup-python@v4 with: - miniforge-variant: Mambaforge - use-mamba: true - python-version: "3.8" - channel-priority: strict - - name: Build source distribution - run: | - mamba install "maturin>=1.1,<1.2" twine - - python setup.py sdist + python-version: '3.10' - name: Check dist files run: | + pip install twine + twine check dist/* ls -lh dist/ - name: Publish source distribution From 294cd2872e3c7f61de261e3b9f522984edd4673e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Fri, 11 Aug 2023 10:40:09 -0700 Subject: [PATCH 60/89] Install protoc for all wheel builds and zlib1g-dev in linux builds --- .github/workflows/release.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b5f69c6a8..448541751 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -20,10 +20,18 @@ jobs: name: Build and publish wheels for linux ${{ matrix.target }} runs-on: ubuntu-latest strategy: + fail-fast: false matrix: target: [x86_64, aarch64] steps: - uses: actions/checkout@v3 + - name: Install zlib + run: sudo apt update && sudo apt install -y zlib1g-dev + - name: Install Protoc + uses: arduino/setup-protoc@v1 + with: + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} - uses: actions/setup-python@v4 with: python-version: '3.10' @@ -57,6 +65,11 @@ jobs: runs-on: windows-latest steps: - uses: actions/checkout@v3 + - name: Install Protoc + uses: arduino/setup-protoc@v1 + with: + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} - uses: actions/setup-python@v4 with: python-version: '3.10' @@ -89,10 +102,16 @@ jobs: name: Build and publish wheels for macos ${{ matrix.target }} runs-on: macos-latest strategy: + fail-fast: false matrix: target: [x86_64, aarch64] steps: - uses: actions/checkout@v3 + - name: Install Protoc + uses: arduino/setup-protoc@v1 + with: + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} - uses: actions/setup-python@v4 with: python-version: '3.10' From 1bcb71e57f68dfbb60995a98208416afd67d5ed0 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 14 Aug 2023 09:30:05 -0400 Subject: [PATCH 61/89] Remove Cargo tests because that code is already being tested in DataFusion anyway --- src/sql/optimizer/decorrelate_where_exists.rs | 555 ----------- src/sql/optimizer/decorrelate_where_in.rs | 933 ------------------ 2 files changed, 1488 deletions(-) diff --git a/src/sql/optimizer/decorrelate_where_exists.rs b/src/sql/optimizer/decorrelate_where_exists.rs index bd2d3348f..5944c83ae 100644 --- a/src/sql/optimizer/decorrelate_where_exists.rs +++ b/src/sql/optimizer/decorrelate_where_exists.rs @@ -226,558 +226,3 @@ impl SubqueryInfo { Self { query, negated } } } - -#[cfg(test)] -mod tests { - use std::ops::Add; - - use arrow::datatypes::DataType; - use datafusion_common::Result; - use datafusion_expr::{ - col, - exists, - lit, - logical_plan::LogicalPlanBuilder, - not_exists, - out_ref_col, - }; - - use super::*; - use crate::test::*; - - fn assert_plan_eq(plan: &LogicalPlan, expected: &str) -> Result<()> { - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereExists::new()), - plan, - expected, - ); - Ok(()) - } - - /// Test for multiple exists subqueries in the same filter expression - #[test] - fn multiple_subqueries() -> Result<()> { - let orders = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - col("orders.o_custkey").eq(out_ref_col(DataType::Int64, "customer.c_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(orders.clone()).and(exists(orders)))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: orders.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8]\ - \n LeftSemi Join: Filter: orders.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n Projection: orders.o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n Projection: orders.o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - assert_plan_eq(&plan, expected) - } - - /// Test recursive correlated subqueries - #[test] - fn recursive_subqueries() -> Result<()> { - let lineitem = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("lineitem")) - .filter( - col("lineitem.l_orderkey") - .eq(out_ref_col(DataType::Int64, "orders.o_orderkey")), - )? - .project(vec![col("lineitem.l_orderkey")])? - .build()?, - ); - - let orders = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter(exists(lineitem).and( - col("orders.o_custkey").eq(out_ref_col(DataType::Int64, "customer.c_custkey")), - ))? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(orders))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: orders.o_custkey = customer.c_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n Projection: orders.o_custkey [o_custkey:Int64]\ - \n LeftSemi Join: Filter: lineitem.l_orderkey = orders.o_orderkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n Projection: lineitem.l_orderkey [l_orderkey:Int64]\ - \n TableScan: lineitem [l_orderkey:Int64, l_partkey:Int64, l_suppkey:Int64, l_linenumber:Int32, l_quantity:Float64, l_extendedprice:Float64]"; - assert_plan_eq(&plan, expected) - } - - /// Test for correlated exists subquery filter with additional subquery filters - #[test] - fn exists_subquery_with_subquery_filters() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey") - .eq(col("orders.o_custkey")) - .and(col("o_orderkey").eq(lit(1))), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = orders.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n Projection: orders.o_custkey [o_custkey:Int64]\ - \n Filter: orders.o_orderkey = Int32(1) [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_plan_eq(&plan, expected) - } - - #[test] - fn exists_subquery_no_cols() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter(out_ref_col(DataType::Int64, "customer.c_custkey").eq(lit(1u32)))? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - // Other rule will pushdown `customer.c_custkey = 1`, - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = UInt32(1) [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n Projection: []\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_plan_eq(&plan, expected) - } - - /// Test for exists subquery with both columns in schema - #[test] - fn exists_subquery_with_no_correlated_cols() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter(col("orders.o_custkey").eq(col("orders.o_custkey")))? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - assert_optimization_skipped(Arc::new(DecorrelateWhereExists::new()), &plan) - } - - /// Test for correlated exists subquery not equal - #[test] - fn exists_subquery_where_not_eq() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey") - .not_eq(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey != orders.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n Projection: orders.o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_plan_eq(&plan, expected) - } - - /// Test for correlated exists subquery less than - #[test] - fn exists_subquery_where_less_than() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey").lt(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey < orders.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n Projection: orders.o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_plan_eq(&plan, expected) - } - - /// Test for correlated exists subquery filter with subquery disjunction - #[test] - fn exists_subquery_with_subquery_disjunction() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey") - .eq(col("orders.o_custkey")) - .or(col("o_orderkey").eq(lit(1))), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = orders.o_custkey OR orders.o_orderkey = Int32(1) [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n Projection: orders.o_custkey, orders.o_orderkey [o_custkey:Int64, o_orderkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_plan_eq(&plan, expected) - } - - /// Test for correlated exists without projection - #[test] - fn exists_subquery_no_projection() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter(col("customer.c_custkey").eq(col("orders.o_custkey")))? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - assert_optimization_skipped(Arc::new(DecorrelateWhereExists::new()), &plan) - } - - /// Test for correlated exists expressions - #[test] - fn exists_subquery_project_expr() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey").add(lit(1))])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = orders.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n Projection: orders.o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_plan_eq(&plan, expected) - } - - /// Test for correlated exists subquery filter with additional filters - #[test] - fn should_support_additional_filters() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(sq).and(col("c_custkey").eq(lit(1))))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8]\ - \n LeftSemi Join: Filter: customer.c_custkey = orders.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n Projection: orders.o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_plan_eq(&plan, expected) - } - - /// Test for correlated exists subquery filter with disjunctions - #[test] - fn exists_subquery_disjunction() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter(col("customer.c_custkey").eq(col("orders.o_custkey")))? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(exists(sq).or(col("customer.c_custkey").eq(lit(1))))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - // not optimized - let expected = r#"Projection: customer.c_custkey [c_custkey:Int64] - Filter: EXISTS () OR customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8] - Subquery: [o_custkey:Int64] - Projection: orders.o_custkey [o_custkey:Int64] - Filter: customer.c_custkey = orders.o_custkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] - TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] - TableScan: customer [c_custkey:Int64, c_name:Utf8]"#; - - assert_plan_eq(&plan, expected) - } - - /// Test for correlated EXISTS subquery filter - #[test] - fn exists_subquery_correlated() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(test_table_scan_with_name("sq")?) - .filter(out_ref_col(DataType::UInt32, "test.a").eq(col("sq.a")))? - .project(vec![col("c")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(test_table_scan_with_name("test")?) - .filter(exists(sq))? - .project(vec![col("test.c")])? - .build()?; - - let expected = "Projection: test.c [c:UInt32]\ - \n LeftSemi Join: Filter: test.a = sq.a [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n Projection: sq.a [a:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_plan_eq(&plan, expected) - } - - /// Test for single exists subquery filter - #[test] - fn exists_subquery_simple() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(exists(test_subquery_with_name("sq")?))? - .project(vec![col("test.b")])? - .build()?; - - assert_optimization_skipped(Arc::new(DecorrelateWhereExists::new()), &plan) - } - - /// Test for single NOT exists subquery filter - #[test] - fn not_exists_subquery_simple() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(not_exists(test_subquery_with_name("sq")?))? - .project(vec![col("test.b")])? - .build()?; - - assert_optimization_skipped(Arc::new(DecorrelateWhereExists::new()), &plan) - } - - #[test] - fn two_exists_subquery_with_outer_filter() -> Result<()> { - let table_scan = test_table_scan()?; - let subquery_scan1 = test_table_scan_with_name("sq1")?; - let subquery_scan2 = test_table_scan_with_name("sq2")?; - - let subquery1 = LogicalPlanBuilder::from(subquery_scan1) - .filter(out_ref_col(DataType::UInt32, "test.a").eq(col("sq1.a")))? - .project(vec![col("c")])? - .build()?; - - let subquery2 = LogicalPlanBuilder::from(subquery_scan2) - .filter(out_ref_col(DataType::UInt32, "test.a").eq(col("sq2.a")))? - .project(vec![col("c")])? - .build()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .filter( - exists(Arc::new(subquery1)) - .and(exists(Arc::new(subquery2)).and(col("test.c").gt(lit(1u32)))), - )? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n Filter: test.c > UInt32(1) [a:UInt32, b:UInt32, c:UInt32]\ - \n LeftSemi Join: Filter: test.a = sq2.a [a:UInt32, b:UInt32, c:UInt32]\ - \n LeftSemi Join: Filter: test.a = sq1.a [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n Projection: sq1.a [a:UInt32]\ - \n TableScan: sq1 [a:UInt32, b:UInt32, c:UInt32]\ - \n Projection: sq2.a [a:UInt32]\ - \n TableScan: sq2 [a:UInt32, b:UInt32, c:UInt32]"; - - assert_plan_eq(&plan, expected) - } - - #[test] - fn exists_subquery_expr_filter() -> Result<()> { - let table_scan = test_table_scan()?; - let subquery_scan = test_table_scan_with_name("sq")?; - let subquery = LogicalPlanBuilder::from(subquery_scan) - .filter( - (lit(1u32) + col("sq.a")).gt(out_ref_col(DataType::UInt32, "test.a") * lit(2u32)), - )? - .project(vec![lit(1u32)])? - .build()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(exists(Arc::new(subquery)))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: UInt32(1) + sq.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n Projection: sq.a [a:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_plan_eq(&plan, expected) - } - - #[test] - fn exists_subquery_with_same_table() -> Result<()> { - let outer_scan = test_table_scan()?; - let subquery_scan = test_table_scan()?; - let subquery = LogicalPlanBuilder::from(subquery_scan) - .filter(col("test.a").gt(col("test.b")))? - .project(vec![col("c")])? - .build()?; - - let plan = LogicalPlanBuilder::from(outer_scan) - .filter(exists(Arc::new(subquery)))? - .project(vec![col("test.b")])? - .build()?; - - // Subquery and outer query refer to the same table. - let expected = "Projection: test.b [b:UInt32]\ - \n Filter: EXISTS () [a:UInt32, b:UInt32, c:UInt32]\ - \n Subquery: [c:UInt32]\ - \n Projection: test.c [c:UInt32]\ - \n Filter: test.a > test.b [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - - assert_plan_eq(&plan, expected) - } - - #[test] - fn exists_distinct_subquery() -> Result<()> { - let table_scan = test_table_scan()?; - let subquery_scan = test_table_scan_with_name("sq")?; - let subquery = LogicalPlanBuilder::from(subquery_scan) - .filter( - (lit(1u32) + col("sq.a")).gt(out_ref_col(DataType::UInt32, "test.a") * lit(2u32)), - )? - .project(vec![col("sq.c")])? - .distinct()? - .build()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(exists(Arc::new(subquery)))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: UInt32(1) + sq.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n Distinct: [a:UInt32]\ - \n Projection: sq.a [a:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_plan_eq(&plan, expected) - } - - #[test] - fn exists_distinct_expr_subquery() -> Result<()> { - let table_scan = test_table_scan()?; - let subquery_scan = test_table_scan_with_name("sq")?; - let subquery = LogicalPlanBuilder::from(subquery_scan) - .filter( - (lit(1u32) + col("sq.a")).gt(out_ref_col(DataType::UInt32, "test.a") * lit(2u32)), - )? - .project(vec![col("sq.b") + col("sq.c")])? - .distinct()? - .build()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(exists(Arc::new(subquery)))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: UInt32(1) + sq.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n Distinct: [a:UInt32]\ - \n Projection: sq.a [a:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_plan_eq(&plan, expected) - } - - #[test] - fn exists_distinct_subquery_with_literal() -> Result<()> { - let table_scan = test_table_scan()?; - let subquery_scan = test_table_scan_with_name("sq")?; - let subquery = LogicalPlanBuilder::from(subquery_scan) - .filter( - (lit(1u32) + col("sq.a")).gt(out_ref_col(DataType::UInt32, "test.a") * lit(2u32)), - )? - .project(vec![lit(1u32), col("sq.c")])? - .distinct()? - .build()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(exists(Arc::new(subquery)))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: UInt32(1) + sq.a > test.a * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n Distinct: [a:UInt32]\ - \n Projection: sq.a [a:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_plan_eq(&plan, expected) - } -} diff --git a/src/sql/optimizer/decorrelate_where_in.rs b/src/sql/optimizer/decorrelate_where_in.rs index d3907293a..014f22092 100644 --- a/src/sql/optimizer/decorrelate_where_in.rs +++ b/src/sql/optimizer/decorrelate_where_in.rs @@ -256,936 +256,3 @@ impl SubqueryInfo { } } } - -#[cfg(test)] -mod tests { - use arrow::datatypes::DataType; - use datafusion_common::Result; - use datafusion_expr::{ - and, - binary_expr, - col, - in_subquery, - lit, - logical_plan::LogicalPlanBuilder, - not_in_subquery, - or, - out_ref_col, - Operator, - }; - - use super::*; - use crate::test::*; - - fn assert_optimized_plan_equal(plan: &LogicalPlan, expected: &str) -> Result<()> { - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - plan, - expected, - ); - Ok(()) - } - - fn test_subquery_with_name(name: &str) -> Result> { - let table_scan = test_table_scan_with_name(name)?; - Ok(Arc::new( - LogicalPlanBuilder::from(table_scan) - .project(vec![col("c")])? - .build()?, - )) - } - - /// Test for several IN subquery expressions - #[test] - fn in_subquery_multiple() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(and( - in_subquery(col("c"), test_subquery_with_name("sq_1")?), - in_subquery(col("b"), test_subquery_with_name("sq_2")?), - ))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.b = __correlated_sq_2.c [a:UInt32, b:UInt32, c:UInt32]\ - \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ - \n Projection: sq_1.c AS c [c:UInt32]\ - \n TableScan: sq_1 [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_2 [c:UInt32]\ - \n Projection: sq_2.c AS c [c:UInt32]\ - \n TableScan: sq_2 [a:UInt32, b:UInt32, c:UInt32]"; - assert_optimized_plan_equal(&plan, expected) - } - - /// Test for IN subquery with additional AND filter - #[test] - fn in_subquery_with_and_filters() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(and( - in_subquery(col("c"), test_subquery_with_name("sq")?), - and( - binary_expr(col("a"), Operator::Eq, lit(1_u32)), - binary_expr(col("b"), Operator::Lt, lit(30_u32)), - ), - ))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n Filter: test.a = UInt32(1) AND test.b < UInt32(30) [a:UInt32, b:UInt32, c:UInt32]\ - \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ - \n Projection: sq.c AS c [c:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_equal(&plan, expected) - } - - /// Test for IN subquery with additional OR filter - /// filter expression not modified - #[test] - fn in_subquery_with_or_filters() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(or( - and( - binary_expr(col("a"), Operator::Eq, lit(1_u32)), - binary_expr(col("b"), Operator::Lt, lit(30_u32)), - ), - in_subquery(col("c"), test_subquery_with_name("sq")?), - ))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n Filter: test.a = UInt32(1) AND test.b < UInt32(30) OR test.c IN () [a:UInt32, b:UInt32, c:UInt32]\ - \n Subquery: [c:UInt32]\ - \n Projection: sq.c [c:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_equal(&plan, expected) - } - - #[test] - fn in_subquery_with_and_or_filters() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(and( - or( - binary_expr(col("a"), Operator::Eq, lit(1_u32)), - in_subquery(col("b"), test_subquery_with_name("sq1")?), - ), - in_subquery(col("c"), test_subquery_with_name("sq2")?), - ))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n Filter: test.a = UInt32(1) OR test.b IN () [a:UInt32, b:UInt32, c:UInt32]\ - \n Subquery: [c:UInt32]\ - \n Projection: sq1.c [c:UInt32]\ - \n TableScan: sq1 [a:UInt32, b:UInt32, c:UInt32]\ - \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ - \n Projection: sq2.c AS c [c:UInt32]\ - \n TableScan: sq2 [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_equal(&plan, expected) - } - - /// Test for nested IN subqueries - #[test] - fn in_subquery_nested() -> Result<()> { - let table_scan = test_table_scan()?; - - let subquery = LogicalPlanBuilder::from(test_table_scan_with_name("sq")?) - .filter(in_subquery(col("a"), test_subquery_with_name("sq_nested")?))? - .project(vec![col("a")])? - .build()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .filter(in_subquery(col("b"), Arc::new(subquery)))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.b = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [a:UInt32]\ - \n Projection: sq.a AS a [a:UInt32]\ - \n LeftSemi Join: Filter: sq.a = __correlated_sq_2.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_2 [c:UInt32]\ - \n Projection: sq_nested.c AS c [c:UInt32]\ - \n TableScan: sq_nested [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_equal(&plan, expected) - } - - /// Test for filter input modification in case filter not supported - /// Outer filter expression not modified while inner converted to join - #[test] - fn in_subquery_input_modified() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(in_subquery(col("c"), test_subquery_with_name("sq_inner")?))? - .project(vec![col("b"), col("c")])? - .alias("wrapped")? - .filter(or( - binary_expr(col("b"), Operator::Lt, lit(30_u32)), - in_subquery(col("c"), test_subquery_with_name("sq_outer")?), - ))? - .project(vec![col("b")])? - .build()?; - - let expected = "Projection: wrapped.b [b:UInt32]\ - \n Filter: wrapped.b < UInt32(30) OR wrapped.c IN () [b:UInt32, c:UInt32]\ - \n Subquery: [c:UInt32]\ - \n Projection: sq_outer.c [c:UInt32]\ - \n TableScan: sq_outer [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: wrapped [b:UInt32, c:UInt32]\ - \n Projection: test.b, test.c [b:UInt32, c:UInt32]\ - \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ - \n Projection: sq_inner.c AS c [c:UInt32]\ - \n TableScan: sq_inner [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_equal(&plan, expected) - } - - #[cfg(test)] - #[ctor::ctor] - fn init() { - let _ = env_logger::try_init(); - } - - /// Test multiple correlated subqueries - /// See subqueries.rs where_in_multiple() - #[test] - fn multiple_subqueries() -> Result<()> { - let orders = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - col("orders.o_custkey").eq(out_ref_col(DataType::Int64, "customer.c_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter( - in_subquery(col("customer.c_custkey"), orders.clone()) - .and(in_subquery(col("customer.c_custkey"), orders)), - )? - .project(vec![col("customer.c_custkey")])? - .build()?; - debug!("plan to optimize:\n{}", plan.display_indent()); - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_2.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n SubqueryAlias: __correlated_sq_2 [o_custkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test recursive correlated subqueries - /// See subqueries.rs where_in_recursive() - #[test] - fn recursive_subqueries() -> Result<()> { - let lineitem = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("lineitem")) - .filter( - col("lineitem.l_orderkey") - .eq(out_ref_col(DataType::Int64, "orders.o_orderkey")), - )? - .project(vec![col("lineitem.l_orderkey")])? - .build()?, - ); - - let orders = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter(in_subquery(col("orders.o_orderkey"), lineitem).and( - col("orders.o_custkey").eq(out_ref_col(DataType::Int64, "customer.c_custkey")), - ))? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), orders))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ - \n LeftSemi Join: Filter: orders.o_orderkey = __correlated_sq_2.l_orderkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n SubqueryAlias: __correlated_sq_2 [l_orderkey:Int64]\ - \n Projection: lineitem.l_orderkey AS l_orderkey [l_orderkey:Int64]\ - \n TableScan: lineitem [l_orderkey:Int64, l_partkey:Int64, l_suppkey:Int64, l_linenumber:Int32, l_quantity:Float64, l_extendedprice:Float64]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for correlated IN subquery filter with additional subquery filters - #[test] - fn in_subquery_with_subquery_filters() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey") - .eq(col("orders.o_custkey")) - .and(col("o_orderkey").eq(lit(1))), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ - \n Filter: orders.o_orderkey = Int32(1) [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for correlated IN subquery with no columns in schema - #[test] - fn in_subquery_no_cols() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey") - .eq(out_ref_col(DataType::Int64, "customer.c_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for IN subquery with both columns in schema - #[test] - fn in_subquery_with_no_correlated_cols() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter(col("orders.o_custkey").eq(col("orders.o_custkey")))? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ - \n Filter: orders.o_custkey = orders.o_custkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for correlated IN subquery not equal - #[test] - fn in_subquery_where_not_eq() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey") - .not_eq(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey AND customer.c_custkey != __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for correlated IN subquery less than - #[test] - fn in_subquery_where_less_than() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey").lt(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey AND customer.c_custkey < __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for correlated IN subquery filter with subquery disjunction - #[test] - fn in_subquery_with_subquery_disjunction() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey") - .eq(col("orders.o_custkey")) - .or(col("o_orderkey").eq(lit(1))), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey AND (customer.c_custkey = __correlated_sq_1.o_custkey OR __correlated_sq_1.o_orderkey = Int32(1)) [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64, o_orderkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey, orders.o_orderkey [o_custkey:Int64, o_orderkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - - Ok(()) - } - - /// Test for correlated IN without projection - #[test] - fn in_subquery_no_projection() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter(col("customer.c_custkey").eq(col("orders.o_custkey")))? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - // Maybe okay if the table only has a single column? - assert_optimizer_err( - Arc::new(DecorrelateWhereIn::new()), - &plan, - "a projection is required", - ); - Ok(()) - } - - /// Test for correlated IN subquery join on expression - #[test] - fn in_subquery_join_expr() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey").add(lit(1)), sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey + Int32(1) = __correlated_sq_1.o_custkey AND customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for correlated IN expressions - #[test] - fn in_subquery_project_expr() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey").add(lit(1))])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), sq))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey + Int32(1) AND customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey + Int32(1):Int64, o_custkey:Int64]\ - \n Projection: orders.o_custkey + Int32(1) AS o_custkey + Int32(1), orders.o_custkey [o_custkey + Int32(1):Int64, o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for correlated IN subquery multiple projected columns - #[test] - fn in_subquery_multi_col() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey"), col("orders.o_orderkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), sq).and(col("c_custkey").eq(lit(1))))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - assert_optimizer_err( - Arc::new(DecorrelateWhereIn::new()), - &plan, - "single expression projection required", - ); - Ok(()) - } - - /// Test for correlated IN subquery filter with additional filters - #[test] - fn should_support_additional_filters() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter(in_subquery(col("customer.c_custkey"), sq).and(col("c_custkey").eq(lit(1))))? - .project(vec![col("customer.c_custkey")])? - .build()?; - - let expected = "Projection: customer.c_custkey [c_custkey:Int64]\ - \n Filter: customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8]\ - \n LeftSemi Join: Filter: customer.c_custkey = __correlated_sq_1.o_custkey [c_custkey:Int64, c_name:Utf8]\ - \n TableScan: customer [c_custkey:Int64, c_name:Utf8]\ - \n SubqueryAlias: __correlated_sq_1 [o_custkey:Int64]\ - \n Projection: orders.o_custkey AS o_custkey [o_custkey:Int64]\ - \n TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for correlated IN subquery filter with disjustions - #[test] - fn in_subquery_disjunction() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(scan_tpch_table("orders")) - .filter( - out_ref_col(DataType::Int64, "customer.c_custkey").eq(col("orders.o_custkey")), - )? - .project(vec![col("orders.o_custkey")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(scan_tpch_table("customer")) - .filter( - in_subquery(col("customer.c_custkey"), sq).or(col("customer.c_custkey").eq(lit(1))), - )? - .project(vec![col("customer.c_custkey")])? - .build()?; - - // TODO: support disjunction - for now expect unaltered plan - let expected = r#"Projection: customer.c_custkey [c_custkey:Int64] - Filter: customer.c_custkey IN () OR customer.c_custkey = Int32(1) [c_custkey:Int64, c_name:Utf8] - Subquery: [o_custkey:Int64] - Projection: orders.o_custkey [o_custkey:Int64] - Filter: outer_ref(customer.c_custkey) = orders.o_custkey [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] - TableScan: orders [o_orderkey:Int64, o_custkey:Int64, o_orderstatus:Utf8, o_totalprice:Float64;N] - TableScan: customer [c_custkey:Int64, c_name:Utf8]"#; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for correlated IN subquery filter - #[test] - fn in_subquery_correlated() -> Result<()> { - let sq = Arc::new( - LogicalPlanBuilder::from(test_table_scan_with_name("sq")?) - .filter(out_ref_col(DataType::UInt32, "test.a").eq(col("sq.a")))? - .project(vec![col("c")])? - .build()?, - ); - - let plan = LogicalPlanBuilder::from(test_table_scan_with_name("test")?) - .filter(in_subquery(col("c"), sq))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c AND test.a = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c:UInt32, a:UInt32]\ - \n Projection: sq.c AS c, sq.a [c:UInt32, a:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for single IN subquery filter - #[test] - fn in_subquery_simple() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(in_subquery(col("c"), test_subquery_with_name("sq")?))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ - \n Projection: sq.c AS c [c:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - /// Test for single NOT IN subquery filter - #[test] - fn not_in_subquery_simple() -> Result<()> { - let table_scan = test_table_scan()?; - let plan = LogicalPlanBuilder::from(table_scan) - .filter(not_in_subquery(col("c"), test_subquery_with_name("sq")?))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftAnti Join: Filter: test.c = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ - \n Projection: sq.c AS c [c:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - #[test] - fn in_subquery_both_side_expr() -> Result<()> { - let table_scan = test_table_scan()?; - let subquery_scan = test_table_scan_with_name("sq")?; - - let subquery = LogicalPlanBuilder::from(subquery_scan) - .project(vec![col("c") * lit(2u32)])? - .build()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .filter(in_subquery(col("c") + lit(1u32), Arc::new(subquery)))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32]\ - \n Projection: sq.c * UInt32(2) AS c * UInt32(2) [c * UInt32(2):UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - #[test] - fn in_subquery_join_filter_and_inner_filter() -> Result<()> { - let table_scan = test_table_scan()?; - let subquery_scan = test_table_scan_with_name("sq")?; - - let subquery = LogicalPlanBuilder::from(subquery_scan) - .filter( - out_ref_col(DataType::UInt32, "test.a") - .eq(col("sq.a")) - .and(col("sq.a").add(lit(1u32)).eq(col("sq.b"))), - )? - .project(vec![col("c") * lit(2u32)])? - .build()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .filter(in_subquery(col("c") + lit(1u32), Arc::new(subquery)))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a = __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, a:UInt32]\ - \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.a [c * UInt32(2):UInt32, a:UInt32]\ - \n Filter: sq.a + UInt32(1) = sq.b [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - #[test] - fn in_subquery_muti_project_subquery_cols() -> Result<()> { - let table_scan = test_table_scan()?; - let subquery_scan = test_table_scan_with_name("sq")?; - - let subquery = LogicalPlanBuilder::from(subquery_scan) - .filter( - out_ref_col(DataType::UInt32, "test.a") - .add(out_ref_col(DataType::UInt32, "test.b")) - .eq(col("sq.a").add(col("sq.b"))) - .and(col("sq.a").add(lit(1u32)).eq(col("sq.b"))), - )? - .project(vec![col("c") * lit(2u32)])? - .build()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .filter(in_subquery(col("c") + lit(1u32), Arc::new(subquery)))? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a + test.b = __correlated_sq_1.a + __correlated_sq_1.b [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ - \n Projection: sq.c * UInt32(2) AS c * UInt32(2), sq.a, sq.b [c * UInt32(2):UInt32, a:UInt32, b:UInt32]\ - \n Filter: sq.a + UInt32(1) = sq.b [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: sq [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - #[test] - fn two_in_subquery_with_outer_filter() -> Result<()> { - let table_scan = test_table_scan()?; - let subquery_scan1 = test_table_scan_with_name("sq1")?; - let subquery_scan2 = test_table_scan_with_name("sq2")?; - - let subquery1 = LogicalPlanBuilder::from(subquery_scan1) - .filter(out_ref_col(DataType::UInt32, "test.a").gt(col("sq1.a")))? - .project(vec![col("c") * lit(2u32)])? - .build()?; - - let subquery2 = LogicalPlanBuilder::from(subquery_scan2) - .filter(out_ref_col(DataType::UInt32, "test.a").gt(col("sq2.a")))? - .project(vec![col("c") * lit(2u32)])? - .build()?; - - let plan = LogicalPlanBuilder::from(table_scan) - .filter( - in_subquery(col("c") + lit(1u32), Arc::new(subquery1)).and( - in_subquery(col("c") * lit(2u32), Arc::new(subquery2)) - .and(col("test.c").gt(lit(1u32))), - ), - )? - .project(vec![col("test.b")])? - .build()?; - - let expected = "Projection: test.b [b:UInt32]\ - \n Filter: test.c > UInt32(1) [a:UInt32, b:UInt32, c:UInt32]\ - \n LeftSemi Join: Filter: test.c * UInt32(2) = __correlated_sq_2.c * UInt32(2) AND test.a > __correlated_sq_2.a [a:UInt32, b:UInt32, c:UInt32]\ - \n LeftSemi Join: Filter: test.c + UInt32(1) = __correlated_sq_1.c * UInt32(2) AND test.a > __correlated_sq_1.a [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c * UInt32(2):UInt32, a:UInt32]\ - \n Projection: sq1.c * UInt32(2) AS c * UInt32(2), sq1.a [c * UInt32(2):UInt32, a:UInt32]\ - \n TableScan: sq1 [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_2 [c * UInt32(2):UInt32, a:UInt32]\ - \n Projection: sq2.c * UInt32(2) AS c * UInt32(2), sq2.a [c * UInt32(2):UInt32, a:UInt32]\ - \n TableScan: sq2 [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } - - #[test] - fn in_subquery_with_same_table() -> Result<()> { - let outer_scan = test_table_scan()?; - let subquery_scan = test_table_scan()?; - let subquery = LogicalPlanBuilder::from(subquery_scan) - .filter(col("test.a").gt(col("test.b")))? - .project(vec![col("c")])? - .build()?; - - let plan = LogicalPlanBuilder::from(outer_scan) - .filter(in_subquery(col("test.a"), Arc::new(subquery)))? - .project(vec![col("test.b")])? - .build()?; - - // Subquery and outer query refer to the same table. - let expected = "Projection: test.b [b:UInt32]\ - \n LeftSemi Join: Filter: test.a = __correlated_sq_1.c [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]\ - \n SubqueryAlias: __correlated_sq_1 [c:UInt32]\ - \n Projection: test.c AS c [c:UInt32]\ - \n Filter: test.a > test.b [a:UInt32, b:UInt32, c:UInt32]\ - \n TableScan: test [a:UInt32, b:UInt32, c:UInt32]"; - - assert_optimized_plan_eq_display_indent( - Arc::new(DecorrelateWhereIn::new()), - &plan, - expected, - ); - Ok(()) - } -} From caf67617413481a65c7c6b3a09397f09925091d0 Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 14 Aug 2023 09:32:37 -0400 Subject: [PATCH 62/89] Adjust optimizer/utils test includes --- src/sql/optimizer/utils.rs | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/sql/optimizer/utils.rs b/src/sql/optimizer/utils.rs index 13baa5609..7711ca971 100644 --- a/src/sql/optimizer/utils.rs +++ b/src/sql/optimizer/utils.rs @@ -366,9 +366,11 @@ pub fn log_plan(description: &str, plan: &LogicalPlan) { mod tests { use std::collections::HashSet; - use arrow::datatypes::DataType; - use datafusion_common::Column; - use datafusion_expr::{col, expr::Cast, lit, utils::expr_to_columns}; + use datafusion_python::{ + datafusion::arrow::datatypes::DataType, + datafusion_common::Column, + datafusion_expr::{col, expr::Cast, lit, utils::expr_to_columns}, + }; use super::*; From edbc66971452a7347c78b97227ff0a9478eccc2a Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 14 Aug 2023 09:35:35 -0400 Subject: [PATCH 63/89] Adjust import path for doctest --- src/sql/optimizer/utils.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sql/optimizer/utils.rs b/src/sql/optimizer/utils.rs index 7711ca971..a48f85dc9 100644 --- a/src/sql/optimizer/utils.rs +++ b/src/sql/optimizer/utils.rs @@ -213,8 +213,8 @@ fn split_binary_impl<'a>( /// /// # Example /// ``` -/// # use datafusion_expr::{col, lit}; -/// # use datafusion_optimizer::utils::conjunction; +/// # use datafusion_python::datafusion_expr::{col, lit}; +/// # use datafusion_python::datafusion_optimizer::utils::conjunction; /// // a=1 AND b=2 /// let expr = col("a").eq(lit(1)).and(col("b").eq(lit(2))); /// From f6411a4a8a371997e12aaba5730c28d62b8adbac Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 14 Aug 2023 09:37:32 -0400 Subject: [PATCH 64/89] Adjust import path for doctest (more) --- src/sql/optimizer/utils.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sql/optimizer/utils.rs b/src/sql/optimizer/utils.rs index a48f85dc9..f72bbe5c3 100644 --- a/src/sql/optimizer/utils.rs +++ b/src/sql/optimizer/utils.rs @@ -118,8 +118,8 @@ pub(crate) fn extract_join_filters(maybe_filter: &LogicalPlan) -> Result<(Vec Vec { /// /// # Example /// ``` -/// # use datafusion_expr::{col, lit, Operator}; -/// # use datafusion_optimizer::utils::split_binary_owned; +/// # use datafusion_python::datafusion_expr::{col, lit, Operator}; +/// # use datafusion_python::datafusion_optimizer::utils::split_binary_owned; /// # use std::ops::Add; /// // a=1 + b=2 /// let expr = col("a").eq(lit(1)).add(col("b").eq(lit(2))); From a94864abf34f765e0e1e918fa1a6ee62f52a2b68 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:04:21 -0400 Subject: [PATCH 65/89] Check if zlib is installed on ubuntu runners --- .github/workflows/release.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 448541751..b1ae4fda2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,8 +25,6 @@ jobs: target: [x86_64, aarch64] steps: - uses: actions/checkout@v3 - - name: Install zlib - run: sudo apt update && sudo apt install -y zlib1g-dev - name: Install Protoc uses: arduino/setup-protoc@v1 with: @@ -35,6 +33,11 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' + - name: Check is zlib is installed + run: | + dpkg --list | grep zlib + + ls /usr/include | grep zlib - name: Build wheels uses: PyO3/maturin-action@v1 with: @@ -85,7 +88,7 @@ jobs: pip install twine twine check dist/* - ls -lh dist/ + ls dist/ - name: Upload binary wheels uses: actions/upload-artifact@v3 with: From 24f465f86fab9f3d46e09b60c14785f756f14329 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 14 Aug 2023 12:17:59 -0700 Subject: [PATCH 66/89] Try invoking maturin directly for conda builds --- continuous_integration/recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index 48cc8b748..c3a8f15f9 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -18,7 +18,7 @@ build: - dask-sql-server = dask_sql.server.app:main - dask-sql = dask_sql.cmd:main string: py{{ python | replace(".", "") }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script: RUST_BACKTRACE=full {{ PYTHON }} -m pip install . --no-deps -vv + script: maturin build -i {{ PYTHON }} --compatibility off --release requirements: build: From a0acebc83ad26d5290daea3475eafdf1add1033e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 14 Aug 2023 12:57:39 -0700 Subject: [PATCH 67/89] Revert "Try invoking maturin directly for conda builds" This reverts commit 24f465f86fab9f3d46e09b60c14785f756f14329. --- continuous_integration/recipe/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/continuous_integration/recipe/meta.yaml b/continuous_integration/recipe/meta.yaml index c3a8f15f9..48cc8b748 100644 --- a/continuous_integration/recipe/meta.yaml +++ b/continuous_integration/recipe/meta.yaml @@ -18,7 +18,7 @@ build: - dask-sql-server = dask_sql.server.app:main - dask-sql = dask_sql.cmd:main string: py{{ python | replace(".", "") }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }} - script: maturin build -i {{ PYTHON }} --compatibility off --release + script: RUST_BACKTRACE=full {{ PYTHON }} -m pip install . --no-deps -vv requirements: build: From 488cbaf4086f08c5b1c60dc1a23c33d84e7dee29 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Mon, 14 Aug 2023 13:04:32 -0700 Subject: [PATCH 68/89] Install protoc via apt --- .github/workflows/release.yml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b1ae4fda2..835d9e150 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,11 +25,8 @@ jobs: target: [x86_64, aarch64] steps: - uses: actions/checkout@v3 - - name: Install Protoc - uses: arduino/setup-protoc@v1 - with: - version: '3.x' - repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: Install protoc + run: sudo apt update && sudo apt install -y protobuf-compiler - uses: actions/setup-python@v4 with: python-version: '3.10' From 6ca444bf66c591768bb8aaf7c260ae4812f9cb0d Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 14 Aug 2023 17:41:48 -0400 Subject: [PATCH 69/89] Add zlib to conda environment so that conda install c-compiler can locate the necessary zlib header files --- continuous_integration/environment-3.10-dev.yaml | 1 + continuous_integration/environment-3.8-dev.yaml | 1 + continuous_integration/environment-3.9-dev.yaml | 1 + continuous_integration/gpuci/environment-3.10.yaml | 1 + continuous_integration/gpuci/environment-3.9.yaml | 1 + 5 files changed, 5 insertions(+) diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index b2f66081e..6703e2007 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -34,3 +34,4 @@ dependencies: - tzlocal>=2.1 - uvicorn>=0.13.4 - libprotobuf=3 +- zlib diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.8-dev.yaml index 5bf6aa424..106eb90fe 100644 --- a/continuous_integration/environment-3.8-dev.yaml +++ b/continuous_integration/environment-3.8-dev.yaml @@ -34,3 +34,4 @@ dependencies: - tzlocal=2.1 - uvicorn=0.13.4 - libprotobuf=3 +- zlib diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index 5df723d45..2566987a8 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -34,3 +34,4 @@ dependencies: - tzlocal>=2.1 - uvicorn>=0.13.4 - libprotobuf=3 +- zlib diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index 93479ebb1..fafb4a5e4 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -46,3 +46,4 @@ dependencies: - ucx-py=0.33 - xgboost=*=rapidsai_py* - libxgboost=*=rapidsai_h* +- zlib diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 985483adc..362a02022 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -46,3 +46,4 @@ dependencies: - ucx-py=0.33 - xgboost=*=rapidsai_py* - libxgboost=*=rapidsai_h* +- zlib From 4109b382e3ab099c5e0bdebed6a970f8f9053faf Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Mon, 14 Aug 2023 18:32:35 -0400 Subject: [PATCH 70/89] Remove pytest coalesce option for Sum(b) with a string conditional result as that is not valid sql in some cases --- tests/integration/test_rex.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_rex.py b/tests/integration/test_rex.py index 3f720e6d0..e099a3ddb 100644 --- a/tests/integration/test_rex.py +++ b/tests/integration/test_rex.py @@ -407,8 +407,7 @@ def test_coalesce(c, gpu): COALESCE(NULL, 'hi') as c3, COALESCE(NULL, NULL, 'bye', 5/0) as c4, COALESCE(NULL, 3/2, NULL, 'fly') as c5, - COALESCE(SUM(b), 'why', 2.2) as c6, - COALESCE(NULL, MEAN(b), MEAN(a), 4/0) as c7 + COALESCE(NULL, MEAN(b), MEAN(a), 4/0) as c6 FROM df """ ) @@ -420,8 +419,7 @@ def test_coalesce(c, gpu): "c3": ["hi"], "c4": ["bye"], "c5": ["1.5"], - "c6": ["why"], - "c7": [2.0], + "c6": [2.0], } ) From 3cadd47f3d74fb4ce089b292f9704e9980a09c1e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 06:48:24 -0700 Subject: [PATCH 71/89] Revert "Install protoc via apt" This reverts commit 488cbaf4086f08c5b1c60dc1a23c33d84e7dee29. --- .github/workflows/release.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 835d9e150..b1ae4fda2 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,8 +25,11 @@ jobs: target: [x86_64, aarch64] steps: - uses: actions/checkout@v3 - - name: Install protoc - run: sudo apt update && sudo apt install -y protobuf-compiler + - name: Install Protoc + uses: arduino/setup-protoc@v1 + with: + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} - uses: actions/setup-python@v4 with: python-version: '3.10' From 276f2fab532510e0115efd73dcfeee2e8cfc96b4 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 06:52:19 -0700 Subject: [PATCH 72/89] Try not using zig for x86_64 builds --- .github/workflows/release.yml | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b1ae4fda2..5b6bb0d9d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,12 +33,16 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - name: Check is zlib is installed - run: | - dpkg --list | grep zlib - - ls /usr/include | grep zlib - - name: Build wheels + - name: Build wheels for x86_64 + if: matrix.target == 'x86_64' + uses: PyO3/maturin-action@v1 + with: + target: ${{ matrix.target }} + args: --release --out dist + sccache: 'true' + manylinux: auto + - name: Build wheels for aarch64 + if: matrix.target == 'aarch64' uses: PyO3/maturin-action@v1 with: target: ${{ matrix.target }} From 66ebed445f5b3b0389b9bdfceb8c99827349b4aa Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 07:12:46 -0700 Subject: [PATCH 73/89] Try installing protoc from apt again --- .github/workflows/release.yml | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5b6bb0d9d..369caa109 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,11 +25,9 @@ jobs: target: [x86_64, aarch64] steps: - uses: actions/checkout@v3 - - name: Install Protoc - uses: arduino/setup-protoc@v1 - with: - version: '3.x' - repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: Install protoc + run: | + sudo apt update && sudo apt install -y protobuf-compiler - uses: actions/setup-python@v4 with: python-version: '3.10' From 789dd3517dedc3db881ad4a918016bfb6ab71429 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 07:32:27 -0700 Subject: [PATCH 74/89] Revert "Try installing protoc from apt again" This reverts commit 66ebed445f5b3b0389b9bdfceb8c99827349b4aa. --- .github/workflows/release.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 369caa109..5b6bb0d9d 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,9 +25,11 @@ jobs: target: [x86_64, aarch64] steps: - uses: actions/checkout@v3 - - name: Install protoc - run: | - sudo apt update && sudo apt install -y protobuf-compiler + - name: Install Protoc + uses: arduino/setup-protoc@v1 + with: + version: '3.x' + repo-token: ${{ secrets.GITHUB_TOKEN }} - uses: actions/setup-python@v4 with: python-version: '3.10' From d154e6663bbbc9d6e825cc0bb65b7e689653a952 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 07:38:36 -0700 Subject: [PATCH 75/89] Try explicitly setting PROTOC location for x86_64 builds --- .github/workflows/release.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 5b6bb0d9d..eb4677e96 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -41,6 +41,8 @@ jobs: args: --release --out dist sccache: 'true' manylinux: auto + env: + PROTOC: '/opt/hostedtoolcache/protoc/3.20.3/x64/protoc' - name: Build wheels for aarch64 if: matrix.target == 'aarch64' uses: PyO3/maturin-action@v1 From 3ac265ce1b64b666d995cf30926249de6c57407e Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 07:44:39 -0700 Subject: [PATCH 76/89] Where is protoc? --- .github/workflows/release.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index eb4677e96..7fdd64021 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,6 +33,9 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' + - name: Where is protoc? + run: | + which protoc - name: Build wheels for x86_64 if: matrix.target == 'x86_64' uses: PyO3/maturin-action@v1 From 64411d465c455976687f86896cc1baea50bf98fb Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 07:47:33 -0700 Subject: [PATCH 77/89] Fix protoc binary location --- .github/workflows/release.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7fdd64021..e5c35fd9c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -33,9 +33,6 @@ jobs: - uses: actions/setup-python@v4 with: python-version: '3.10' - - name: Where is protoc? - run: | - which protoc - name: Build wheels for x86_64 if: matrix.target == 'x86_64' uses: PyO3/maturin-action@v1 @@ -45,7 +42,7 @@ jobs: sccache: 'true' manylinux: auto env: - PROTOC: '/opt/hostedtoolcache/protoc/3.20.3/x64/protoc' + PROTOC: '/opt/hostedtoolcache/protoc/3.20.3/x64/bin/protoc' - name: Build wheels for aarch64 if: matrix.target == 'aarch64' uses: PyO3/maturin-action@v1 From 9ed05503f7358f319361017d9058e9a6d132ffd6 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:03:47 -0700 Subject: [PATCH 78/89] Disable docker container for linux x86_64 build --- .github/workflows/release.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e5c35fd9c..f60ba0280 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -41,8 +41,7 @@ jobs: args: --release --out dist sccache: 'true' manylinux: auto - env: - PROTOC: '/opt/hostedtoolcache/protoc/3.20.3/x64/bin/protoc' + container: 'off' - name: Build wheels for aarch64 if: matrix.target == 'aarch64' uses: PyO3/maturin-action@v1 From 1688ce0976ba6c6adc3c19a9c847417de00e8221 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:36:31 -0700 Subject: [PATCH 79/89] Properly upload artifacts for ARM/intel --- .github/workflows/release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index f60ba0280..7ac2675d8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -59,7 +59,7 @@ jobs: - name: Upload binary wheels uses: actions/upload-artifact@v3 with: - name: wheels for linux + name: wheels for linux ${{ matrix.target }} path: dist/* - name: Publish package if: env.upload == 'true' @@ -138,7 +138,7 @@ jobs: - name: Upload binary wheels uses: actions/upload-artifact@v3 with: - name: wheels for macos + name: wheels for macos ${{ matrix.target }} path: dist/* - name: Publish package if: env.upload == 'true' From e3210f7be672af0fed99e73cee36dfd11f426c66 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 08:37:47 -0700 Subject: [PATCH 80/89] Disable aarch64 builds for now --- .github/workflows/conda.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/conda.yml b/.github/workflows/conda.yml index e7aae8096..833f37293 100644 --- a/.github/workflows/conda.yml +++ b/.github/workflows/conda.yml @@ -33,7 +33,9 @@ jobs: fail-fast: false matrix: python: ["3.8", "3.9", "3.10"] - arch: ["linux-64", "linux-aarch64"] + # FIXME: aarch64 builds are consuming too much memory to run on GHA + # arch: ["linux-64", "linux-aarch64"] + arch: ["linux-64"] steps: - uses: actions/checkout@v3 with: From 4efeb0185ac8ac42babd696775d39f8966e6dc50 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 09:27:18 -0700 Subject: [PATCH 81/89] Constrain mlflow to avoid import error --- continuous_integration/environment-3.10-dev.yaml | 4 +++- continuous_integration/environment-3.8-dev.yaml | 4 +++- continuous_integration/environment-3.9-dev.yaml | 4 +++- continuous_integration/gpuci/environment-3.10.yaml | 4 +++- continuous_integration/gpuci/environment-3.9.yaml | 4 +++- 5 files changed, 15 insertions(+), 5 deletions(-) diff --git a/continuous_integration/environment-3.10-dev.yaml b/continuous_integration/environment-3.10-dev.yaml index 6703e2007..4589cc1b5 100644 --- a/continuous_integration/environment-3.10-dev.yaml +++ b/continuous_integration/environment-3.10-dev.yaml @@ -12,7 +12,9 @@ dependencies: - jsonschema - lightgbm - maturin>=1.1,<1.2 -- mlflow +# FIXME: mlflow 2.6.0 has import issues related to pydantic +# https://github.com/mlflow/mlflow/issues/9331 +- mlflow<2.6 - mock - numpy>=1.21.6 - pandas>=1.4.0 diff --git a/continuous_integration/environment-3.8-dev.yaml b/continuous_integration/environment-3.8-dev.yaml index 106eb90fe..5a9255390 100644 --- a/continuous_integration/environment-3.8-dev.yaml +++ b/continuous_integration/environment-3.8-dev.yaml @@ -12,7 +12,9 @@ dependencies: - jsonschema - lightgbm - maturin=1.1 -- mlflow +# FIXME: mlflow 2.6.0 has import issues related to pydantic +# https://github.com/mlflow/mlflow/issues/9331 +- mlflow<2.6 - mock - numpy=1.21.6 - pandas=1.4.0 diff --git a/continuous_integration/environment-3.9-dev.yaml b/continuous_integration/environment-3.9-dev.yaml index 2566987a8..f807d2e87 100644 --- a/continuous_integration/environment-3.9-dev.yaml +++ b/continuous_integration/environment-3.9-dev.yaml @@ -12,7 +12,9 @@ dependencies: - jsonschema - lightgbm - maturin>=1.1,<1.2 -- mlflow +# FIXME: mlflow 2.6.0 has import issues related to pydantic +# https://github.com/mlflow/mlflow/issues/9331 +- mlflow<2.6 - mock - numpy>=1.21.6 - pandas>=1.4.0 diff --git a/continuous_integration/gpuci/environment-3.10.yaml b/continuous_integration/gpuci/environment-3.10.yaml index fafb4a5e4..6d8372da4 100644 --- a/continuous_integration/gpuci/environment-3.10.yaml +++ b/continuous_integration/gpuci/environment-3.10.yaml @@ -15,7 +15,9 @@ dependencies: - jsonschema - lightgbm - maturin>=1.1,<1.2 -- mlflow +# FIXME: mlflow 2.6.0 has import issues related to pydantic +# https://github.com/mlflow/mlflow/issues/9331 +- mlflow<2.6 - mock - numpy>=1.21.6 - pandas>=1.4.0 diff --git a/continuous_integration/gpuci/environment-3.9.yaml b/continuous_integration/gpuci/environment-3.9.yaml index 362a02022..52c82a1e2 100644 --- a/continuous_integration/gpuci/environment-3.9.yaml +++ b/continuous_integration/gpuci/environment-3.9.yaml @@ -15,7 +15,9 @@ dependencies: - jsonschema - lightgbm - maturin>=1.1,<1.2 -- mlflow +# FIXME: mlflow 2.6.0 has import issues related to pydantic +# https://github.com/mlflow/mlflow/issues/9331 +- mlflow<2.6 - mock - numpy>=1.21.6 - pandas>=1.4.0 From 6020cdff7edf70caba5b5bd654b4af3f6084e062 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 10:18:43 -0700 Subject: [PATCH 82/89] Set wheel tags to manylinux_2_17 --- .github/workflows/release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 7ac2675d8..9896eae33 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -40,7 +40,7 @@ jobs: target: ${{ matrix.target }} args: --release --out dist sccache: 'true' - manylinux: auto + manylinux: '2_17' container: 'off' - name: Build wheels for aarch64 if: matrix.target == 'aarch64' @@ -49,7 +49,7 @@ jobs: target: ${{ matrix.target }} args: --release --out dist --zig sccache: 'true' - manylinux: auto + manylinux: '2_17' - name: Check dist files run: | pip install twine From d7d84132707f5aea9284ee62faf5a4a07ca17295 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 10:44:47 -0700 Subject: [PATCH 83/89] Use manylinux docker container for x86_64 builds --- .github/workflows/release.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 9896eae33..b9d9f818c 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -27,6 +27,7 @@ jobs: - uses: actions/checkout@v3 - name: Install Protoc uses: arduino/setup-protoc@v1 + if: matrix.target == 'aarch64' with: version: '3.x' repo-token: ${{ secrets.GITHUB_TOKEN }} @@ -41,7 +42,8 @@ jobs: args: --release --out dist sccache: 'true' manylinux: '2_17' - container: 'off' + before-script-linux: | + sudo apt-get update && sudo apt-get install protobuf-compiler - name: Build wheels for aarch64 if: matrix.target == 'aarch64' uses: PyO3/maturin-action@v1 From 319e2ef980de2485d815f5dbd101d7f9ce6acb58 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 10:50:13 -0700 Subject: [PATCH 84/89] No sudo for protoc installation --- .github/workflows/release.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b9d9f818c..924fcf128 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -43,7 +43,7 @@ jobs: sccache: 'true' manylinux: '2_17' before-script-linux: | - sudo apt-get update && sudo apt-get install protobuf-compiler + apt-get update && apt-get install protobuf-compiler - name: Build wheels for aarch64 if: matrix.target == 'aarch64' uses: PyO3/maturin-action@v1 From 70dca3fc6c746881177ea5aabc70f104300c63e7 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 10:56:06 -0700 Subject: [PATCH 85/89] Install protoc directly from github --- .github/workflows/release.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 924fcf128..6e992b5fc 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -42,8 +42,11 @@ jobs: args: --release --out dist sccache: 'true' manylinux: '2_17' - before-script-linux: | - apt-get update && apt-get install protobuf-compiler + before-script-linux: > + DOWNLOAD_URL=$(curl --retry 6 --retry-delay 10 -s https://api.github.com/repos/protocolbuffers/protobuf/releases/latest | grep -o '"browser_download_url": "[^"]*' | cut -d'"' -f4 | grep "\linux-x86_64.zip$") && + curl --retry 6 --retry-delay 10 -LO $DOWNLOAD_URL && + unzip protoc-*-linux-x86_64.zip -d $HOME/.local && + protoc --version - name: Build wheels for aarch64 if: matrix.target == 'aarch64' uses: PyO3/maturin-action@v1 From 7b7d7a9790026f891d25c789628ff04e1f1f4798 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Tue, 15 Aug 2023 11:08:32 -0700 Subject: [PATCH 86/89] Specify PROTOC environment variable for x86_64 runs --- .github/workflows/release.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 6e992b5fc..6d63f6373 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -45,8 +45,8 @@ jobs: before-script-linux: > DOWNLOAD_URL=$(curl --retry 6 --retry-delay 10 -s https://api.github.com/repos/protocolbuffers/protobuf/releases/latest | grep -o '"browser_download_url": "[^"]*' | cut -d'"' -f4 | grep "\linux-x86_64.zip$") && curl --retry 6 --retry-delay 10 -LO $DOWNLOAD_URL && - unzip protoc-*-linux-x86_64.zip -d $HOME/.local && - protoc --version + unzip protoc-*-linux-x86_64.zip -d $HOME/.local + docker-options: --env PROTOC=/root/.local/bin/protoc - name: Build wheels for aarch64 if: matrix.target == 'aarch64' uses: PyO3/maturin-action@v1 From 60ef2e74eeda18f410317779ab6d4a4dc74fde39 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 17 Aug 2023 08:45:41 -0700 Subject: [PATCH 87/89] More doc updates to reflect new installation style --- CONTRIBUTING.md | 10 ++++------ README.md | 5 +---- docs/source/installation.rst | 7 +------ 3 files changed, 6 insertions(+), 16 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a6cd56c59..0f5adc85a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -39,14 +39,14 @@ DataFusion provides Dask-SQL with key functionality. ### Building Building the Dask-SQL Rust codebase is a straightforward process. If you create and activate the Dask-SQL Conda environment the Rust compiler and all necessary components will be installed for you during that process and therefore requires no further manual setup. -`setuptools-rust` is used by Dask-SQL for building and bundling the resulting Rust binaries. This helps make building and installing the Rust binaries feel much more like a native Python workflow. +`maturin` is used by Dask-SQL for building and bundling the resulting Rust binaries. This helps make building and installing the Rust binaries feel much more like a native Python workflow. -More details about the building setup can be found at [setup.py](setup.py) and searching for `rust_extensions` which is the hook for the Rust code build and inclusion. +More details about the building setup can be found in [pyproject.toml](pyproject.toml) and [Cargo.toml](Cargo.toml) -Note that while `setuptools-rust` is used by CI and should be used during your development cycle, if the need arises to do something more specific that is not yet supported by `setuptools-rust` you can opt to use `cargo` directly from the command line. +Note that while `maturin` is used by CI and should be used during your development cycle, if the need arises to do something more specific that is not yet supported by `maturin` you can opt to use `cargo` directly from the command line. #### Building with Python -Building Dask-SQL is straightforward with Python. To build run ```python setup.py install```. This will build both the Rust and Python codebase and install it into your locally activated conda environment. While not required, if you have updated dependencies for Rust you might prefer a clean build. To clean your setup run ```python setup.py clean``` and then run ```python setup.py install``` +Building Dask-SQL is straightforward with Python. To build run ```pip install .```. This will build both the Rust and Python codebase and install it into your locally activated conda environment; note that if your Rust dependencies have been updated, this command must be rerun to rebuild the Rust codebase. #### DataFusion Modules DataFusion is broken down into a few modules. We consume those modules in our [Cargo.toml](Cargo.toml). The modules that we use currently are @@ -59,8 +59,6 @@ DataFusion is broken down into a few modules. We consume those modules in our [C #### Retrieving Upstream Dependencies During development you might find yourself needing some upstream DataFusion changes not present in the projects current version. Luckily this can easily be achieved by updating [Cargo.toml](Cargo.toml) and changing the `rev` to the SHA of the version you need. Note that the same SHA should be used for all DataFusion modules. -After updating the `Cargo.toml` file the codebase can be re-built to reflect those changes by running `python setup.py install` - #### Local Documentation Sometimes when building against the latest Github commits for DataFusion you may find that the features you are consuming do not have their documentation public yet. In this case it can be helpful to build the DataFusion documentation locally so that it can be referenced to assist with development. Here is a rough outline for building that documentation locally. diff --git a/README.md b/README.md index ac27aea33..d08aa0328 100644 --- a/README.md +++ b/README.md @@ -110,10 +110,7 @@ After that, you can install the package in development mode pip install -e ".[dev]" The Rust DataFusion bindings are built as part of the `pip install`. -If changes are made to the Rust source in `src/`, another build/install must be run to recompile the bindings: - - python setup.py build install - +Note that if changes are made to the Rust source in `src/`, another build must be run to recompile the bindings. This repository uses [pre-commit](https://pre-commit.com/) hooks. To install them, call pre-commit install diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 71ce17959..a2a3ee895 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -84,12 +84,7 @@ After that, you can install the package in development mode pip install -e ".[dev]" -To compile the Rust code (after changes), run - -.. code-block:: bash - - python setup.py build_ext - +To compile the Rust code (after changes), the above command must be rerun. You can run the tests (after installation) with .. code-block:: bash From 8a495525240aee26fb65985f3bddfa1fbe00ecc3 Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 17 Aug 2023 09:20:48 -0700 Subject: [PATCH 88/89] Fix docker builds --- docker/main.dockerfile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docker/main.dockerfile b/docker/main.dockerfile index 9f16958d7..5b56bb879 100644 --- a/docker/main.dockerfile +++ b/docker/main.dockerfile @@ -32,11 +32,15 @@ RUN mamba install -y \ && conda clean -ay # install dask-sql +COPY Cargo.toml /opt/dask_sql/ +COPY Cargo.lock /opt/dask_sql/ +COPY pyproject.toml /opt/dask_sql/ COPY setup.py /opt/dask_sql/ COPY setup.cfg /opt/dask_sql/ COPY versioneer.py /opt/dask_sql/ +COPY README.md /opt/dask_sql/ COPY .git /opt/dask_sql/.git -COPY dask_planner /opt/dask_sql/dask_planner +COPY src /opt/dask_sql/src COPY dask_sql /opt/dask_sql/dask_sql RUN cd /opt/dask_sql/ \ && pip install -e . -vv From 6e7c4516377e0d14ac7273d17eeedd241346cfed Mon Sep 17 00:00:00 2001 From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com> Date: Thu, 17 Aug 2023 09:41:15 -0700 Subject: [PATCH 89/89] Bump ADP to stable 28.0.0 --- Cargo.lock | 27 ++++++++++++++++++--------- Cargo.toml | 2 +- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7ad7d07f5..c3f7d8600 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -674,7 +674,8 @@ dependencies = [ [[package]] name = "datafusion" version = "28.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ddbcb2dda5b5033537457992ebde78938014390b2b19f9f4282e3be0e18b0c3" dependencies = [ "ahash", "apache-avro", @@ -724,7 +725,8 @@ dependencies = [ [[package]] name = "datafusion-common" version = "28.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85fbb7b4da925031311743ab96662d55f0f7342d3692744f184f99b2257ef435" dependencies = [ "apache-avro", "arrow", @@ -740,7 +742,8 @@ dependencies = [ [[package]] name = "datafusion-execution" version = "28.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5bb3617466d894eb0ad11d06bab1e6e89c571c0a27d660685d327d0c6e1e1ccd" dependencies = [ "dashmap", "datafusion-common", @@ -757,7 +760,8 @@ dependencies = [ [[package]] name = "datafusion-expr" version = "28.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3bd8220a0dfcdfddcc785cd7e71770ef1ce54fbe1e08984e5adf537027ecb6de" dependencies = [ "ahash", "arrow", @@ -771,7 +775,8 @@ dependencies = [ [[package]] name = "datafusion-optimizer" version = "28.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d685a100c66952aaadd0cbe766df46d1887d58fc8bcf3589e6387787f18492b" dependencies = [ "arrow", "async-trait", @@ -788,7 +793,8 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" version = "28.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f2c635da9b05b4b4c6c8d935f46fd99f9b6225f834091cf4e3c8a045b68beab" dependencies = [ "ahash", "arrow", @@ -822,7 +828,8 @@ dependencies = [ [[package]] name = "datafusion-python" version = "28.0.0" -source = "git+https://github.com/apache/arrow-datafusion-python.git?rev=309fc486c47d86776aeec07d86cd04b5d70d97a1#309fc486c47d86776aeec07d86cd04b5d70d97a1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a2441774e84875ae16a8b5277090ed6ab77ce94ab1820c315ed02cd3813de29" dependencies = [ "async-trait", "datafusion", @@ -850,7 +857,8 @@ dependencies = [ [[package]] name = "datafusion-sql" version = "28.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ef8abf4dd84d3f20c910822b52779c035ab7f4f2d5e7125ede3bae618e9de8" dependencies = [ "arrow", "arrow-schema", @@ -863,7 +871,8 @@ dependencies = [ [[package]] name = "datafusion-substrait" version = "28.0.0" -source = "git+https://github.com/apache/arrow-datafusion.git?rev=28.0.0-rc1#51b4392577554becf637a8adcefa0e7fdc79e41f" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c97d351bbd6bd6497e7c9606ddd3c00cd63e9d185d7ab96fc8a66cf3c449177" dependencies = [ "async-recursion", "chrono", diff --git a/Cargo.toml b/Cargo.toml index 6da783b80..826a5df7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ include = ["/src", "/dask_sql", "/LICENSE.txt", "pyproject.toml", "Cargo.toml", [dependencies] async-trait = "0.1.71" -datafusion-python = { git = "https://github.com/apache/arrow-datafusion-python.git", rev = "309fc486c47d86776aeec07d86cd04b5d70d97a1" } +datafusion-python = "28.0.0" env_logger = "0.10" log = "^0.4" pyo3 = { version = "0.19.1", features = ["extension-module", "abi3", "abi3-py38"] }