From b0b41fa6880345f18bc1408707721072552aab46 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 27 Feb 2026 09:06:38 +0000 Subject: [PATCH 1/5] Initial plan From 6f7d87073f17df530adbb88b2f1347a1d6e36c48 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 27 Feb 2026 09:44:53 +0000 Subject: [PATCH 2/5] chore: remove `sql` feature and sqlparser dependency from DataFusion Co-authored-by: linhr <5601366+linhr@users.noreply.github.com> --- Cargo.lock | 5 -- Cargo.toml | 24 ++++++-- crates/sail-common-datafusion/src/error.rs | 1 - .../src/resolver/expression/wildcard.rs | 55 +++++++++++-------- 4 files changed, 51 insertions(+), 34 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6da835387d..e3362bfa2b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1831,7 +1831,6 @@ dependencies = [ "rand 0.9.2", "regex", "serde", - "sqlparser", "tempfile", "tokio", "url", @@ -2298,7 +2297,6 @@ dependencies = [ "indexmap 2.13.0", "itertools", "log", - "recursive", "regex", "regex-syntax", ] @@ -2375,7 +2373,6 @@ dependencies = [ "datafusion-physical-plan", "datafusion-pruning", "itertools", - "recursive", ] [[package]] @@ -2514,7 +2511,6 @@ dependencies = [ "datafusion-expr", "indexmap 2.13.0", "log", - "recursive", "regex", "sqlparser", ] @@ -7047,7 +7043,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4591acadbcf52f0af60eafbb2c003232b2b4cd8de5f0e9437cb8b1b59046cc0f" dependencies = [ "log", - "recursive", "sqlparser_derive", ] diff --git a/Cargo.toml b/Cargo.toml index 2621ab6f9a..438813723e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -141,15 +141,29 @@ prost-types = "0.14" # The `axum` version must match the one used in `tonic` (replace `RELEASE` with the release we are using): # https://github.com/hyperium/tonic/blob/vRELEASE/tonic/Cargo.toml axum = "0.8.8" -datafusion = { version = "52.1.0", features = ["serde", "avro", 
"sql"] } -datafusion-common = { version = "52.1.0", features = ["object_store", "avro"] } +datafusion = { version = "52.1.0", default-features = false, features = [ + "nested_expressions", + "crypto_expressions", + "datetime_expressions", + "encoding_expressions", + "regex_expressions", + "string_expressions", + "unicode_expressions", + "compression", + "parquet", + "serde", + "avro", +] } +datafusion-common = { version = "52.1.0", features = ["object_store", "avro", "recursive_protection"] } datafusion-datasource = { version = "52.1.0" } -datafusion-expr = { version = "52.1.0" } +datafusion-expr = { version = "52.1.0", default-features = false, features = ["recursive_protection"] } datafusion-expr-common = { version = "52.1.0" } +datafusion-optimizer = { version = "52.1.0", features = ["recursive_protection"] } +datafusion-physical-optimizer = { version = "52.1.0", features = ["recursive_protection"] } datafusion-proto = { version = "52.1.0" } datafusion-functions = { version = "52.1.0" } -datafusion-functions-nested = { version = "52.1.0" } -datafusion-physical-expr = { version = "52.1.0" } +datafusion-functions-nested = { version = "52.1.0", default-features = false } +datafusion-physical-expr = { version = "52.1.0", features = ["recursive_protection"] } datafusion-session = { version = "52.1.0" } datafusion-spark = { version = "52.1.0" } # The `pyo3` version must match the one used in `arrow-pyarrow` (replace `RELEASE` with the release we are using): diff --git a/crates/sail-common-datafusion/src/error.rs b/crates/sail-common-datafusion/src/error.rs index 207c30ef1a..3d560c14ad 100644 --- a/crates/sail-common-datafusion/src/error.rs +++ b/crates/sail-common-datafusion/src/error.rs @@ -128,7 +128,6 @@ impl CommonErrorCause { DataFusionError::AvroError(e) => Self::FormatAvro(e.to_string()), DataFusionError::ObjectStore(e) => Self::Io(e.to_string()), DataFusionError::IoError(e) => Self::Io(e.to_string()), - DataFusionError::SQL(e, _) => 
Self::Unknown(e.to_string()), DataFusionError::NotImplemented(x) => Self::NotImplemented(x.clone()), DataFusionError::Internal(x) => Self::Internal(x.clone()), DataFusionError::Plan(x) => Self::Plan(x.clone()), diff --git a/crates/sail-plan/src/resolver/expression/wildcard.rs b/crates/sail-plan/src/resolver/expression/wildcard.rs index 574409e46b..da5e6bdc93 100644 --- a/crates/sail-plan/src/resolver/expression/wildcard.rs +++ b/crates/sail-plan/src/resolver/expression/wildcard.rs @@ -1,7 +1,6 @@ use std::collections::VecDeque; use arrow::datatypes::DataType; -use datafusion::sql::unparser::expr_to_sql; use datafusion_common::{DFSchemaRef, TableReference}; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::{col, expr, lit, ScalarUDF}; @@ -148,18 +147,24 @@ impl PlanResolver<'_> { schema: &DFSchemaRef, state: &mut PlanResolverState, ) -> PlanResult { - use datafusion::sql::sqlparser::ast; + fn make_ident(value: impl Into) -> expr::Ident { + expr::Ident { + value: value.into(), + quote_style: None, + span: String::new(), + } + } let ilike = wildcard_options .ilike_pattern - .map(|x| ast::IlikeSelectItem { pattern: x }); + .map(|x| expr::IlikeSelectItem { pattern: x }); let exclude = wildcard_options .exclude_columns .map(|x| { let exclude = if x.len() > 1 { - ast::ExcludeSelectItem::Multiple(x.into_iter().map(ast::Ident::new).collect()) + expr::ExcludeSelectItem::Multiple(x.into_iter().map(make_ident).collect()) } else if let Some(x) = x.into_iter().next() { - ast::ExcludeSelectItem::Single(ast::Ident::new(x)) + expr::ExcludeSelectItem::Single(make_ident(x)) } else { return Err(PlanError::invalid( "exclude columns must have at least one column", @@ -176,14 +181,14 @@ impl PlanResolver<'_> { let first_element = deque.pop_front().ok_or_else(|| { PlanError::invalid("except columns must have at least one column") })?; - let additional_elements = deque.into_iter().map(ast::Ident::new).collect(); - ast::ExceptSelectItem { - first_element: 
ast::Ident::new(first_element), + let additional_elements = deque.into_iter().map(make_ident).collect(); + expr::ExceptSelectItem { + first_element: make_ident(first_element), additional_elements, } } else if let Some(x) = x.into_iter().next() { - ast::ExceptSelectItem { - first_element: ast::Ident::new(x), + expr::ExceptSelectItem { + first_element: make_ident(x), additional_elements: vec![], } } else { @@ -202,9 +207,9 @@ impl PlanResolver<'_> { let expression = self .resolve_expression(*elem.expression, schema, state) .await?; - let item = ast::ReplaceSelectElement { - expr: expr_to_sql(&expression)?, - column_name: ast::Ident::new(elem.column_name), + let item = expr::ReplaceSelectElement { + expr: expression.to_string(), + column_name: make_ident(elem.column_name), as_keyword: elem.as_keyword, }; items.push(item); @@ -220,26 +225,30 @@ impl PlanResolver<'_> { let rename = wildcard_options .rename_columns .map(|x| { - let exclude = if x.len() > 1 { - ast::RenameSelectItem::Multiple( + let rename = if x.len() > 1 { + expr::RenameSelectItem::Multiple( x.into_iter() - .map(|x| ast::IdentWithAlias { - ident: ast::Ident::new(x.identifier), - alias: ast::Ident::new(x.alias), + .map(|x| { + format!( + "{} AS {}", + String::from(x.identifier), + String::from(x.alias) + ) }) .collect(), ) } else if let Some(x) = x.into_iter().next() { - ast::RenameSelectItem::Single(ast::IdentWithAlias { - ident: ast::Ident::new(x.identifier), - alias: ast::Ident::new(x.alias), - }) + expr::RenameSelectItem::Single(format!( + "{} AS {}", + String::from(x.identifier), + String::from(x.alias) + )) } else { return Err(PlanError::invalid( "exclude columns must have at least one column", )); }; - Ok(exclude) + Ok(rename) }) .transpose()?; Ok(expr::WildcardOptions { From facc8829721c69b1e0e7c172825c269158137d2b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 27 Feb 2026 10:26:05 +0000 Subject: [PATCH 3/5] fix: remove avro feature 
from datafusion to eliminate datafusion-sql and sqlparser from compiled deps Co-authored-by: linhr <5601366+linhr@users.noreply.github.com> --- Cargo.lock | 3 ++- Cargo.toml | 2 +- crates/sail-data-source/Cargo.toml | 1 + crates/sail-data-source/src/formats/avro/mod.rs | 2 +- crates/sail-execution/Cargo.toml | 1 + crates/sail-execution/src/codec.rs | 4 ++-- 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e3362bfa2b..c4e1bed80e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1800,7 +1800,6 @@ dependencies = [ "datafusion-common-runtime", "datafusion-datasource", "datafusion-datasource-arrow", - "datafusion-datasource-avro", "datafusion-datasource-csv", "datafusion-datasource-json", "datafusion-datasource-parquet", @@ -6083,6 +6082,7 @@ dependencies = [ "datafusion", "datafusion-common", "datafusion-datasource", + "datafusion-datasource-avro", "datafusion-session", "futures", "glob", @@ -6149,6 +6149,7 @@ dependencies = [ "arrow-flight", "chrono", "datafusion", + "datafusion-datasource-avro", "datafusion-proto", "datafusion-spark", "fastrace", diff --git a/Cargo.toml b/Cargo.toml index 438813723e..b037acd0a8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -152,10 +152,10 @@ datafusion = { version = "52.1.0", default-features = false, features = [ "compression", "parquet", "serde", - "avro", ] } datafusion-common = { version = "52.1.0", features = ["object_store", "avro", "recursive_protection"] } datafusion-datasource = { version = "52.1.0" } +datafusion-datasource-avro = { version = "52.1.0" } datafusion-expr = { version = "52.1.0", default-features = false, features = ["recursive_protection"] } datafusion-expr-common = { version = "52.1.0" } datafusion-optimizer = { version = "52.1.0", features = ["recursive_protection"] } diff --git a/crates/sail-data-source/Cargo.toml b/crates/sail-data-source/Cargo.toml index 55bc1b69c3..94ac5be1b1 100644 --- a/crates/sail-data-source/Cargo.toml +++ b/crates/sail-data-source/Cargo.toml @@ 
-17,6 +17,7 @@ futures = { workspace = true } datafusion = { workspace = true } datafusion-common = { workspace = true } datafusion-datasource = { workspace = true } +datafusion-datasource-avro = { workspace = true } datafusion-session = { workspace = true } object_store = { workspace = true } chumsky = { workspace = true } diff --git a/crates/sail-data-source/src/formats/avro/mod.rs b/crates/sail-data-source/src/formats/avro/mod.rs index 2d5fc2452f..a845778002 100644 --- a/crates/sail-data-source/src/formats/avro/mod.rs +++ b/crates/sail-data-source/src/formats/avro/mod.rs @@ -2,9 +2,9 @@ use std::collections::HashMap; use std::sync::Arc; use datafusion::catalog::Session; -use datafusion::datasource::file_format::avro::AvroFormat; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_datasource::file_format::FileFormat; +use datafusion_datasource_avro::AvroFormat; use crate::formats::listing::{DefaultSchemaInfer, ListingFormat, ListingTableFormat, SchemaInfer}; diff --git a/crates/sail-execution/Cargo.toml b/crates/sail-execution/Cargo.toml index 40495ee605..9c281d4123 100644 --- a/crates/sail-execution/Cargo.toml +++ b/crates/sail-execution/Cargo.toml @@ -29,6 +29,7 @@ fastrace = { workspace = true } tower = { workspace = true } prost = { workspace = true } datafusion = { workspace = true } +datafusion-datasource-avro = { workspace = true } datafusion-proto = { workspace = true } datafusion-spark = { workspace = true } arrow-flight = { workspace = true } diff --git a/crates/sail-execution/src/codec.rs b/crates/sail-execution/src/codec.rs index cdca3ba84d..99750ef107 100644 --- a/crates/sail-execution/src/codec.rs +++ b/crates/sail-execution/src/codec.rs @@ -11,8 +11,7 @@ use datafusion::common::{ use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::memory::MemorySourceConfig; use datafusion::datasource::physical_plan::{ - ArrowSource, AvroSource, FileScanConfig, FileScanConfigBuilder, 
FileSink, FileSinkConfig, - JsonSource, + ArrowSource, FileScanConfig, FileScanConfigBuilder, FileSink, FileSinkConfig, JsonSource, }; use datafusion::datasource::sink::DataSinkExec; use datafusion::datasource::source::{DataSource, DataSourceExec}; @@ -33,6 +32,7 @@ use datafusion::physical_plan::recursive_query::RecursiveQueryExec; use datafusion::physical_plan::sorts::partial_sort::PartialSortExec; use datafusion::physical_plan::work_table::WorkTableExec; use datafusion::physical_plan::{ExecutionPlan, PlanProperties}; +use datafusion_datasource_avro::source::AvroSource; use datafusion_proto::generated::datafusion_common as gen_datafusion_common; use datafusion_proto::physical_plan::from_proto::{ parse_physical_expr, parse_physical_sort_exprs, parse_protobuf_file_scan_config, From d5ab5c5351f430e0b64a5e44b72ecb192166d6a2 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 26 Mar 2026 12:37:09 +0000 Subject: [PATCH 4/5] chore: merge origin/main and update to DataFusion 53 without sql feature Co-authored-by: lonless9 <167735979+lonless9@users.noreply.github.com> Agent-Logs-Url: https://github.com/lakehq/sail/sessions/619d7a07-cf3f-4a5e-88b8-90875f162854 --- .devcontainer/devcontainer.json | 2 +- .gitattributes | 2 - .github/instructions/dev.instructions.md | 10 + .github/workflows/release.yml | 21 +- .github/workflows/rust-tests.yml | 2 +- .gitignore | 1 + Cargo.lock | 1488 ++-- Cargo.toml | 92 +- clippy.toml | 6 - compose.yml | 23 - crates/sail-cache/src/file_listing_cache.rs | 73 +- crates/sail-cache/src/file_metadata_cache.rs | 112 +- .../sail-cache/src/file_statistics_cache.rs | 103 +- crates/sail-catalog-glue/src/format.rs | 11 + crates/sail-catalog-glue/src/hive.rs | 4 +- crates/sail-catalog-glue/src/iceberg.rs | 4 +- crates/sail-catalog-glue/src/provider.rs | 91 +- crates/sail-catalog-glue/tests/table_tests.rs | 8 +- crates/sail-catalog-iceberg/src/provider.rs | 109 +- 
.../tests/rest_integration_test.rs | 56 +- crates/sail-catalog-memory/src/provider.rs | 53 +- crates/sail-catalog-onelake/src/provider.rs | 18 +- .../sail-catalog-system/src/physical_plan.rs | 8 +- crates/sail-catalog-system/src/provider.rs | 26 +- crates/sail-catalog-system/src/service.rs | 15 + .../sail-catalog-system/src/table_source.rs | 1 + crates/sail-catalog-unity/src/provider.rs | 91 +- .../tests/rest_integration_test.rs | 28 +- crates/sail-catalog/src/command.rs | 34 +- crates/sail-catalog/src/manager/database.rs | 11 + crates/sail-catalog/src/manager/function.rs | 39 +- crates/sail-catalog/src/manager/mod.rs | 46 +- crates/sail-catalog/src/manager/tracker.rs | 78 + crates/sail-catalog/src/provider/namespace.rs | 20 - crates/sail-catalog/src/provider/options.rs | 49 +- crates/sail-cli/src/python.rs | 2 + crates/sail-cli/src/python/spark_run.py | 31 + crates/sail-cli/src/python/spark_shell.py | 9 +- crates/sail-cli/src/runner.rs | 35 +- crates/sail-cli/src/spark/mcp_server.rs | 80 +- crates/sail-cli/src/spark/mod.rs | 1 + crates/sail-cli/src/spark/run.rs | 30 + crates/sail-cli/src/spark/server.rs | 109 +- crates/sail-cli/src/spark/shell.rs | 57 +- .../data/system/databases.yaml | 17 + .../src/array/record_batch.rs | 7 + .../sail-common-datafusion/src/catalog/mod.rs | 25 +- .../src/catalog/status.rs | 17 +- .../sail-common-datafusion/src/datasource.rs | 34 +- .../src/system/observable.rs | 13 +- crates/sail-common/Cargo.toml | 1 + crates/sail-common/src/config/application.rs | 215 +- .../sail-common/src/config/application.yaml | 6 + crates/sail-common/src/config/mod.rs | 1 + crates/sail-common/src/config/observer.rs | 66 + crates/sail-common/src/spec/data_type.rs | 13 +- crates/sail-common/src/spec/expression.rs | 3 + crates/sail-common/src/spec/plan.rs | 21 +- .../src/formats/binary/source.rs | 4 +- .../src/formats/console/mod.rs | 3 +- .../src/formats/console/writer.rs | 8 +- .../sail-data-source/src/formats/listing.rs | 11 +- 
.../src/formats/python/arrow_utils.rs | 4 +- .../src/formats/python/commit_exec.rs | 8 +- .../{python_datasource.rs => datasource.rs} | 82 +- .../src/formats/python/exec.rs | 8 +- .../src/formats/python/executor.rs | 41 +- .../src/formats/python/mod.rs | 8 +- .../src/formats/python/table_format.rs | 27 +- ...on_table_provider.rs => table_provider.rs} | 0 .../src/formats/python/write_exec.rs | 8 +- .../src/formats/rate/reader.rs | 8 +- .../src/formats/socket/reader.rs | 8 +- crates/sail-data-source/src/listing.rs | 7 +- .../src/options/data/delta_read.yaml | 31 + .../src/options/data/delta_write.yaml | 29 +- crates/sail-data-source/src/url.rs | 1 + crates/sail-delta-lake/Cargo.toml | 4 +- crates/sail-delta-lake/src/conversion/mod.rs | 5 +- .../sail-delta-lake/src/conversion/scalar.rs | 589 +- .../sail-delta-lake/src/datasource/actions.rs | 16 +- .../src/datasource/expressions.rs | 6 +- crates/sail-delta-lake/src/datasource/mod.rs | 62 +- .../src/datasource/provider.rs | 19 +- .../sail-delta-lake/src/datasource/pruning.rs | 492 +- crates/sail-delta-lake/src/datasource/scan.rs | 425 +- .../sail-delta-lake/src/datasource/schema.rs | 72 +- .../sail-delta-lake/src/delta_log/cleanup.rs | 455 ++ .../sail-delta-lake/src/delta_log/listing.rs | 186 + crates/sail-delta-lake/src/delta_log/mod.rs | 21 + .../sail-delta-lake/src/delta_log/replay.rs | 227 + .../sail-delta-lake/src/delta_log/segment.rs | 621 ++ .../src/delta_log/timestamps.rs | 318 + crates/sail-delta-lake/src/error.rs | 176 - .../src/kernel/arrow/engine_ext.rs | 708 -- .../sail-delta-lake/src/kernel/arrow/mod.rs | 13 - .../sail-delta-lake/src/kernel/checkpoints.rs | 1296 +++- crates/sail-delta-lake/src/kernel/error.rs | 19 - .../sail-delta-lake/src/kernel/log_segment.rs | 84 + crates/sail-delta-lake/src/kernel/mod.rs | 17 +- .../src/kernel/models/metadata.rs | 137 - .../sail-delta-lake/src/kernel/models/mod.rs | 42 - .../src/kernel/snapshot/iterators.rs | 463 -- .../src/kernel/snapshot/log_data.rs | 559 -- 
.../src/kernel/snapshot/materialize.rs | 301 + .../src/kernel/snapshot/mod.rs | 1253 ++- .../src/kernel/snapshot/stats.rs | 427 ++ .../src/kernel/snapshot/stream.rs | 200 - .../sail-delta-lake/src/kernel/statistics.rs | 101 - .../src/kernel/table_properties.rs | 72 - .../kernel/transaction/conflict_checker.rs | 66 +- .../src/kernel/transaction/mod.rs | 903 ++- .../src/kernel/transaction/protocol.rs | 54 +- crates/sail-delta-lake/src/lib.rs | 7 +- crates/sail-delta-lake/src/logical/handle.rs | 4 +- .../src/logical/table_source.rs | 15 +- .../src/operations/write/partitioning.rs | 10 +- .../src/operations/write/stats.rs | 54 +- .../src/operations/write/writer.rs | 22 +- crates/sail-delta-lake/src/options.rs | 45 +- .../src/physical/scan_planner.rs | 291 +- .../src/physical/table_scan_planner.rs | 8 +- .../src/physical_plan/action_schema.rs | 541 +- .../src/physical_plan/commit_exec.rs | 101 +- .../src/physical_plan/discovery_exec.rs | 63 +- .../src/physical_plan/expr_adapter.rs | 6 +- .../src/physical_plan/log_replay_exec.rs | 1234 ++- .../src/physical_plan/meta_adds.rs | 13 +- .../src/physical_plan/metadata_stats_exec.rs | 357 + .../sail-delta-lake/src/physical_plan/mod.rs | 4 +- .../src/physical_plan/planner/context.rs | 38 +- .../src/physical_plan/planner/log_scan.rs | 186 +- .../src/physical_plan/planner/log_segment.rs | 58 + .../planner/metadata_predicate.rs | 548 ++ .../src/physical_plan/planner/mod.rs | 29 +- .../src/physical_plan/planner/op_delete.rs | 76 +- .../src/physical_plan/planner/op_merge.rs | 56 +- .../src/physical_plan/planner/op_write.rs | 170 +- .../src/physical_plan/planner/utils.rs | 360 +- .../src/physical_plan/remove_actions_exec.rs | 54 +- .../src/physical_plan/scan_by_adds_exec.rs | 586 +- .../src/physical_plan/writer_exec.rs | 395 +- .../src/schema/arrow_conversions.rs | 261 + .../sail-delta-lake/src/schema/converter.rs | 155 +- crates/sail-delta-lake/src/schema/manager.rs | 92 +- crates/sail-delta-lake/src/schema/mapping.rs | 2 +- 
crates/sail-delta-lake/src/schema/mod.rs | 8 +- .../sail-delta-lake/src/session_extension.rs | 99 + .../sail-delta-lake/src/spec/action_schema.rs | 124 + .../src/{kernel/models => spec}/actions.rs | 115 +- crates/sail-delta-lake/src/spec/checkpoint.rs | 81 + crates/sail-delta-lake/src/spec/checksum.rs | 95 + crates/sail-delta-lake/src/spec/error.rs | 266 + crates/sail-delta-lake/src/spec/fields.rs | 43 + crates/sail-delta-lake/src/spec/log.rs | 90 + crates/sail-delta-lake/src/spec/metadata.rs | 161 + crates/sail-delta-lake/src/spec/mod.rs | 46 + .../src/{kernel => spec}/operation.rs | 123 +- crates/sail-delta-lake/src/spec/properties.rs | 529 ++ crates/sail-delta-lake/src/spec/protocol.rs | 137 + crates/sail-delta-lake/src/spec/schema.rs | 841 ++ crates/sail-delta-lake/src/spec/statistics.rs | 426 ++ crates/sail-delta-lake/src/spec/utils.rs | 113 + crates/sail-delta-lake/src/storage/config.rs | 2 +- crates/sail-delta-lake/src/storage/mod.rs | 85 +- crates/sail-delta-lake/src/table/mod.rs | 164 +- crates/sail-delta-lake/src/table/state.rs | 244 - crates/sail-delta-lake/src/table_format.rs | 167 +- crates/sail-execution/Cargo.toml | 1 + .../proto/sail/plan/physical.proto | 89 +- crates/sail-execution/src/codec.rs | 400 +- .../src/driver/actor/handler.rs | 5 + .../src/driver/job_scheduler/core.rs | 1 + .../src/driver/job_scheduler/state.rs | 1 + .../src/driver/job_scheduler/topology.rs | 2 +- .../src/driver/task_assigner/core.rs | 3 + .../src/driver/task_assigner/mod.rs | 4 + .../src/driver/task_assigner/state.rs | 3 + .../src/driver/worker_pool/core.rs | 1 + crates/sail-execution/src/id.rs | 1 + crates/sail-execution/src/job_graph/mod.rs | 1 + .../sail-execution/src/job_graph/planner.rs | 22 +- crates/sail-execution/src/job_runner.rs | 4 +- .../sail-execution/src/plan/shuffle_read.rs | 6 +- .../sail-execution/src/plan/shuffle_write.rs | 8 +- crates/sail-execution/src/plan/stage_input.rs | 6 +- crates/sail-execution/src/task/scheduling.rs | 3 + 
crates/sail-execution/src/task_runner/core.rs | 5 +- .../sail-execution/src/task_runner/monitor.rs | 4 + crates/sail-execution/src/worker/client.rs | 1 + .../src/worker_manager/kubernetes.rs | 2 +- crates/sail-function/Cargo.toml | 1 + .../sail-function/src/aggregate/max_min_by.rs | 6 +- .../src/functions_nested_utils.rs | 3 +- .../src/scalar/array/spark_array.rs | 10 + .../src/scalar/array/spark_array_min_max.rs | 20 +- .../src/scalar/array/spark_sequence.rs | 8 +- .../sail-function/src/scalar/datetime/mod.rs | 3 + .../src/scalar/datetime/spark_last_day.rs | 4 +- .../src/scalar/datetime/spark_make_time.rs | 175 + .../src/scalar/datetime/spark_next_day.rs | 2 +- .../src/scalar/datetime/spark_time_diff.rs | 219 + .../src/scalar/datetime/spark_time_trunc.rs | 209 + .../src/scalar/datetime/timestamp_now.rs | 6 +- .../src/scalar/datetime/utils.rs | 23 +- crates/sail-function/src/scalar/geo/mod.rs | 4 + .../src/scalar/geo/st_asbinary.rs | 76 + .../src/scalar/geo/st_geogfromwkb.rs | 103 + .../src/scalar/geo/st_geomfromwkb.rs | 103 + .../src/scalar/geo/wkb_reader.rs | 1137 +++ .../src/scalar/hash/spark_murmur3_hash.rs | 10 +- .../src/scalar/hash/spark_xxhash64.rs | 12 +- .../sail-function/src/scalar/json/to_json.rs | 10 + .../sail-function/src/scalar/math/random.rs | 2 +- .../src/scalar/math/spark_abs.rs | 4 +- crates/sail-function/src/scalar/mod.rs | 1 + .../src/scalar/string/soundex.rs | 39 +- .../src/scalar/string/spark_to_binary.rs | 4 +- .../sail-function/src/scalar/url/parse_url.rs | 48 +- .../src/datasource/expr_adapter.rs | 6 +- .../src/datasource/expressions.rs | 4 +- .../sail-iceberg/src/datasource/provider.rs | 76 +- crates/sail-iceberg/src/datasource/pruning.rs | 3 +- crates/sail-iceberg/src/io/mod.rs | 22 +- .../sail-iceberg/src/operations/bootstrap.rs | 1 + .../sail-iceberg/src/operations/snapshot.rs | 1 + .../src/operations/write/config.rs | 3 +- .../write/file_writer/location_generator.rs | 13 +- .../src/operations/write/partition.rs | 79 +- 
.../src/operations/write/table_writer.rs | 1 + .../src/physical_plan/action_schema.rs | 45 +- .../src/physical_plan/commit/commit_exec.rs | 10 +- .../src/physical_plan/plan_builder.rs | 70 +- .../src/physical_plan/writer_exec.rs | 116 +- .../sail-iceberg/src/table/metadata_loader.rs | 12 +- crates/sail-iceberg/src/table/mod.rs | 193 +- crates/sail-iceberg/src/table_format.rs | 24 +- crates/sail-iceberg/src/utils/conversions.rs | 14 +- crates/sail-iceberg/src/utils/mod.rs | 23 + .../src/utils/partition_transform.rs | 116 + .../src/{precondition.rs => barrier.rs} | 15 +- crates/sail-logical-plan/src/file_write.rs | 5 +- crates/sail-logical-plan/src/lib.rs | 2 +- crates/sail-logical-plan/src/merge.rs | 2 + crates/sail-object-store/src/hugging_face.rs | 48 +- crates/sail-object-store/src/layers/lazy.rs | 69 +- .../sail-object-store/src/layers/logging.rs | 177 +- .../sail-object-store/src/layers/runtime.rs | 100 +- crates/sail-physical-optimizer/src/barrier.rs | 94 + .../src/collect_left.rs | 211 + .../src/join_reorder/builder.rs | 1253 ++- .../src/join_reorder/cardinality_estimator.rs | 567 +- .../src/join_reorder/enumerator.rs | 762 +- .../src/join_reorder/graph.rs | 99 +- .../src/join_reorder/mod.rs | 176 +- .../src/join_reorder/reconstructor.rs | 871 ++- crates/sail-physical-optimizer/src/lib.rs | 8 +- crates/sail-physical-plan/Cargo.toml | 1 + crates/sail-physical-plan/src/barrier.rs | 138 + .../sail-physical-plan/src/catalog_command.rs | 104 + crates/sail-physical-plan/src/file_write.rs | 4 +- crates/sail-physical-plan/src/lib.rs | 2 + .../sail-physical-plan/src/map_partitions.rs | 8 +- .../src/merge_cardinality_check.rs | 4 +- crates/sail-physical-plan/src/monotonic_id.rs | 8 +- crates/sail-physical-plan/src/range.rs | 8 +- crates/sail-physical-plan/src/repartition.rs | 26 +- crates/sail-physical-plan/src/schema_pivot.rs | 8 +- crates/sail-physical-plan/src/show_string.rs | 8 +- .../src/streaming/collector.rs | 8 +- .../src/streaming/filter.rs | 12 +- 
.../sail-physical-plan/src/streaming/limit.rs | 12 +- .../src/streaming/source_adapter.rs | 8 +- crates/sail-plan-lakehouse/src/lib.rs | 1 - crates/sail-plan-lakehouse/src/optimizer.rs | 8 +- crates/sail-plan/Cargo.toml | 3 +- crates/sail-plan/src/catalog.rs | 30 +- crates/sail-plan/src/explain.rs | 40 +- crates/sail-plan/src/formatter.rs | 4 + crates/sail-plan/src/function/aggregate.rs | 24 +- .../sail-plan/src/function/scalar/datetime.rs | 14 + crates/sail-plan/src/function/scalar/geo.rs | 56 + crates/sail-plan/src/function/scalar/mod.rs | 2 + crates/sail-plan/src/lib.rs | 24 +- .../src/resolver/command/catalog/table.rs | 103 +- .../src/resolver/command/catalog/view.rs | 7 +- .../src/resolver/command/function.rs | 16 +- .../sail-plan/src/resolver/command/insert.rs | 66 +- .../sail-plan/src/resolver/command/merge.rs | 12 +- crates/sail-plan/src/resolver/command/mod.rs | 1 + .../sail-plan/src/resolver/command/write.rs | 639 +- .../src/resolver/command/write_stream.rs | 16 +- .../src/resolver/command/write_v1.rs | 63 +- .../src/resolver/command/write_v2.rs | 182 +- crates/sail-plan/src/resolver/data_type.rs | 30 +- .../sail-plan/src/resolver/expression/cast.rs | 32 +- .../src/resolver/expression/function.rs | 127 +- .../sail-plan/src/resolver/expression/misc.rs | 80 +- .../sail-plan/src/resolver/expression/mod.rs | 131 +- .../src/resolver/expression/window.rs | 66 +- crates/sail-plan/src/resolver/plan.rs | 1 + .../sail-plan/src/resolver/query/lateral.rs | 8 +- crates/sail-plan/src/resolver/query/misc.rs | 61 +- crates/sail-plan/src/resolver/query/mod.rs | 10 +- crates/sail-plan/src/resolver/query/read.rs | 85 +- crates/sail-plan/src/resolver/query/sample.rs | 2 +- .../src/resolver/query/time_travel.rs | 357 + crates/sail-plan/src/resolver/state.rs | 67 +- crates/sail-plan/src/streaming/rewriter.rs | 4 + crates/sail-python-udf/src/config.rs | 2 +- crates/sail-session/src/catalog.rs | 2 +- crates/sail-session/src/optimizer.rs | 6 +- crates/sail-session/src/planner.rs 
| 19 + .../src/session_factory/server.rs | 35 +- .../src/session_factory/worker.rs | 6 +- .../src/session_manager/actor/handler.rs | 19 +- .../src/session_manager/options.rs | 9 + crates/sail-spark-connect/src/error.rs | 19 +- .../sail-spark-connect/src/proto/data_type.rs | 27 +- crates/sail-spark-connect/src/proto/plan.rs | 17 +- .../sail-spark-connect/src/session_manager.rs | 5 +- .../tests/gold_data/data_type.json | 6 +- .../tests/gold_data/function/agg.json | 16 +- .../tests/gold_data/function/datetime.json | 18 +- .../tests/gold_data/function/st.json | 12 +- .../tests/gold_data/function/string.json | 4 +- .../gold_data/plan/ddl_create_table.json | 492 +- .../gold_data/plan/ddl_replace_table.json | 492 +- .../tests/gold_data/plan/error_misc.json | 2 +- crates/sail-sql-analyzer/src/data_type.rs | 9 +- crates/sail-sql-analyzer/src/expression.rs | 7 +- .../sail-sql-analyzer/src/literal/interval.rs | 3 +- crates/sail-sql-analyzer/src/query.rs | 43 +- crates/sail-sql-analyzer/src/statement.rs | 58 +- crates/sail-sql-parser/src/ast/expression.rs | 16 +- crates/sail-sql-parser/src/ast/query.rs | 48 +- crates/sail-sql-parser/src/ast/statement.rs | 49 +- .../tests/gold_data/syntax.json | 264 +- .../src/execution/metrics/join.rs | 5 + .../src/execution/metrics/testing.rs | 8 +- .../src/execution/physical_plan.rs | 46 +- crates/sail-telemetry/src/lib.rs | 1 + crates/sail-telemetry/src/metrics/mod.rs | 10 + crates/sail-telemetry/src/telemetry.rs | 13 +- docker/dev/Dockerfile | 2 +- docker/release/Dockerfile | 2 +- .../dataframe/data-types/compatibility.md | 4 +- docs/guide/formats/index.md | 29 - .../guide/integrations/_code/pyspark-skill.md | 43 + docs/guide/integrations/agent-skills.md | 19 + docs/guide/integrations/jdbc.md | 139 - .../delta.md => sources/delta/examples.md} | 23 +- docs/guide/sources/delta/features.md | 56 + docs/guide/sources/delta/index.data.ts | 5 + docs/guide/sources/delta/index.md | 18 + .../iceberg/examples.md} | 21 +- 
docs/guide/sources/iceberg/features.md | 62 + docs/guide/sources/iceberg/index.data.ts | 5 + docs/guide/sources/iceberg/index.md | 18 + docs/guide/sources/index.md | 30 + docs/guide/sources/jdbc/index.md | 161 + docs/guide/sources/python/index.md | 29 + docs/guide/sql/data-types/compatibility.md | 2 +- docs/guide/sql/features.md | 6 +- docs/reference/changelog/index.md | 75 + package.json | 22 +- pnpm-lock.yaml | 817 +- pyproject.toml | 9 +- python/pysail/__init__.py | 2 +- .../functions/scalar/datetime.json | 21 +- .../examples/spark/compatibility_check.py | 35 +- .../datasource}/__init__.py | 0 .../datasource/jdbc.py} | 2 +- .../jdbc => spark/utils}/__init__.py | 0 .../utils/_function_scanner.py} | 14 +- .../utils/_function_support.py} | 16 +- python/pysail/testing/__init__.py | 1 + .../snapshot}/__init__.py | 0 .../snapshot/yaml.py} | 57 +- python/pysail/testing/spark/__init__.py | 1 + .../spark/steps/__init__.py | 0 .../spark/steps/delta_log.py | 162 +- .../spark/steps/file_tree.py | 91 +- .../testing/spark/steps/iceberg_metadata.py | 543 ++ .../{tests => testing}/spark/steps/plan.py | 59 +- .../{tests => testing}/spark/steps/sql.py | 44 +- .../jdbc => testing/spark/utils}/__init__.py | 0 python/pysail/testing/spark/utils/common.py | 11 + python/pysail/testing/spark/utils/files.py | 82 + python/pysail/testing/spark/utils/sql.py | 161 + python/pysail/tests/conftest.py | 7 +- .../pysail/tests/datasources/jdbc/README.md | 32 - .../datasources/jdbc/manual_test_jdbc.py | 921 --- .../spark/__snapshots__/test_tpcds.plan.yaml | 6733 +++++++++++++++++ .../__snapshots__/test_tpcds.result.yaml | 5034 ++++++++++++ .../spark/__snapshots__/test_tpch.plan.yaml | 677 ++ .../spark/__snapshots__/test_tpch.result.yaml | 282 + .../__snapshots__/features/explain.yaml | 435 ++ ...lt_explain_returns_full_physical_plan.plan | 8 - ...ze_executes_and_returns_physical_plan.plan | 8 - ...hows_codegen_notice_and_physical_plan.plan | 293 - ...n_cost_shows_logical_plans_with_stats.plan | 23 
- ...ed_returns_logical_and_physical_plans.plan | 23 - ...matted_includes_statistics_and_schema.plan | 8 - ...erbose_returns_detailed_physical_plan.plan | 28 - .../__snapshots__/features/system.yaml | 100 + ...nd_limit_pushdown_for_system_tables.1.plan | 6 - ...nd_limit_pushdown_for_system_tables.2.plan | 6 - ...nd_limit_pushdown_for_system_tables.3.plan | 6 - ...nd_limit_pushdown_for_system_tables.4.plan | 6 - ...nd_limit_pushdown_for_system_tables.5.plan | 6 - ..._and_limit_pushdown_for_system_tables.plan | 6 - ..._no_filter_pushdown_for_system_tables.plan | 7 - ...ial_filter_pushdown_for_system_tables.plan | 7 - .../test_projection_for_system_tables.plan | 5 - .../spark/catalog/features/system.feature | 36 + python/pysail/tests/spark/conftest.py | 44 +- .../jdbc => spark/datasource}/init.sql | 29 +- .../tests/spark/datasource/test_jdbc.py | 627 ++ .../tests/spark/datasource/test_rate.py | 2 +- .../tests/spark/datasource/test_socket.py | 2 +- .../__snapshots__/features/checkpoint.yaml | 40 + .../__snapshots__/features/checksum.yaml | 68 + .../features/column_mapping.yaml | 191 + .../delta/__snapshots__/features/delete.yaml | 569 ++ .../features/log_scan_parallel.yaml | 74 + .../delta/__snapshots__/features/merge.yaml | 2162 ++++++ .../__snapshots__/features/overwrite.yaml | 113 + .../__snapshots__/features/read_path.yaml | 30 + .../__snapshots__/test_delta_features.yaml | 394 - ...nd_delta_rewrite_artifacts_for_delete.plan | 360 - ...lan_steps_and_merge_rewrite_artifacts.plan | 1746 ----- ...k_when_source_is_grouped_by_join_keys.plan | 124 - ...ppend_plan_shape_for_insertonly_merge.plan | 45 - ...s_json_delta_log_commits_when_present.plan | 29 - ...e_json_delta_log_commits_when_present.plan | 29 - ...l_overwrite_replace_where_category__a.plan | 26 - ...ite_replace_where_id__cast0_as_bigint.plan | 26 - ...log_meta_scan_under_merge_file_lookup.plan | 71 - ..._filter_pushdown_for_mixed_predicates.plan | 25 - ...allel_file_groups_then_merge_succeeds.plan | 71 - 
.../spark/delta/features/checkpoint.feature | 474 ++ .../features/checkpoint_properties.feature | 68 + .../spark/delta/features/checksum.feature | 166 + .../delta/features/checksum_read.feature | 209 + .../delta/features/column_mapping.feature | 37 + .../tests/spark/delta/features/delete.feature | 175 + .../spark/delta/features/read_path.feature | 172 + .../spark/delta/features/time_travel.feature | 120 + .../tests/spark/delta/test_delta_advanced.py | 170 +- .../spark/delta/test_delta_checkpoint.py | 84 +- .../spark/delta/test_delta_column_mapping.py | 28 +- .../spark/delta/test_delta_concurrent.py | 6 +- .../pysail/tests/spark/delta/test_delta_io.py | 9 +- .../delta/test_delta_merge_into_dataframe.py | 3 +- .../spark/delta/test_delta_metadata_only.py | 70 + .../delta/test_delta_partition_pruning.py | 33 + .../function/features/array_min_max.feature | 391 + .../spark/function/features/col_regex.feature | 14 - .../features/count_distinct_star.feature | 96 + .../spark/function/features/dataframe.feature | 15 - .../spark/function/features/hash.feature | 41 + .../features/identifier_clause.feature | 83 + .../features/interval_day_to_second.feature | 100 + .../spark/function/features/parse_url.feature | 216 + .../function/features/set_operations.feature | 213 + .../spark/function/features/soundex.feature | 171 + .../function/features/time_functions.feature | 264 + .../spark/function/features/to_json.feature | 162 + .../features/window_range_interval.feature | 101 + .../spark/function/features/xxhash64.feature | 41 + .../glue/test_glue_partition_transforms.py | 2 +- .../__snapshots__/features/explain.yaml | 75 + .../features/partition_transforms.yaml | 410 + .../spark/iceberg/features/explain.feature | 205 + .../tests/spark/iceberg/features/io.feature | 120 + .../features/partition_transforms.feature | 308 + .../iceberg/features/partitioning.feature | 124 + .../iceberg/features/time_travel.feature | 145 + .../iceberg/test_iceberg_cross_compat.py | 2 +- 
.../spark/iceberg/test_iceberg_features.py | 5 + .../tests/spark/iceberg/test_iceberg_io.py | 2 +- .../test_iceberg_partition_transforms.py | 8 +- .../iceberg/test_iceberg_partitioned_reads.py | 2 +- .../test_iceberg_partitioned_writes.py | 5 +- .../spark/iceberg/test_iceberg_projection.py | 2 +- .../spark/iceberg/test_iceberg_pruning.py | 2 +- .../tests/spark/iceberg/test_iceberg_reads.py | 2 +- .../spark/iceberg/test_iceberg_time_travel.py | 2 +- .../tests/spark/iceberg/test_iceberg_write.py | 2 +- .../tests/spark/optimizer/test_features.py | 2 +- python/pysail/tests/spark/steps/dataframe.py | 63 - .../spark/streaming/test_streaming_basic.py | 2 +- python/pysail/tests/spark/test_basic.py | 7 +- python/pysail/tests/spark/test_datasources.py | 29 + python/pysail/tests/spark/test_datetime.py | 2 +- python/pysail/tests/spark/test_dml.py | 2 +- python/pysail/tests/spark/test_execution.py | 2 +- .../tests/spark/test_identifier_clause.py | 53 + python/pysail/tests/spark/test_math.py | 3 +- python/pysail/tests/spark/test_parquet.py | 18 +- .../tests/spark/test_python_datasource.py | 221 + .../spark/test_python_datasource_read.txt | 32 + .../test_python_datasource_read_arrow.txt | 40 + python/pysail/tests/spark/test_tpcds.py | 96 +- python/pysail/tests/spark/test_tpch.py | 25 +- .../tests/spark/test_write_directory.py | 3 +- python/pysail/tests/spark/test_write_table.py | 23 +- python/pysail/tests/spark/utils.py | 199 - python/pysail/tests/spark/utils/__init__.py | 1 + .../spark/utils/test_compatibility_data.py | 28 + .../utils/test_function_scanner.py} | 18 +- .../utils/test_function_support.py} | 10 +- .../__snapshots__/features/explain_write.yaml | 19 + .../write_table/features/create_table.feature | 142 + .../features/ctas_error_handling.feature | 18 - .../features/explain_write.feature | 38 + python/pysail/tests/utils/__init__.py | 0 python/pysail/utils/__init__.py | 0 python/pysail/utils/logging_config.py | 19 - .../pysail/utils/sort_compatibility_jsons.py | 91 - 
521 files changed, 51963 insertions(+), 15555 deletions(-) create mode 100644 crates/sail-catalog/src/manager/tracker.rs create mode 100644 crates/sail-cli/src/python/spark_run.py create mode 100644 crates/sail-cli/src/spark/run.rs create mode 100644 crates/sail-common/src/config/observer.rs rename crates/sail-data-source/src/formats/python/{python_datasource.rs => datasource.rs} (82%) rename crates/sail-data-source/src/formats/python/{python_table_provider.rs => table_provider.rs} (100%) create mode 100644 crates/sail-delta-lake/src/delta_log/cleanup.rs create mode 100644 crates/sail-delta-lake/src/delta_log/listing.rs create mode 100644 crates/sail-delta-lake/src/delta_log/mod.rs create mode 100644 crates/sail-delta-lake/src/delta_log/replay.rs create mode 100644 crates/sail-delta-lake/src/delta_log/segment.rs create mode 100644 crates/sail-delta-lake/src/delta_log/timestamps.rs delete mode 100644 crates/sail-delta-lake/src/error.rs delete mode 100644 crates/sail-delta-lake/src/kernel/arrow/engine_ext.rs delete mode 100644 crates/sail-delta-lake/src/kernel/arrow/mod.rs delete mode 100644 crates/sail-delta-lake/src/kernel/error.rs create mode 100644 crates/sail-delta-lake/src/kernel/log_segment.rs delete mode 100644 crates/sail-delta-lake/src/kernel/models/metadata.rs delete mode 100644 crates/sail-delta-lake/src/kernel/models/mod.rs delete mode 100644 crates/sail-delta-lake/src/kernel/snapshot/iterators.rs delete mode 100644 crates/sail-delta-lake/src/kernel/snapshot/log_data.rs create mode 100644 crates/sail-delta-lake/src/kernel/snapshot/materialize.rs create mode 100644 crates/sail-delta-lake/src/kernel/snapshot/stats.rs delete mode 100644 crates/sail-delta-lake/src/kernel/snapshot/stream.rs delete mode 100644 crates/sail-delta-lake/src/kernel/statistics.rs delete mode 100644 crates/sail-delta-lake/src/kernel/table_properties.rs create mode 100644 crates/sail-delta-lake/src/physical_plan/metadata_stats_exec.rs create mode 100644 
crates/sail-delta-lake/src/physical_plan/planner/log_segment.rs create mode 100644 crates/sail-delta-lake/src/physical_plan/planner/metadata_predicate.rs create mode 100644 crates/sail-delta-lake/src/schema/arrow_conversions.rs create mode 100644 crates/sail-delta-lake/src/session_extension.rs create mode 100644 crates/sail-delta-lake/src/spec/action_schema.rs rename crates/sail-delta-lake/src/{kernel/models => spec}/actions.rs (79%) create mode 100644 crates/sail-delta-lake/src/spec/checkpoint.rs create mode 100644 crates/sail-delta-lake/src/spec/checksum.rs create mode 100644 crates/sail-delta-lake/src/spec/error.rs create mode 100644 crates/sail-delta-lake/src/spec/fields.rs create mode 100644 crates/sail-delta-lake/src/spec/log.rs create mode 100644 crates/sail-delta-lake/src/spec/metadata.rs create mode 100644 crates/sail-delta-lake/src/spec/mod.rs rename crates/sail-delta-lake/src/{kernel => spec}/operation.rs (60%) create mode 100644 crates/sail-delta-lake/src/spec/properties.rs create mode 100644 crates/sail-delta-lake/src/spec/protocol.rs create mode 100644 crates/sail-delta-lake/src/spec/schema.rs create mode 100644 crates/sail-delta-lake/src/spec/statistics.rs create mode 100644 crates/sail-delta-lake/src/spec/utils.rs delete mode 100644 crates/sail-delta-lake/src/table/state.rs create mode 100644 crates/sail-function/src/scalar/datetime/spark_make_time.rs create mode 100644 crates/sail-function/src/scalar/datetime/spark_time_diff.rs create mode 100644 crates/sail-function/src/scalar/datetime/spark_time_trunc.rs create mode 100644 crates/sail-function/src/scalar/geo/mod.rs create mode 100644 crates/sail-function/src/scalar/geo/st_asbinary.rs create mode 100644 crates/sail-function/src/scalar/geo/st_geogfromwkb.rs create mode 100644 crates/sail-function/src/scalar/geo/st_geomfromwkb.rs create mode 100644 crates/sail-function/src/scalar/geo/wkb_reader.rs create mode 100644 crates/sail-iceberg/src/utils/partition_transform.rs rename 
crates/sail-logical-plan/src/{precondition.rs => barrier.rs} (80%) create mode 100644 crates/sail-physical-optimizer/src/barrier.rs create mode 100644 crates/sail-physical-optimizer/src/collect_left.rs create mode 100644 crates/sail-physical-plan/src/barrier.rs create mode 100644 crates/sail-physical-plan/src/catalog_command.rs create mode 100644 crates/sail-plan/src/function/scalar/geo.rs create mode 100644 crates/sail-plan/src/resolver/query/time_travel.rs delete mode 100644 docs/guide/formats/index.md create mode 100644 docs/guide/integrations/_code/pyspark-skill.md create mode 100644 docs/guide/integrations/agent-skills.md delete mode 100644 docs/guide/integrations/jdbc.md rename docs/guide/{formats/delta.md => sources/delta/examples.md} (89%) create mode 100644 docs/guide/sources/delta/features.md create mode 100644 docs/guide/sources/delta/index.data.ts create mode 100644 docs/guide/sources/delta/index.md rename docs/guide/{formats/iceberg.md => sources/iceberg/examples.md} (86%) create mode 100644 docs/guide/sources/iceberg/features.md create mode 100644 docs/guide/sources/iceberg/index.data.ts create mode 100644 docs/guide/sources/iceberg/index.md create mode 100644 docs/guide/sources/index.md create mode 100644 docs/guide/sources/jdbc/index.md create mode 100644 docs/guide/sources/python/index.md rename python/pysail/{datasources => spark/datasource}/__init__.py (100%) rename python/pysail/{datasources/jdbc/datasource.py => spark/datasource/jdbc.py} (99%) rename python/pysail/{datasources/jdbc => spark/utils}/__init__.py (100%) rename python/pysail/{utils/pyspark_function_scanner.py => spark/utils/_function_scanner.py} (93%) rename python/pysail/{utils/sail_function_support.py => spark/utils/_function_support.py} (81%) create mode 100644 python/pysail/testing/__init__.py rename python/pysail/{tests/datasources => testing/snapshot}/__init__.py (100%) rename python/pysail/{tests/snapshot_yaml.py => testing/snapshot/yaml.py} (81%) create mode 100644 
python/pysail/testing/spark/__init__.py rename python/pysail/{tests => testing}/spark/steps/__init__.py (100%) rename python/pysail/{tests => testing}/spark/steps/delta_log.py (66%) rename python/pysail/{tests => testing}/spark/steps/file_tree.py (67%) create mode 100644 python/pysail/testing/spark/steps/iceberg_metadata.py rename python/pysail/{tests => testing}/spark/steps/plan.py (69%) rename python/pysail/{tests => testing}/spark/steps/sql.py (70%) rename python/pysail/{tests/datasources/jdbc => testing/spark/utils}/__init__.py (100%) create mode 100644 python/pysail/testing/spark/utils/common.py create mode 100644 python/pysail/testing/spark/utils/files.py create mode 100644 python/pysail/testing/spark/utils/sql.py delete mode 100644 python/pysail/tests/datasources/jdbc/README.md delete mode 100644 python/pysail/tests/datasources/jdbc/manual_test_jdbc.py create mode 100644 python/pysail/tests/spark/__snapshots__/test_tpcds.plan.yaml create mode 100644 python/pysail/tests/spark/__snapshots__/test_tpcds.result.yaml create mode 100644 python/pysail/tests/spark/__snapshots__/test_tpch.plan.yaml create mode 100644 python/pysail/tests/spark/__snapshots__/test_tpch.result.yaml create mode 100644 python/pysail/tests/spark/analyst/__snapshots__/features/explain.yaml delete mode 100644 python/pysail/tests/spark/analyst/__snapshots__/test_features/test_default_explain_returns_full_physical_plan.plan delete mode 100644 python/pysail/tests/spark/analyst/__snapshots__/test_features/test_explain_analyze_executes_and_returns_physical_plan.plan delete mode 100644 python/pysail/tests/spark/analyst/__snapshots__/test_features/test_explain_codegen_shows_codegen_notice_and_physical_plan.plan delete mode 100644 python/pysail/tests/spark/analyst/__snapshots__/test_features/test_explain_cost_shows_logical_plans_with_stats.plan delete mode 100644 python/pysail/tests/spark/analyst/__snapshots__/test_features/test_explain_extended_returns_logical_and_physical_plans.plan delete mode 
100644 python/pysail/tests/spark/analyst/__snapshots__/test_features/test_explain_formatted_includes_statistics_and_schema.plan delete mode 100644 python/pysail/tests/spark/analyst/__snapshots__/test_features/test_explain_verbose_returns_detailed_physical_plan.plan create mode 100644 python/pysail/tests/spark/catalog/__snapshots__/features/system.yaml delete mode 100644 python/pysail/tests/spark/catalog/__snapshots__/test_features/test_filter_and_limit_pushdown_for_system_tables.1.plan delete mode 100644 python/pysail/tests/spark/catalog/__snapshots__/test_features/test_filter_and_limit_pushdown_for_system_tables.2.plan delete mode 100644 python/pysail/tests/spark/catalog/__snapshots__/test_features/test_filter_and_limit_pushdown_for_system_tables.3.plan delete mode 100644 python/pysail/tests/spark/catalog/__snapshots__/test_features/test_filter_and_limit_pushdown_for_system_tables.4.plan delete mode 100644 python/pysail/tests/spark/catalog/__snapshots__/test_features/test_filter_and_limit_pushdown_for_system_tables.5.plan delete mode 100644 python/pysail/tests/spark/catalog/__snapshots__/test_features/test_filter_and_limit_pushdown_for_system_tables.plan delete mode 100644 python/pysail/tests/spark/catalog/__snapshots__/test_features/test_no_filter_pushdown_for_system_tables.plan delete mode 100644 python/pysail/tests/spark/catalog/__snapshots__/test_features/test_partial_filter_pushdown_for_system_tables.plan delete mode 100644 python/pysail/tests/spark/catalog/__snapshots__/test_features/test_projection_for_system_tables.plan rename python/pysail/tests/{datasources/jdbc => spark/datasource}/init.sql (81%) create mode 100644 python/pysail/tests/spark/datasource/test_jdbc.py create mode 100644 python/pysail/tests/spark/delta/__snapshots__/features/checkpoint.yaml create mode 100644 python/pysail/tests/spark/delta/__snapshots__/features/checksum.yaml create mode 100644 python/pysail/tests/spark/delta/__snapshots__/features/column_mapping.yaml create mode 100644 
python/pysail/tests/spark/delta/__snapshots__/features/delete.yaml create mode 100644 python/pysail/tests/spark/delta/__snapshots__/features/log_scan_parallel.yaml create mode 100644 python/pysail/tests/spark/delta/__snapshots__/features/merge.yaml create mode 100644 python/pysail/tests/spark/delta/__snapshots__/features/overwrite.yaml create mode 100644 python/pysail/tests/spark/delta/__snapshots__/features/read_path.yaml delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features.yaml delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_explain_codegen_includes_plan_steps_and_delta_rewrite_artifacts_for_delete.plan delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_explain_codegen_includes_plan_steps_and_merge_rewrite_artifacts.plan delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_explain_extended_does_not_include_mergecardinalitycheck_when_source_is_grouped_by_join_keys.plan delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_explain_extended_shows_fastappend_plan_shape_for_insertonly_merge.plan delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_explain_includes_json_delta_log_commits_when_present.plan delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_explain_includes_multiple_json_delta_log_commits_when_present.plan delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_explain_plan_for_conditional_overwrite_replace_where_category__a.plan delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_explain_plan_for_full_conditional_overwrite_replace_where_id__cast0_as_bigint.plan delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_explain_shows_delta_log_meta_scan_under_merge_file_lookup.plan delete mode 
100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_explain_shows_partition_filter_pushdown_for_mixed_predicates.plan delete mode 100644 python/pysail/tests/spark/delta/__snapshots__/test_delta_features/test_merge_explain_includes_log_scan_union_and_parallel_file_groups_then_merge_succeeds.plan create mode 100644 python/pysail/tests/spark/delta/features/checkpoint.feature create mode 100644 python/pysail/tests/spark/delta/features/checkpoint_properties.feature create mode 100644 python/pysail/tests/spark/delta/features/checksum.feature create mode 100644 python/pysail/tests/spark/delta/features/checksum_read.feature create mode 100644 python/pysail/tests/spark/delta/features/read_path.feature create mode 100644 python/pysail/tests/spark/delta/features/time_travel.feature create mode 100644 python/pysail/tests/spark/delta/test_delta_metadata_only.py create mode 100644 python/pysail/tests/spark/function/features/array_min_max.feature delete mode 100644 python/pysail/tests/spark/function/features/col_regex.feature create mode 100644 python/pysail/tests/spark/function/features/count_distinct_star.feature delete mode 100644 python/pysail/tests/spark/function/features/dataframe.feature create mode 100644 python/pysail/tests/spark/function/features/hash.feature create mode 100644 python/pysail/tests/spark/function/features/identifier_clause.feature create mode 100644 python/pysail/tests/spark/function/features/interval_day_to_second.feature create mode 100644 python/pysail/tests/spark/function/features/set_operations.feature create mode 100644 python/pysail/tests/spark/function/features/soundex.feature create mode 100644 python/pysail/tests/spark/function/features/time_functions.feature create mode 100644 python/pysail/tests/spark/function/features/window_range_interval.feature create mode 100644 python/pysail/tests/spark/function/features/xxhash64.feature create mode 100644 python/pysail/tests/spark/iceberg/__snapshots__/features/explain.yaml 
create mode 100644 python/pysail/tests/spark/iceberg/__snapshots__/features/partition_transforms.yaml create mode 100644 python/pysail/tests/spark/iceberg/features/explain.feature create mode 100644 python/pysail/tests/spark/iceberg/features/io.feature create mode 100644 python/pysail/tests/spark/iceberg/features/partition_transforms.feature create mode 100644 python/pysail/tests/spark/iceberg/features/partitioning.feature create mode 100644 python/pysail/tests/spark/iceberg/features/time_travel.feature create mode 100644 python/pysail/tests/spark/iceberg/test_iceberg_features.py delete mode 100644 python/pysail/tests/spark/steps/dataframe.py create mode 100644 python/pysail/tests/spark/test_identifier_clause.py create mode 100644 python/pysail/tests/spark/test_python_datasource_read.txt create mode 100644 python/pysail/tests/spark/test_python_datasource_read_arrow.txt delete mode 100644 python/pysail/tests/spark/utils.py create mode 100644 python/pysail/tests/spark/utils/__init__.py create mode 100644 python/pysail/tests/spark/utils/test_compatibility_data.py rename python/pysail/tests/{utils/test_pyspark_function_scanner.py => spark/utils/test_function_scanner.py} (92%) rename python/pysail/tests/{utils/test_sail_function_support.py => spark/utils/test_function_support.py} (82%) create mode 100644 python/pysail/tests/spark/write_table/__snapshots__/features/explain_write.yaml create mode 100644 python/pysail/tests/spark/write_table/features/create_table.feature create mode 100644 python/pysail/tests/spark/write_table/features/explain_write.feature delete mode 100644 python/pysail/tests/utils/__init__.py delete mode 100644 python/pysail/utils/__init__.py delete mode 100644 python/pysail/utils/logging_config.py delete mode 100644 python/pysail/utils/sort_compatibility_jsons.py diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 5200e000e6..2d8ff2a924 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json 
@@ -12,7 +12,7 @@ "distribution": "corretto" }, "ghcr.io/devcontainers/features/rust:1": { - "version": "1.88.0", + "version": "1.91.0", "profile": "default" } }, diff --git a/.gitattributes b/.gitattributes index 423d2db6b0..fce88b8ebe 100644 --- a/.gitattributes +++ b/.gitattributes @@ -3,5 +3,3 @@ *.jpg filter=lfs diff=lfs merge=lfs -text *.jpeg filter=lfs diff=lfs merge=lfs -text *.gif filter=lfs diff=lfs merge=lfs -text - -*.plan text eol=lf diff --git a/.github/instructions/dev.instructions.md b/.github/instructions/dev.instructions.md index 682d5dc590..0a18950ccb 100644 --- a/.github/instructions/dev.instructions.md +++ b/.github/instructions/dev.instructions.md @@ -44,6 +44,16 @@ Before committing changes, make sure to format and lint the files, and ensure th You can skip API documentation generation if you are only working on files inside the `docs/` directory. +## Test Style + +- Prefer **BDD-style SQL integration tests** when behavior can be expressed in SQL. +- Organize scenarios with `Given / When / Then`; assert user-visible results (result/schema/error), not internals. +- Use unit tests mainly for logic that is hard to cover via SQL scenarios. +- Reference locations: + - `python/pysail/tests/spark/**/features/*.feature` (BDD scenario definitions) + - `python/pysail/tests/spark/**/test*_features.py` (scenario loaders / test entrypoints) + - `python/pysail/tests/spark/steps/*` (shared Given/When/Then step implementations) + ## Contributing Please make sure the pull request title follows the Conventional Commits specification: `[()]: `. diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index dbc52aeac8..2b775ac372 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -9,6 +9,8 @@ on: workflow_dispatch: inputs: publish: + # The manual test must be done from the `main` branch + # due to the environment protection rules. 
description: Publish the package to Test PyPI type: boolean required: true @@ -365,18 +367,16 @@ jobs: release-pypi: name: Release to PyPI if: ${{ github.event_name == 'push' || (github.event_name == 'workflow_dispatch' && github.event.inputs.publish == 'true') }} - runs-on: ubuntu-slim - environment: release + # The PyPI publishing step runs inside a Docker container, + # so we cannot use the `ubuntu-slim` runner. + runs-on: ubuntu-latest + environment: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && 'release' || 'release/test' }} permissions: id-token: write needs: - setup - review steps: - - uses: actions/setup-python@v5 - with: - python-version: "3.11" - - uses: actions/download-artifact@v4 with: pattern: "package-*" @@ -384,12 +384,11 @@ jobs: merge-multiple: true - name: Publish to PyPI - uses: PyO3/maturin-action@v1 - env: - MATURIN_REPOSITORY: "${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && 'pypi' || 'testpypi' }}" + uses: pypa/gh-action-pypi-publish@release/v1 with: - command: upload - args: --non-interactive --skip-existing dist/* + repository-url: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && 'https://upload.pypi.org/legacy/' || 'https://test.pypi.org/legacy/' }} + packages-dir: dist + skip-existing: true release-github: name: Release to GitHub diff --git a/.github/workflows/rust-tests.yml b/.github/workflows/rust-tests.yml index 73cdd33cee..15f7dcfde5 100644 --- a/.github/workflows/rust-tests.yml +++ b/.github/workflows/rust-tests.yml @@ -40,7 +40,7 @@ jobs: mkdir -p target/llvm-profiles/rust-slow - name: Run Cargo Test - run: cargo nextest run + run: cargo nextest run --no-fail-fast # Generate coverage report after unit tests (runs in PRs and pushes) - name: Generate Rust unit coverage report diff --git a/.gitignore b/.gitignore index 427e5f6b80..2c19213bff 100644 --- a/.gitignore +++ b/.gitignore @@ -103,3 +103,4 @@ docs/.vitepress/.temp .worktrees/ CLAUDE.md 
+AGENTS.md diff --git a/Cargo.lock b/Cargo.lock index c4e1bed80e..5141ad1492 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -26,7 +26,7 @@ checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" dependencies = [ "cfg-if", "cipher", - "cpufeatures", + "cpufeatures 0.2.17", ] [[package]] @@ -114,7 +114,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" dependencies = [ "anstyle", - "anstyle-parse", + "anstyle-parse 0.2.7", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstream" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" +dependencies = [ + "anstyle", + "anstyle-parse 1.0.0", "anstyle-query", "anstyle-wincon", "colorchoice", @@ -124,9 +139,9 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.13" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" +checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" [[package]] name = "anstyle-parse" @@ -137,6 +152,15 @@ dependencies = [ "utf8parse", ] +[[package]] +name = "anstyle-parse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52ce7f38b242319f7cabaa6813055467063ecdc9d355bbb4ce0c68908cd8130e" +dependencies = [ + "utf8parse", +] + [[package]] name = "anstyle-query" version = "1.1.5" @@ -159,9 +183,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.100" +version = "1.0.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" [[package]] 
name = "apache-avro" @@ -194,9 +218,9 @@ dependencies = [ [[package]] name = "ar_archive_writer" -version = "0.2.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0c269894b6fe5e9d7ada0cf69b5bf847ff35bc25fc271f08e1d080fce80339a" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" dependencies = [ "object", ] @@ -215,9 +239,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a2b10dcb159faf30d3f81f6d56c1211a5bea2ca424eabe477648a44b993320e" +checksum = "d441fdda254b65f3e9025910eb2c2066b6295d9c8ed409522b8d2ace1ff8574c" dependencies = [ "arrow-arith", "arrow-array", @@ -236,9 +260,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "288015089e7931843c80ed4032c5274f02b37bcb720c4a42096d50b390e70372" +checksum = "ced5406f8b720cc0bc3aa9cf5758f93e8593cda5490677aa194e4b4b383f9a59" dependencies = [ "arrow-array", "arrow-buffer", @@ -250,9 +274,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" +checksum = "772bd34cacdda8baec9418d80d23d0fb4d50ef0735685bd45158b83dfeb6e62d" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -269,9 +293,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" +checksum = "898f4cf1e9598fdb77f356fdf2134feedfd0ee8d5a4e0a5f573e7d0aec16baa4" dependencies = [ "bytes", "half", @@ -281,9 +305,9 @@ dependencies = [ 
[[package]] name = "arrow-cast" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" +checksum = "b0127816c96533d20fc938729f48c52d3e48f99717e7a0b5ade77d742510736d" dependencies = [ "arrow-array", "arrow-buffer", @@ -303,9 +327,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e4100b729fe656f2e4fb32bc5884f14acf9118d4ad532b7b33c1132e4dce896" +checksum = "ca025bd0f38eeecb57c2153c0123b960494138e6a957bbda10da2b25415209fe" dependencies = [ "arrow-array", "arrow-cast", @@ -318,9 +342,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" +checksum = "42d10beeab2b1c3bb0b53a00f7c944a178b622173a5c7bcabc3cb45d90238df4" dependencies = [ "arrow-buffer", "arrow-schema", @@ -331,9 +355,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63654f21676be802d446c6c4bc54f6a47e18d55f9ae6f7195a6f6faf2ecdbeb" +checksum = "302b2e036335f3f04d65dad3f74ff1f2aae6dc671d6aa04dc6b61193761e16fb" dependencies = [ "arrow-array", "arrow-buffer", @@ -351,9 +375,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" +checksum = "609a441080e338147a84e8e6904b6da482cefb957c5cdc0f3398872f69a315d0" dependencies = [ "arrow-array", "arrow-buffer", @@ -367,9 +391,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "57.2.0" +version = "58.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "a36b2332559d3310ebe3e173f75b29989b4412df4029a26a30cc3f7da0869297" +checksum = "6ead0914e4861a531be48fe05858265cf854a4880b9ed12618b1d08cba9bebc8" dependencies = [ "arrow-array", "arrow-buffer", @@ -391,9 +415,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" +checksum = "763a7ba279b20b52dad300e68cfc37c17efa65e68623169076855b3a9e941ca5" dependencies = [ "arrow-array", "arrow-buffer", @@ -404,9 +428,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f45c7989cb70214b2f362eaa10266d15e1a433692f2ea1514018be3aace679f4" +checksum = "e63351dc11981a316c828a6032a5021345bba882f68bc4a36c36825a50725089" dependencies = [ "arrow-array", "arrow-data", @@ -416,9 +440,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b07f52788744cc71c4628567ad834cadbaeb9f09026ff1d7a4120f69edf7abd3" +checksum = "e14fe367802f16d7668163ff647830258e6e0aeea9a4d79aaedf273af3bdcd3e" dependencies = [ "arrow-array", "arrow-buffer", @@ -429,9 +453,9 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" +checksum = "c30a1365d7a7dc50cc847e54154e6af49e4c4b0fddc9f607b687f29212082743" dependencies = [ "bitflags", "serde", @@ -441,9 +465,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" +checksum = "78694888660a9e8ac949853db393af2a8b8fc82c19ce333132dfa2e72cc1a7fe" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -455,9 +479,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "57.2.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8ad6a81add9d3ea30bf8374ee8329992c7fd246ffd8b7e2f48a3cea5aa0cc9a" +checksum = "61e04a01f8bb73ce54437514c5fd3ee2aa3e8abe4c777ee5cc55853b1652f79e" dependencies = [ "arrow-array", "arrow-buffer", @@ -498,9 +522,9 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.37" +version = "0.4.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d10e4f991a553474232bc0a31799f6d24b034a84c0971d80d2e2f78b2e576e40" +checksum = "d0f9ee0f6e02ffd7ad5816e9464499fba7b3effd01123b515c41d1697c43dad1" dependencies = [ "compression-codecs", "compression-core", @@ -508,6 +532,17 @@ dependencies = [ "tokio", ] +[[package]] +name = "async-lock" +version = "3.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311" +dependencies = [ + "event-listener", + "event-listener-strategy", + "pin-project-lite", +] + [[package]] name = "async-recursion" version = "1.1.1" @@ -584,9 +619,9 @@ checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" [[package]] name = "aws-config" -version = "1.8.14" +version = "1.8.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a8fc176d53d6fe85017f230405e3255cedb4a02221cb55ed6d76dccbbb099b2" +checksum = "11493b0bad143270fb8ad284a096dd529ba91924c5409adeac856cc1bf047dbc" dependencies = [ "aws-credential-types", "aws-runtime", @@ -604,7 +639,7 @@ dependencies = [ "fastrand", "hex", "http 1.4.0", - "ring", + "sha1", "time", "tokio", "tracing", @@ -614,9 +649,9 @@ dependencies = [ [[package]] name = 
"aws-credential-types" -version = "1.2.13" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d203b0bf2626dcba8665f5cd0871d7c2c0930223d6b6be9097592fea21242d0" +checksum = "8f20799b373a1be121fe3005fba0c2090af9411573878f224df44b42727fcaf7" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", @@ -626,9 +661,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.15.4" +version = "1.16.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b7b6141e96a8c160799cc2d5adecd5cbbe5054cb8c7c4af53da0f83bb7ad256" +checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" dependencies = [ "aws-lc-sys", "zeroize", @@ -636,9 +671,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.37.0" +version = "0.39.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c34dda4df7017c8db52132f0f8a2e0f8161649d15723ed63fc00c82d0f2081a" +checksum = "1fa7e52a4c5c547c741610a2c6f123f3881e409b714cd27e6798ef020c514f0a" dependencies = [ "cc", "cmake", @@ -648,9 +683,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.7.1" +version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ede2ddc593e6c8acc6ce3358c28d6677a6dc49b65ba4b37a2befe14a11297e75" +checksum = "5fc0651c57e384202e47153c1260b84a9936e19803d747615edf199dc3b98d17" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -673,9 +708,9 @@ dependencies = [ [[package]] name = "aws-sdk-glue" -version = "1.139.0" +version = "1.142.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af3da2f5cf74983a60a7d5a182d76db1609ee4401057c98732ed8be973cb30ee" +checksum = "3962675ec1f2012ae6439814e784557550fa239a4a291bd4f33d8f514d4fdb5b" dependencies = [ "aws-credential-types", "aws-runtime", @@ -697,9 +732,9 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.94.0" +version = "1.97.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "699da1961a289b23842d88fe2984c6ff68735fdf9bdcbc69ceaeb2491c9bf434" +checksum = "9aadc669e184501caaa6beafb28c6267fc1baef0810fb58f9b205485ca3f2567" dependencies = [ "aws-credential-types", "aws-runtime", @@ -721,9 +756,9 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.96.0" +version = "1.99.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3e3a4cb3b124833eafea9afd1a6cc5f8ddf3efefffc6651ef76a03cbc6b4981" +checksum = "1342a7db8f358d3de0aed2007a0b54e875458e39848d54cc1d46700b2bfcb0a8" dependencies = [ "aws-credential-types", "aws-runtime", @@ -745,9 +780,9 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.98.0" +version = "1.101.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89c4f19655ab0856375e169865c91264de965bd74c407c7f1e403184b1049409" +checksum = "ab41ad64e4051ecabeea802d6a17845a91e83287e1dd249e6963ea1ba78c428a" dependencies = [ "aws-credential-types", "aws-runtime", @@ -770,9 +805,9 @@ dependencies = [ [[package]] name = "aws-sigv4" -version = "1.4.1" +version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37411f8e0f4bea0c3ca0958ce7f18f6439db24d555dbd809787262cd00926aa9" +checksum = "b0b660013a6683ab23797778e21f1f854744fdf05f68204b4cca4c8c04b5d1f4" dependencies = [ "aws-credential-types", "aws-smithy-http", @@ -792,9 +827,9 @@ dependencies = [ [[package]] name = "aws-smithy-async" -version = "1.2.13" +version = "1.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cc50d0f63e714784b84223abd7abbc8577de8c35d699e0edd19f0a88a08ae13" +checksum = "2ffcaf626bdda484571968400c326a244598634dc75fd451325a54ad1a59acfc" dependencies = [ "futures-util", "pin-project-lite", @@ -803,9 +838,9 @@ dependencies = [ [[package]] name = "aws-smithy-http" -version = "0.63.5" +version = "0.63.6" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d619373d490ad70966994801bc126846afaa0d1ee920697a031f0cf63f2568e7" +checksum = "ba1ab2dc1c2c3749ead27180d333c42f11be8b0e934058fb4b2258ee8dbe5231" dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", @@ -824,57 +859,51 @@ dependencies = [ [[package]] name = "aws-smithy-http-client" -version = "1.1.11" +version = "1.1.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00ccbb08c10f6bcf912f398188e42ee2eab5f1767ce215a02a73bc5df1bbdd95" +checksum = "6a2f165a7feee6f263028b899d0a181987f4fa7179a6411a32a439fba7c5f769" dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "h2 0.3.27", - "h2 0.4.13", - "http 0.2.12", + "h2", "http 1.4.0", - "http-body 0.4.6", - "hyper 0.14.32", - "hyper 1.8.1", - "hyper-rustls 0.24.2", - "hyper-rustls 0.27.7", + "hyper", + "hyper-rustls", "hyper-util", "pin-project-lite", - "rustls 0.21.12", - "rustls 0.23.36", + "rustls", "rustls-native-certs", "rustls-pki-types", "tokio", - "tokio-rustls 0.26.4", + "tokio-rustls", "tower", "tracing", ] [[package]] name = "aws-smithy-json" -version = "0.62.4" +version = "0.62.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27b3a779093e18cad88bbae08dc4261e1d95018c4c5b9356a52bcae7c0b6e9bb" +checksum = "9648b0bb82a2eedd844052c6ad2a1a822d1f8e3adee5fbf668366717e428856a" dependencies = [ "aws-smithy-types", ] [[package]] name = "aws-smithy-observability" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d3f39d5bb871aaf461d59144557f16d5927a5248a983a40654d9cf3b9ba183b" +checksum = "a06c2315d173edbf1920da8ba3a7189695827002e4c0fc961973ab1c54abca9c" dependencies = [ "aws-smithy-runtime-api", ] [[package]] name = "aws-smithy-query" -version = "0.60.14" +version = "0.60.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"05f76a580e3d8f8961e5d48763214025a2af65c2fa4cd1fb7f270a0e107a71b0" +checksum = "1a56d79744fb3edb5d722ef79d86081e121d3b9422cb209eb03aea6aa4f21ebd" dependencies = [ "aws-smithy-types", "urlencoding", @@ -882,9 +911,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.10.2" +version = "1.10.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22ccf7f6eba8b2dcf8ce9b74806c6c185659c311665c4bf8d6e71ebd454db6bf" +checksum = "028999056d2d2fd58a697232f9eec4a643cf73a71cf327690a7edad1d2af2110" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -907,9 +936,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.11.5" +version = "1.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4af6e5def28be846479bbeac55aa4603d6f7986fc5da4601ba324dd5d377516" +checksum = "876ab3c9c29791ba4ba02b780a3049e21ec63dabda09268b175272c3733a79e6" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -924,9 +953,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.4.5" +version = "1.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ca2734c16913a45343b37313605d84e7d8b34a4611598ce1d25b35860a2bed3" +checksum = "9d73dbfbaa8e4bc57b9045137680b958d274823509a360abfd8e1d514d40c95c" dependencies = [ "base64-simd", "bytes", @@ -950,18 +979,18 @@ dependencies = [ [[package]] name = "aws-smithy-xml" -version = "0.60.14" +version = "0.60.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b53543b4b86ed43f051644f704a98c7291b3618b67adf057ee77a366fa52fcaa" +checksum = "0ce02add1aa3677d022f8adf81dcbe3046a95f17a1b1e8979c145cd21d3d22b3" dependencies = [ "xmlparser", ] [[package]] name = "aws-types" -version = "1.3.13" +version = "1.3.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0470cc047657c6e286346bdf10a8719d26efd6a91626992e0e64481e44323e96" +checksum = 
"47c8323699dd9b3c8d5b3c13051ae9cdef58fd179957c882f8374dd8725962d9" dependencies = [ "aws-credential-types", "aws-smithy-async", @@ -984,7 +1013,7 @@ dependencies = [ "http 1.4.0", "http-body 1.0.1", "http-body-util", - "hyper 1.8.1", + "hyper", "hyper-util", "itoa", "matchit", @@ -1061,9 +1090,9 @@ dependencies = [ [[package]] name = "bitflags" -version = "2.10.0" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "812e12b5285cc515a9c72a5c1d3b6d46a19dac5acfef5265968c166106e31dd3" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" [[package]] name = "bitvec" @@ -1097,7 +1126,7 @@ dependencies = [ "cc", "cfg-if", "constant_time_eq", - "cpufeatures", + "cpufeatures 0.2.17", ] [[package]] @@ -1137,16 +1166,16 @@ dependencies = [ "home", "http 1.4.0", "http-body-util", - "hyper 1.8.1", + "hyper", "hyper-named-pipe", - "hyper-rustls 0.27.7", + "hyper-rustls", "hyper-util", "hyperlocal", "log", "num", "pin-project-lite", "rand 0.9.2", - "rustls 0.23.36", + "rustls", "rustls-native-certs", "rustls-pemfile", "rustls-pki-types", @@ -1197,9 +1226,9 @@ dependencies = [ [[package]] name = "bon" -version = "3.8.2" +version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "234655ec178edd82b891e262ea7cf71f6584bcd09eff94db786be23f1821825c" +checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" dependencies = [ "bon-macros", "rustversion", @@ -1207,11 +1236,11 @@ dependencies = [ [[package]] name = "bon-macros" -version = "3.8.2" +version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89ec27229c38ed0eb3c0feee3d2c1d6a4379ae44f418a29a658890e062d8f365" +checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" dependencies = [ - "darling 0.23.0", + "darling", "ident_case", "prettyplease", "proc-macro2", @@ -1222,19 +1251,20 @@ dependencies = [ [[package]] name = "borsh" -version = "1.6.0" 
+version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1da5ab77c1437701eeff7c88d968729e7766172279eab0676857b3d63af7a6f" +checksum = "cfd1e3f8955a5d7de9fab72fc8373fade9fb8a703968cb200ae3dc6cf08e185a" dependencies = [ "borsh-derive", + "bytes", "cfg_aliases", ] [[package]] name = "borsh-derive" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0686c856aa6aac0c4498f936d7d6a02df690f614c03e4d906d1018062b5c5e2c" +checksum = "bfcfdc083699101d5a7965e49925975f2f55060f94f9a05e7187be95d530ca59" dependencies = [ "once_cell", "proc-macro-crate", @@ -1266,9 +1296,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.19.1" +version = "3.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5dd9dc738b7a8311c7ade152424974d8115f2cdad61e8dab8dac9f2362298510" +checksum = "5d20789868f4b01b2f2caec9f5c4e0213b41e3e5702a50157d699ae31ced2fcb" [[package]] name = "bytecheck" @@ -1294,9 +1324,9 @@ dependencies = [ [[package]] name = "bytemuck" -version = "1.24.0" +version = "1.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1fbdf580320f38b612e485521afda1ee26d10cc9884efaaa750d383e13e3c5f4" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" dependencies = [ "bytemuck_derive", ] @@ -1354,9 +1384,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.54" +version = "1.2.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6354c81bbfd62d9cfa9cb3c773c2b7b2a3a482d569de977fd0e961f6e7c00583" +checksum = "7a0dd1ca384932ff3641c8718a02769f1698e7563dc6974ffd03346116310423" dependencies = [ "find-msvc-tools", "jobserver", @@ -1376,6 +1406,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" +[[package]] +name = "chacha20" +version = "0.10.0" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f8d983286843e49675a4b7a2d174efe136dc93a18d69130dd18198a6c167601" +dependencies = [ + "cfg-if", + "cpufeatures 0.3.0", + "rand_core 0.10.0", +] + [[package]] name = "chrono" version = "0.4.44" @@ -1423,9 +1464,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.60" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2797f34da339ce31042b27d23607e051786132987f595b02ba4f6a6dffb7030a" +checksum = "b193af5b67834b676abd72466a96c1024e6a6ad978a1f484bd90b85c94041351" dependencies = [ "clap_builder", "clap_derive", @@ -1433,11 +1474,11 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.60" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "24a241312cea5059b13574bb9b3861cabf758b879c15190b37b6d6fd63ab6876" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ - "anstream", + "anstream 1.0.0", "anstyle", "clap_lex", "strsim", @@ -1445,9 +1486,9 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.55" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +checksum = "1110bd8a634a1ab8cb04345d8d878267d57c3cf1b38d91b71af6686408bbca6a" dependencies = [ "heck", "proc-macro2", @@ -1457,9 +1498,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "1.0.0" +version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "cmake" @@ -1472,15 +1513,15 @@ dependencies = [ [[package]] name = "colorchoice" -version = "1.0.4" +version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +checksum = "1d07550c9036bf2ae0c684c4297d503f838287c83c53686d05370d0e139ae570" [[package]] name = "comfy-table" -version = "7.1.4" +version = "7.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4a65ebfec4fb190b6f90e944a817d60499ee0744e582530e2c9900a22e591d9a" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" dependencies = [ "crossterm", "unicode-segmentation", @@ -1489,9 +1530,9 @@ dependencies = [ [[package]] name = "compression-codecs" -version = "0.4.36" +version = "0.4.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00828ba6fd27b45a448e57dbfe84f1029d4c9f26b368157e9a448a5f49a2ec2a" +checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7" dependencies = [ "bzip2", "compression-core", @@ -1508,6 +1549,15 @@ version = "0.4.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + [[package]] name = "console" version = "0.15.11" @@ -1582,6 +1632,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc" version = "3.4.0" @@ -1632,14 +1691,15 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crossterm" -version = "0.28.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"829d955a0bb380ef178a640b91779e3987da38c9aea133b20614cfed8cdea9c6" +checksum = "d8b9f2e4c67f833b660cdb0a3523065869fb35570177239812ed4c905aeff87b" dependencies = [ "bitflags", "crossterm_winapi", + "document-features", "parking_lot", - "rustix 0.38.44", + "rustix", "winapi", ] @@ -1699,38 +1759,14 @@ dependencies = [ "cipher", ] -[[package]] -name = "darling" -version = "0.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdf337090841a411e2a7f3deb9187445851f91b309c0c0a29e05f74a00a48c0" -dependencies = [ - "darling_core 0.21.3", - "darling_macro 0.21.3", -] - [[package]] name = "darling" version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ - "darling_core 0.23.0", - "darling_macro 0.23.0", -] - -[[package]] -name = "darling_core" -version = "0.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1247195ecd7e3c85f83c8d2a366e4210d588e802133e1e355180a9870b517ea4" -dependencies = [ - "fnv", - "ident_case", - "proc-macro2", - "quote", - "strsim", - "syn 2.0.117", + "darling_core", + "darling_macro", ] [[package]] @@ -1746,24 +1782,13 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "darling_macro" -version = "0.21.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d38308df82d1080de0afee5d069fa14b0326a88c14f15c5ccda35b4a6c414c81" -dependencies = [ - "darling_core 0.21.3", - "quote", - "syn 2.0.117", -] - [[package]] name = "darling_macro" version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ - "darling_core 0.23.0", + "darling_core", "quote", "syn 2.0.117", ] @@ -1784,9 +1809,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "52.1.0" +version = "53.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d12ee9fdc6cdb5898c7691bb994f0ba606c4acc93a2258d78bb9f26ff8158bb3" +checksum = "de9f8117889ba9503440f1dd79ebab32ba52ccf1720bb83cd718a29d4edc0d16" dependencies = [ "arrow", "arrow-schema", @@ -1830,6 +1855,7 @@ dependencies = [ "rand 0.9.2", "regex", "serde", + "sqlparser", "tempfile", "tokio", "url", @@ -1839,9 +1865,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "462dc9ef45e5d688aeaae49a7e310587e81b6016b9d03bace5626ad0043e5a9e" +checksum = "be893b73a13671f310ffcc8da2c546b81efcc54c22e0382c0a28aa3537017137" dependencies = [ "arrow", "async-trait", @@ -1864,9 +1890,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b96dbf1d728fc321817b744eb5080cdd75312faa6980b338817f68f3caa4208" +checksum = "830487b51ed83807d6b32d6325f349c3144ae0c9bf772cf2a712db180c31d5e6" dependencies = [ "arrow", "async-trait", @@ -1887,9 +1913,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3237a6ff0d2149af4631290074289cae548c9863c885d821315d54c6673a074a" +checksum = "0d7663f3af955292f8004e74bcaf8f7ea3d66cc38438749615bb84815b61a293" dependencies = [ "ahash 0.8.12", "apache-avro", @@ -1899,6 +1925,7 @@ dependencies = [ "half", "hashbrown 0.16.1", "indexmap 2.13.0", + "itertools", "libc", "log", "object_store", @@ -1912,9 +1939,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70b5e34026af55a1bfccb1ef0a763cf1f64e77c696ffcf5a128a278c31236528" +checksum = 
"5f590205c7e32fe1fea48dd53ffb406e56ae0e7a062213a3ac848db8771641bd" dependencies = [ "futures", "log", @@ -1923,9 +1950,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1b2a6be734cc3785e18bbf2a7f2b22537f6b9fb960d79617775a51568c281842" +checksum = "fde1e030a9dc87b743c806fbd631f5ecfa2ccaa4ffb61fa19144a07fea406b79" dependencies = [ "arrow", "async-compression", @@ -1958,9 +1985,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1739b9b07c9236389e09c74f770e88aff7055250774e9def7d3f4f56b3dcc7be" +checksum = "331ebae7055dc108f9b54994b93dff91f3a17445539efe5b74e89264f7b36e15" dependencies = [ "arrow", "arrow-ipc", @@ -1982,9 +2009,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "828088c2fb681cc0e06fb42f541f76c82a0c10278f9fd6334e22c8d1e3574ee7" +checksum = "49dda81c79b6ba57b1853a9158abc66eb85a3aa1cede0c517dabec6d8a4ed3aa" dependencies = [ "apache-avro", "arrow", @@ -2002,9 +2029,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61c73bc54b518bbba7c7650299d07d58730293cfba4356f6f428cc94c20b7600" +checksum = "9e0d475088325e2986876aa27bb30d0574f72a22955a527d202f454681d55c5c" dependencies = [ "arrow", "async-trait", @@ -2025,9 +2052,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "37812c8494c698c4d889374ecfabbff780f1f26d9ec095dd1bddfc2a8ca12559" +checksum = 
"ea1520d81f31770f3ad6ee98b391e75e87a68a5bb90de70064ace5e0a7182fe8" dependencies = [ "arrow", "async-trait", @@ -2042,14 +2069,16 @@ dependencies = [ "datafusion-session", "futures", "object_store", + "serde_json", "tokio", + "tokio-stream", ] [[package]] name = "datafusion-datasource-parquet" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2210937ecd9f0e824c397e73f4b5385c97cd1aff43ab2b5836fcfd2d321523fb" +checksum = "95be805d0742ab129720f4c51ad9242cd872599cdb076098b03f061fcdc7f946" dependencies = [ "arrow", "async-trait", @@ -2077,22 +2106,24 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c825f969126bc2ef6a6a02d94b3c07abff871acf4d6dd759ce1255edb7923ce" +checksum = "5c93ad9e37730d2c7196e68616f3f2dd3b04c892e03acd3a8eeca6e177f3c06a" [[package]] name = "datafusion-execution" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa03ef05a2c2f90dd6c743e3e111078e322f4b395d20d4b4d431a245d79521ae" +checksum = "9437d3cd5d363f9319f8122182d4d233427de79c7eb748f23054c9aaa0fdd8df" dependencies = [ "arrow", + "arrow-buffer", "async-trait", "chrono", "dashmap", "datafusion-common", "datafusion-expr", + "datafusion-physical-expr-common", "futures", "log", "object_store", @@ -2104,9 +2135,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef33934c1f98ee695cc51192cc5f9ed3a8febee84fdbcd9131bf9d3a9a78276f" +checksum = "67164333342b86521d6d93fa54081ee39839894fb10f7a700c099af96d7552cf" dependencies = [ "arrow", "async-trait", @@ -2127,9 +2158,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "52.1.0" +version = "53.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "000c98206e3dd47d2939a94b6c67af4bfa6732dd668ac4fafdbde408fd9134ea" +checksum = "ab05fdd00e05d5a6ee362882546d29d6d3df43a6c55355164a7fbee12d163bc9" dependencies = [ "arrow", "datafusion-common", @@ -2140,9 +2171,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "379b01418ab95ca947014066248c22139fe9af9289354de10b445bd000d5d276" +checksum = "04fb863482d987cf938db2079e07ab0d3bb64595f28907a6c2f8671ad71cca7e" dependencies = [ "arrow", "arrow-buffer", @@ -2161,6 +2192,7 @@ dependencies = [ "itertools", "log", "md-5", + "memchr", "num-traits", "rand 0.9.2", "regex", @@ -2171,9 +2203,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fd00d5454ba4c3f8ebbd04bd6a6a9dc7ced7c56d883f70f2076c188be8459e4c" +checksum = "829856f4e14275fb376c104f27cbf3c3b57a9cfe24885d98677525f5e43ce8d6" dependencies = [ "ahash 0.8.12", "arrow", @@ -2187,14 +2219,15 @@ dependencies = [ "datafusion-physical-expr-common", "half", "log", + "num-traits", "paste", ] [[package]] name = "datafusion-functions-aggregate-common" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aec06b380729a87210a4e11f555ec2d729a328142253f8d557b87593622ecc9f" +checksum = "08af79cc3d2aa874a362fb97decfcbd73d687190cb096f16a6c85a7780cce311" dependencies = [ "ahash 0.8.12", "arrow", @@ -2205,9 +2238,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "904f48d45e0f1eb7d0eb5c0f80f2b5c6046a85454364a6b16a2e0b46f62e7dff" +checksum = "465ae3368146d49c2eda3e2c0ef114424c87e8a6b509ab34c1026ace6497e790" 
dependencies = [ "arrow", "arrow-ord", @@ -2221,16 +2254,18 @@ dependencies = [ "datafusion-functions-aggregate-common", "datafusion-macros", "datafusion-physical-expr-common", + "hashbrown 0.16.1", "itertools", + "itoa", "log", "paste", ] [[package]] name = "datafusion-functions-table" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9a0d20e2b887e11bee24f7734d780a2588b925796ac741c3118dd06d5aa77f0" +checksum = "6156e6b22fcf1784112fc0173f3ae6e78c8fdb4d3ed0eace9543873b437e2af6" dependencies = [ "arrow", "async-trait", @@ -2244,9 +2279,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3414b0a07e39b6979fe3a69c7aa79a9f1369f1d5c8e52146e66058be1b285ee" +checksum = "ca7baec14f866729012efb89011a6973f3a346dc8090c567bfcd328deff551c1" dependencies = [ "arrow", "datafusion-common", @@ -2262,9 +2297,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bf2feae63cd4754e31add64ce75cae07d015bce4bb41cd09872f93add32523a" +checksum = "159228c3280d342658466bb556dc24de30047fe1d7e559dc5d16ccc5324166f9" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2272,9 +2307,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c4fe888aeb6a095c4bcbe8ac1874c4b9a4c7ffa2ba849db7922683ba20875aaf" +checksum = "e5427e5da5edca4d21ea1c7f50e1c9421775fe33d7d5726e5641a833566e7578" dependencies = [ "datafusion-doc", "quote", @@ -2283,9 +2318,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "8a6527c063ae305c11be397a86d8193936f4b84d137fe40bd706dfc178cf733c" +checksum = "89099eefcd5b223ec685c36a41d35c69239236310d71d339f2af0fa4383f3f46" dependencies = [ "arrow", "chrono", @@ -2296,15 +2331,16 @@ dependencies = [ "indexmap 2.13.0", "itertools", "log", + "recursive", "regex", "regex-syntax", ] [[package]] name = "datafusion-physical-expr" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0bb028323dd4efd049dd8a78d78fe81b2b969447b39c51424167f973ac5811d9" +checksum = "0f222df5195d605d79098ef37bdd5323bff0131c9d877a24da6ec98dfca9fe36" dependencies = [ "ahash 0.8.12", "arrow", @@ -2326,9 +2362,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78fe0826aef7eab6b4b61533d811234a7a9e5e458331ebbf94152a51fc8ab433" +checksum = "40838625d63d9c12549d81979db3dd675d159055eb9135009ba272ab0e8d0f64" dependencies = [ "arrow", "datafusion-common", @@ -2341,9 +2377,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cfccd388620734c661bd8b7ca93c44cdd59fecc9b550eea416a78ffcbb29475f" +checksum = "eacbcc4cfd502558184ed58fa3c72e775ec65bf077eef5fd2b3453db676f893c" dependencies = [ "ahash 0.8.12", "arrow", @@ -2358,9 +2394,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bde5fa10e73259a03b705d5fddc136516814ab5f441b939525618a4070f5a059" +checksum = "d501d0e1d0910f015677121601ac177ec59272ef5c9324d1147b394988f40941" dependencies = [ "arrow", "datafusion-common", @@ -2372,13 +2408,14 @@ dependencies = [ "datafusion-physical-plan", "datafusion-pruning", "itertools", + "recursive", ] [[package]] 
name = "datafusion-physical-plan" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e1098760fb29127c24cc9ade3277051dc73c9ed0ac0131bd7bcd742e0ad7470" +checksum = "463c88ad6f1ecab1810f4c9f046898bee035b370137eb79b2b2db925e270631d" dependencies = [ "ahash 0.8.12", "arrow", @@ -2400,6 +2437,7 @@ dependencies = [ "indexmap 2.13.0", "itertools", "log", + "num-traits", "parking_lot", "pin-project-lite", "tokio", @@ -2407,9 +2445,9 @@ dependencies = [ [[package]] name = "datafusion-proto" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0cf75daf56aa6b1c6867cc33ff0fb035d517d6d06737fd355a3e1ef67cba6e7a" +checksum = "677ee4448a010ed5faeff8d73ff78972c2ace59eff3cd7bd15833a1dafa00492" dependencies = [ "arrow", "chrono", @@ -2430,13 +2468,14 @@ dependencies = [ "datafusion-proto-common", "object_store", "prost", + "rand 0.9.2", ] [[package]] name = "datafusion-proto-common" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12a0cb3cce232a3de0d14ef44b58a6537aeb1362cfb6cf4d808691ddbb918956" +checksum = "965eca01edc8259edbbd95883a00b6d81e329fd44a019cfac3a03b026a83eade" dependencies = [ "arrow", "datafusion-common", @@ -2445,9 +2484,9 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64d0fef4201777b52951edec086c21a5b246f3c82621569ddb4a26f488bc38a9" +checksum = "2857618a0ecbd8cd0cf29826889edd3a25774ec26b2995fc3862095c95d88fc6" dependencies = [ "arrow", "datafusion-common", @@ -2462,9 +2501,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f71f1e39e8f2acbf1c63b0e93756c2e970a64729dab70ac789587d6237c4fde0" +checksum = 
"ef8637e35022c5c775003b3ab1debc6b4a8f0eb41b069bdd5475dd3aa93f6eba" dependencies = [ "async-trait", "datafusion-common", @@ -2476,40 +2515,46 @@ dependencies = [ [[package]] name = "datafusion-spark" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "556c431f5f2259620c8223254c0ef57aa9a85c576d4da0166157260f71eb0e25" +checksum = "923a8b871962a9d860f036f743a20af50ff04729f1da2468ed220dab4f61c97d" dependencies = [ "arrow", "bigdecimal", "chrono", "crc32fast", + "datafusion", "datafusion-catalog", "datafusion-common", "datafusion-execution", "datafusion-expr", "datafusion-functions", + "datafusion-functions-aggregate", "datafusion-functions-nested", "log", "percent-encoding", "rand 0.9.2", + "serde_json", "sha1", + "sha2", "url", ] [[package]] name = "datafusion-sql" -version = "52.1.0" +version = "53.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f44693cfcaeb7a9f12d71d1c576c3a6dc025a12cef209375fa2d16fb3b5670ee" +checksum = "12d9e9f16a1692a11c94bcc418191fa15fd2b4d72a0c1a0c607db93c0b84dd81" dependencies = [ "arrow", "bigdecimal", "chrono", "datafusion-common", "datafusion-expr", + "datafusion-functions-nested", "indexmap 2.13.0", "log", + "recursive", "regex", "sqlparser", ] @@ -2532,53 +2577,11 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "092966b41edc516079bdf31ec78a2e0588d1d0c08f78b91d8307215928642b2b" -[[package]] -name = "delta_kernel" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "59c53769ac0aea07e2efeb0859eb759cffb6007486e0cb1c27cc32a1e72ddd8f" -dependencies = [ - "arrow", - "bytes", - "chrono", - "comfy-table", - "crc", - "delta_kernel_derive", - "futures", - "indexmap 2.13.0", - "itertools", - "object_store", - "parquet", - "reqwest", - "roaring", - "rustc_version", - "serde", - "serde_json", - "strum", - "thiserror", - "tokio", - "tracing", - "url", - "uuid", 
- "z85", -] - -[[package]] -name = "delta_kernel_derive" -version = "0.18.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbe4ce45d99298a34ffdf533840f5cad5002682f5a397ac469b8381a2f47a378" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.117", -] - [[package]] name = "deranged" -version = "0.5.5" +version = "0.5.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ececcb659e7ba858fb4f10388c250a7252eb0a27373f1a72b8748afdd248e587" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" dependencies = [ "powerfmt", "serde_core", @@ -2659,13 +2662,13 @@ dependencies = [ [[package]] name = "dns-lookup" -version = "2.1.1" +version = "3.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf5597a4b7fe5275fc9dcf88ce26326bc8e4cb87d0130f33752d4c5f717793cf" +checksum = "6e39034cee21a2f5bbb66ba0e3689819c4bb5d00382a282006e802a7ffa6c41d" dependencies = [ "cfg-if", "libc", - "socket2 0.6.2", + "socket2", "windows-sys 0.60.2", ] @@ -2680,6 +2683,15 @@ dependencies = [ "serde_json", ] +[[package]] +name = "document-features" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b8a88685455ed29a21542a33abd9cb6510b6b129abadabdcef0f4c55bc8f61" +dependencies = [ + "litrs", +] + [[package]] name = "dunce" version = "1.0.5" @@ -2761,7 +2773,7 @@ version = "0.11.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b2daee4ea451f429a58296525ddf28b45a3b64f1acf6587e2067437bb11e218d" dependencies = [ - "anstream", + "anstream 0.6.21", "anstyle", "env_filter", "jiff", @@ -2794,6 +2806,27 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "event-listener" +version = "5.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", 
+] + +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener", + "pin-project-lite", +] + [[package]] name = "fastant" version = "0.1.11" @@ -2900,9 +2933,9 @@ dependencies = [ [[package]] name = "find-msvc-tools" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" [[package]] name = "fixedbitset" @@ -2922,9 +2955,9 @@ dependencies = [ [[package]] name = "flate2" -version = "1.1.8" +version = "1.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b375d6465b98090a5f25b1c7703f3859783755aa9a80433b36e0379a3ec2f369" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" dependencies = [ "crc32fast", "miniz_oxide", @@ -3118,20 +3151,21 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "r-efi", + "r-efi 5.3.0", "wasip2", "wasm-bindgen", ] [[package]] name = "getrandom" -version = "0.4.1" +version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "139ef39800118c7683f2fd3c98c1b23c09ae076556b435f8e9064ae108aaeeec" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", + "rand_core 0.10.0", "wasip2", "wasip3", ] @@ -3152,25 +3186,6 @@ version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" -[[package]] -name = "h2" -version = "0.3.27" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" -dependencies = [ - "bytes", 
- "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http 0.2.12", - "indexmap 2.13.0", - "slab", - "tokio", - "tokio-util", - "tracing", -] - [[package]] name = "h2" version = "0.4.13" @@ -3243,9 +3258,9 @@ dependencies = [ [[package]] name = "hdfs-native" -version = "0.12.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08603b51f970930b0025b92d3f6c0ba39a0a6d0dfb4b3f527af58768adc2f3b2" +checksum = "51610510377a0847d53b78b53f9c6c9b7df3ffb300d1181b2e04f68bba363734" dependencies = [ "aes", "base64 0.22.1", @@ -3274,7 +3289,7 @@ dependencies = [ "rand 0.9.2", "regex", "roxmltree", - "socket2 0.6.2", + "socket2", "thiserror", "tokio", "url", @@ -3284,9 +3299,9 @@ dependencies = [ [[package]] name = "hdfs-native-object-store" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d5495a763df493ea3883271e727914e83a0ce188a32e230ca820e3cb5e188d2" +checksum = "c2a8fd74d01f5831e0a81581e252831c70572b7a9145b813fad99323460f2f69" dependencies = [ "async-trait", "bytes", @@ -3447,30 +3462,6 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" -[[package]] -name = "hyper" -version = "0.14.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.3.27", - "http 0.2.12", - "http-body 0.4.6", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "socket2 0.5.10", - "tokio", - "tower-service", - "tracing", - "want", -] - [[package]] name = "hyper" version = "1.8.1" @@ -3481,7 +3472,7 @@ dependencies = [ "bytes", "futures-channel", "futures-core", - "h2 0.4.13", + "h2", "http 1.4.0", "http-body 1.0.1", "httparse", @@ -3501,7 +3492,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "73b7d8abf35697b81a825e386fc151e0d503e8cb5fcb93cc8669c376dfd6f278" dependencies = [ "hex", - "hyper 1.8.1", + "hyper", "hyper-util", "pin-project-lite", "tokio", @@ -3509,21 +3500,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "hyper-rustls" -version = "0.24.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" -dependencies = [ - "futures-util", - "http 0.2.12", - "hyper 0.14.32", - "log", - "rustls 0.21.12", - "tokio", - "tokio-rustls 0.24.1", -] - [[package]] name = "hyper-rustls" version = "0.27.7" @@ -3531,14 +3507,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" dependencies = [ "http 1.4.0", - "hyper 1.8.1", + "hyper", "hyper-util", "log", - "rustls 0.23.36", + "rustls", "rustls-native-certs", "rustls-pki-types", "tokio", - "tokio-rustls 0.26.4", + "tokio-rustls", "tower-service", "webpki-roots", ] @@ -3549,7 +3525,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b90d566bffbce6a75bd8b09a05aa8c2cb1fabb6cb348f8840c9e4c90a0d83b0" dependencies = [ - "hyper 1.8.1", + "hyper", "hyper-util", "pin-project-lite", "tokio", @@ -3558,23 +3534,22 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.19" +version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "727805d60e7938b76b826a6ef209eb70eaa1812794f9424d4a4e2d740662df5f" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ "base64 0.22.1", "bytes", "futures-channel", - "futures-core", "futures-util", "http 1.4.0", "http-body 1.0.1", - "hyper 1.8.1", + "hyper", "ipnet", "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.2", + "socket2", "system-configuration", "tokio", "tower-service", @@ -3590,7 
+3565,7 @@ checksum = "986c5ce3b994526b3cd75578e62554abd09f0899d6206de48b3e96ab34ccc8c7" dependencies = [ "hex", "http-body-util", - "hyper 1.8.1", + "hyper", "hyper-util", "pin-project-lite", "tokio", @@ -3771,15 +3746,6 @@ dependencies = [ "web-time", ] -[[package]] -name = "indoc" -version = "2.0.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79cf5c93f93228cf8efb3ba362535fb11199ac548a09ce117c9b1adc3030d706" -dependencies = [ - "rustversion", -] - [[package]] name = "inlinable_string" version = "0.1.15" @@ -3804,15 +3770,15 @@ checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.11.0" +version = "2.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" [[package]] name = "iri-string" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +checksum = "d8e7418f59cc01c88316161279a7f665217ae316b388e58a0d10e29f54f1e5eb" dependencies = [ "memchr", "serde", @@ -3835,15 +3801,15 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.17" +version = "1.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" [[package]] name = "jiff" -version = "0.2.18" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e67e8da4c49d6d9909fe03361f9b620f58898859f5c7aded68351e85e71ecf50" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" dependencies = [ "jiff-static", "log", @@ -3854,9 +3820,9 @@ dependencies = [ [[package]] name = "jiff-static" 
-version = "0.2.18" +version = "0.2.23" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0c84ee7f197eca9a86c6fd6cb771e55eb991632f15f2bc3ca6ec838929e6e78" +checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" dependencies = [ "proc-macro2", "quote", @@ -3865,9 +3831,9 @@ dependencies = [ [[package]] name = "jiter" -version = "0.11.1" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e805fb15a8249d25213202b9098f7b9ad00f8042ccc6f0063d2ae7b33f3d7da" +checksum = "020ba671987d7444d251d3ee5340be1bf4606cd6c0b53e6f4066b5a1ee376b22" dependencies = [ "ahash 0.8.12", "bitvec", @@ -3890,9 +3856,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.85" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" +checksum = "b49715b7073f385ba4bc528e5747d02e66cb39c6146efb66b781f131f0fb399c" dependencies = [ "once_cell", "wasm-bindgen", @@ -3900,9 +3866,9 @@ dependencies = [ [[package]] name = "jsonpath-rust" -version = "0.7.5" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c00ae348f9f8fd2d09f82a98ca381c60df9e0820d8d79fce43e649b4dc3128b" +checksum = "633a7320c4bb672863a3782e89b9094ad70285e097ff6832cddd0ec615beadfa" dependencies = [ "pest", "pest_derive", @@ -3913,21 +3879,21 @@ dependencies = [ [[package]] name = "k8s-openapi" -version = "0.26.1" +version = "0.27.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06d9e5e61dd037cdc51da0d7e2b2be10f497478ea7e120d85dad632adb99882b" +checksum = "51b326f5219dd55872a72c1b6ddd1b830b8334996c667449c29391d657d78d5e" dependencies = [ "base64 0.22.1", - "chrono", + "jiff", "serde", "serde_json", ] [[package]] name = "kube" -version = "2.0.1" +version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"48e7bb0b6a46502cc20e4575b6ff401af45cfea150b34ba272a3410b78aa014e" +checksum = "acc5a6a69da2975ed9925d56b5dcfc9cc739b66f37add06785b7c9f6d1e88741" dependencies = [ "k8s-openapi", "kube-client", @@ -3936,28 +3902,27 @@ dependencies = [ [[package]] name = "kube-client" -version = "2.0.1" +version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4987d57a184d2b5294fdad3d7fc7f278899469d21a4da39a8f6ca16426567a36" +checksum = "0fcaf2d1f1a91e1805d4cd82e8333c022767ae8ffd65909bbef6802733a7dd40" dependencies = [ "base64 0.22.1", "bytes", - "chrono", "either", "futures", - "home", "http 1.4.0", "http-body 1.0.1", "http-body-util", - "hyper 1.8.1", - "hyper-rustls 0.27.7", + "hyper", + "hyper-rustls", "hyper-timeout", "hyper-util", + "jiff", "jsonpath-rust", "k8s-openapi", "kube-core", "pem", - "rustls 0.23.36", + "rustls", "secrecy", "serde", "serde_json", @@ -3972,14 +3937,14 @@ dependencies = [ [[package]] name = "kube-core" -version = "2.0.1" +version = "3.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "914bbb770e7bb721a06e3538c0edd2babed46447d128f7c21caa68747060ee73" +checksum = "f126d2db7a8b532ec1d839ece2a71e2485dc3bbca6cc3c3f929becaa810e719e" dependencies = [ - "chrono", "derive_more", "form_urlencoded", "http 1.4.0", + "jiff", "k8s-openapi", "serde", "serde-value", @@ -4064,15 +4029,15 @@ checksum = "2c4a545a15244c7d945065b5d392b2d2d7f21526fba56ce51467b06ed445e8f7" [[package]] name = "libc" -version = "0.2.180" +version = "0.2.183" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcc35a38544a891a5f7c865aca548a982ccb3b8650a5b06d0fd33a10283c56fc" +checksum = "b5b646652bf6661599e1da8901b3b9522896f01e736bad5f723fe7a3a27f899d" [[package]] name = "libloading" -version = "0.8.9" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7c4b02199fee7c5d21a5ae7d8cfa79a6ef5bb2fc834d6e9058e89c825efdc55" +checksum = 
"754ca22de805bb5744484a5b151a9e1a8e837d5dc232c2d7d8c2e3492edc8b60" dependencies = [ "cfg-if", "windows-link", @@ -4080,9 +4045,9 @@ dependencies = [ [[package]] name = "liblzma" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "73c36d08cad03a3fbe2c4e7bb3a9e84c57e4ee4135ed0b065cade3d98480c648" +checksum = "b6033b77c21d1f56deeae8014eb9fbe7bdf1765185a6c508b5ca82eeaed7f899" dependencies = [ "liblzma-sys", ] @@ -4116,26 +4081,21 @@ dependencies = [ [[package]] name = "libredox" -version = "0.1.12" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d0b95e02c851351f877147b7deea7b1afb1df71b63aa5f8270716e0c5720616" +checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" dependencies = [ "bitflags", "libc", - "redox_syscall 0.7.0", + "plain", + "redox_syscall 0.7.3", ] [[package]] name = "linux-raw-sys" -version = "0.4.15" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" - -[[package]] -name = "linux-raw-sys" -version = "0.11.0" +version = "0.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df1d3c3b53da64cf5760482273a98e575c651a67eec7f77df96b5b642de8f039" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" [[package]] name = "litemap" @@ -4143,6 +4103,12 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" +[[package]] +name = "litrs" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11d3d7f243d5c5a8b9bb5d6dd2b1602c0cb0b9db1621bafc7ed66e35ff9fe092" + [[package]] name = "lock_api" version = "0.4.14" @@ -4166,18 +4132,18 @@ checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" [[package]] name = "lz4_flex" 
-version = "0.12.0" +version = "0.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab6473172471198271ff72e9379150e9dfd70d8e533e0752a27e515b48dd375e" +checksum = "db9a0d582c2874f68138a16ce1867e0ffde6c0bb0a0df85e1f36d04146db488a" dependencies = [ "twox-hash", ] [[package]] name = "marrow" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea734fcb7619dfcc47a396f7bf0c72571ccc8c18ae7236ae028d485b27424b74" +checksum = "f5240d6977234968ff9ad254bfa73aa397fb51e41dcb22b1eb85835e9295485b" dependencies = [ "arrow-array", "arrow-buffer", @@ -4206,18 +4172,9 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f52b00d39961fc5b2736ea853c9cc86238e165017a493d1d5c8eac6bdc4cc273" - -[[package]] -name = "memoffset" -version = "0.9.1" +version = "2.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "488016bfae457b036d996092f6cb448677611ce4449e970ceaf42695203f218a" -dependencies = [ - "autocfg", -] +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" [[package]] name = "mimalloc" @@ -4267,14 +4224,17 @@ dependencies = [ [[package]] name = "moka" -version = "0.12.13" +version = "0.12.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4ac832c50ced444ef6be0767a008b02c106a909ba79d1d830501e94b96f6b7e" +checksum = "957228ad12042ee839f93c8f257b62b4c0ab5eaae1d4fa60de53b27c9d7c5046" dependencies = [ + "async-lock", "crossbeam-channel", "crossbeam-epoch", "crossbeam-utils", "equivalent", + "event-listener", + "futures-util", "parking_lot", "portable-atomic", "smallvec", @@ -4409,9 +4369,9 @@ dependencies = [ [[package]] name = "num_enum" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1207a7e20ad57b847bbddc6776b968420d38292bbfe2089accff5e19e82454c" 
+checksum = "5d0bca838442ec211fa11de3a8b0e0e8f3a4522575b5c4c06ed722e005036f26" dependencies = [ "num_enum_derive", "rustversion", @@ -4419,9 +4379,9 @@ dependencies = [ [[package]] name = "num_enum_derive" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff32365de1b6743cb203b710788263c44a03de03802daf96092f2da4fe6ba4d7" +checksum = "680998035259dcfcafe653688bf2aa6d3e2dc05e98be6ab46afb089dc84f1df8" dependencies = [ "proc-macro-crate", "proc-macro2", @@ -4437,39 +4397,41 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" [[package]] name = "object" -version = "0.32.2" +version = "0.37.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" dependencies = [ "memchr", ] [[package]] name = "object_store" -version = "0.12.5" +version = "0.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +checksum = "622acbc9100d3c10e2ee15804b0caa40e55c933d5aa53814cd520805b7958a49" dependencies = [ "async-trait", "base64 0.22.1", "bytes", "chrono", "form_urlencoded", - "futures", + "futures-channel", + "futures-core", + "futures-util", "http 1.4.0", "http-body-util", "httparse", "humantime", - "hyper 1.8.1", + "hyper", "itertools", "md-5", "parking_lot", "percent-encoding", "quick-xml", - "rand 0.9.2", + "rand 0.10.0", "reqwest", "ring", - "rustls-pemfile", + "rustls-pki-types", "serde", "serde_json", "serde_urlencoded", @@ -4484,9 +4446,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.21.3" +version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +checksum = 
"9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" [[package]] name = "once_cell_polyfill" @@ -4556,9 +4518,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.31.0" +version = "0.31.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2366db2dca4d2ad033cad11e6ee42844fd727007af5ad04a1730f4cb8163bf" +checksum = "1f69cd6acbb9af919df949cd1ec9e5e7fdc2ef15d234b6b795aaa525cc02f71f" dependencies = [ "http 1.4.0", "opentelemetry", @@ -4635,6 +4597,12 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.5" @@ -4660,14 +4628,13 @@ dependencies = [ [[package]] name = "parquet" -version = "57.3.0" +version = "58.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee96b29972a257b855ff2341b37e61af5f12d6af1158b6dcdb5b31ea07bb3cb" +checksum = "7d3f9f2205199603564127932b89695f52b62322f541d0fc7179d57c2e1c9877" dependencies = [ "ahash 0.8.12", "arrow-array", "arrow-buffer", - "arrow-cast", "arrow-data", "arrow-ipc", "arrow-schema", @@ -4804,9 +4771,9 @@ checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" [[package]] name = "pest" -version = "2.8.5" +version = "2.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c9eb05c21a464ea704b53158d358a31e6425db2f63a1a7312268b05fe2b75f7" +checksum = "e0848c601009d37dfa3430c4666e147e49cdcf1b92ecd3e63657d8a5f19da662" dependencies = [ "memchr", "ucd-trie", @@ -4814,9 +4781,9 @@ dependencies = [ [[package]] name = "pest_derive" -version = "2.8.5" +version = "2.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"68f9dbced329c441fa79d80472764b1a2c7e57123553b8519b36663a2fb234ed" +checksum = "11f486f1ea21e6c10ed15d5a7c77165d0ee443402f0780849d1768e7d9d6fe77" dependencies = [ "pest", "pest_generator", @@ -4824,9 +4791,9 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.8.5" +version = "2.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3bb96d5051a78f44f43c8f712d8e810adb0ebf923fc9ed2655a7f66f63ba8ee5" +checksum = "8040c4647b13b210a963c1ed407c1ff4fdfa01c31d6d2a098218702e6664f94f" dependencies = [ "pest", "pest_meta", @@ -4837,9 +4804,9 @@ dependencies = [ [[package]] name = "pest_meta" -version = "2.8.5" +version = "2.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "602113b5b5e8621770cfd490cfd90b9f84ab29bd2b0e49ad83eb6d186cef2365" +checksum = "89815c69d36021a140146f26659a81d6c2afa33d216d736dd4be5381a7362220" dependencies = [ "pest", "sha2", @@ -4920,18 +4887,18 @@ dependencies = [ [[package]] name = "pin-project" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677f1add503faace112b9f1373e43e9e054bfdd22ff1a63c1bc485eaec6a6a8a" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" dependencies = [ "pin-project-internal", ] [[package]] name = "pin-project-internal" -version = "1.1.10" +version = "1.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6e918e4ff8c4549eb882f14b3a4bc8c8bc93de829416eacf579f1207a8fbf861" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" dependencies = [ "proc-macro2", "quote", @@ -4940,9 +4907,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.16" +version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" +checksum = "a89322df9ebe1c1578d689c92318e070967d1042b512afbe49518723f4e6d5cd" 
[[package]] name = "pin-utils" @@ -4956,6 +4923,12 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "plain" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4596b6d070b27117e987119b4dac604f3c58cfb0b191112e24771b2faeac1a6" + [[package]] name = "pollster" version = "0.4.0" @@ -4969,22 +4942,22 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9d1fe60d06143b2430aa532c94cfe9e29783047f06c0d7fd359a9a51b729fa25" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "opaque-debug", "universal-hash", ] [[package]] name = "portable-atomic" -version = "1.13.0" +version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f89776e4d69bb58bc6993e99ffa1d11f228b839984854c7daeb5d37f87cbe950" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" [[package]] name = "portable-atomic-util" -version = "0.2.4" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" dependencies = [ "portable-atomic", ] @@ -5025,11 +4998,11 @@ dependencies = [ [[package]] name = "proc-macro-crate" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "219cb19e96be00ab2e37d6e299658a0cfa83e52429179969b0f0121b4ac46983" +checksum = "e67ba7e9b2b56446f1d419b1d807906278ffa1a658a8a5d8a39dcb1f5a78614f" dependencies = [ - "toml_edit 0.23.10+spec-1.0.0", + "toml_edit 0.25.5+spec-1.1.0", ] [[package]] @@ -5197,9 +5170,9 @@ dependencies = [ [[package]] name = "psm" -version = "0.1.28" +version = "0.1.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"d11f2fedc3b7dafdc2851bc52f277377c5473d378859be234bc7ebb593144d01" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" dependencies = [ "ar_archive_writer", "cc", @@ -5227,9 +5200,9 @@ dependencies = [ [[package]] name = "pulldown-cmark" -version = "0.13.0" +version = "0.13.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e8bbe1a966bd2f362681a44f6edce3c2310ac21e4d5067a6e7ec396297a6ea0" +checksum = "7c3a14896dfa883796f1cb410461aef38810ea05f2b2c33c5aded3649095fdad" dependencies = [ "bitflags", "memchr", @@ -5247,37 +5220,35 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.26.0" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ba0117f4212101ee6544044dae45abe1083d30ce7b29c4b5cbdfa2354e07383" +checksum = "cf85e27e86080aafd5a22eae58a162e133a589551542b3e5cee4beb27e54f8e1" dependencies = [ - "indoc", "libc", - "memoffset", "num-bigint", + "num-traits", "once_cell", "portable-atomic", "pyo3-build-config", "pyo3-ffi", "pyo3-macros", "serde", - "unindent", ] [[package]] name = "pyo3-build-config" -version = "0.26.0" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fc6ddaf24947d12a9aa31ac65431fb1b851b8f4365426e182901eabfb87df5f" +checksum = "8bf94ee265674bf76c09fa430b0e99c26e319c945d96ca0d5a8215f31bf81cf7" dependencies = [ "target-lexicon", ] [[package]] name = "pyo3-ffi" -version = "0.26.0" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "025474d3928738efb38ac36d4744a74a400c901c7596199e20e45d98eb194105" +checksum = "491aa5fc66d8059dd44a75f4580a2962c1862a1c2945359db36f6c2818b748dc" dependencies = [ "libc", "pyo3-build-config", @@ -5285,9 +5256,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.26.0" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"2e64eb489f22fe1c95911b77c44cc41e7c19f3082fc81cce90f657cdc42ffded" +checksum = "f5d671734e9d7a43449f8480f8b38115df67bef8d21f76837fa75ee7aaa5e52e" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -5297,9 +5268,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.26.0" +version = "0.28.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "100246c0ecf400b475341b8455a9213344569af29a3c841d29270e53102e0fcf" +checksum = "22faaa1ce6c430a1f71658760497291065e6450d7b5dc2bcf254d49f66ee700a" dependencies = [ "heck", "proc-macro2", @@ -5316,9 +5287,9 @@ checksum = "5a651516ddc9168ebd67b24afd085a718be02f8858fe406591b013d101ce2f40" [[package]] name = "quick-xml" -version = "0.38.4" +version = "0.39.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +checksum = "958f21e8e7ceb5a1aa7fa87fab28e7c75976e0bfe7e23ff069e0a260f894067d" dependencies = [ "memchr", "serde", @@ -5336,8 +5307,8 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.36", - "socket2 0.6.2", + "rustls", + "socket2", "thiserror", "tokio", "tracing", @@ -5346,9 +5317,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.13" +version = "0.11.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" dependencies = [ "bytes", "getrandom 0.3.4", @@ -5356,7 +5327,7 @@ dependencies = [ "rand 0.9.2", "ring", "rustc-hash", - "rustls 0.23.36", + "rustls", "rustls-pki-types", "slab", "thiserror", @@ -5374,16 +5345,16 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.2", + "socket2", "tracing", "windows-sys 0.60.2", ] [[package]] name = "quote" -version = "1.0.44" +version = "1.0.45" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" dependencies = [ "proc-macro2", ] @@ -5394,6 +5365,12 @@ version = "5.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + [[package]] name = "radium" version = "0.7.0" @@ -5422,6 +5399,17 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" +dependencies = [ + "chacha20", + "getrandom 0.4.2", + "rand_core 0.10.0", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -5442,6 +5430,16 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_chacha" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e6af7f3e25ded52c41df4e0b1af2d047e45896c2f3281792ed68a1c243daedb" +dependencies = [ + "ppv-lite86", + "rand_core 0.10.0", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -5461,14 +5459,20 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "rand_core" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c8d0fd677905edcbeedbf2edb6494d676f0e98d54d5cf9bda0b061cb8fb8aba" + [[package]] name = "rand_distr" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" +checksum = "4d431c2703ccf129de4d45253c03f49ebb22b97d6ad79ee3ecfc7e3f4862c1d8" dependencies = [ "num-traits", - "rand 0.9.2", + "rand 0.10.0", ] 
[[package]] @@ -5513,9 +5517,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.7.0" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f3fe0889e69e2ae9e41f4d6c4c0181701d00e4697b356fb1f74173a5e0ee27" +checksum = "6ce70a74e890531977d37e532c34d45e9055d2409ed08ddba14529471ed0be16" dependencies = [ "bitflags", ] @@ -5565,9 +5569,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5276caf25ac86c8d810222b3dbb938e512c55c6831a10f3e6ed1c93b84041f1c" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" dependencies = [ "aho-corasick", "memchr", @@ -5576,15 +5580,15 @@ dependencies = [ [[package]] name = "regex-lite" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d942b98df5e658f56f20d592c7f868833fe38115e65c33003d8cd224b0155da" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" [[package]] name = "regex-syntax" -version = "0.8.8" +version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2d987857b319362043e95f5353c0535c1f58eec5336fdfcf626430af7def58" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "regress" @@ -5617,12 +5621,12 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", - "h2 0.4.13", + "h2", "http 1.4.0", "http-body 1.0.1", "http-body-util", - "hyper 1.8.1", - "hyper-rustls 0.27.7", + "hyper", + "hyper-rustls", "hyper-util", "js-sys", "log", @@ -5631,7 +5635,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.36", + "rustls", "rustls-native-certs", "rustls-pki-types", "serde", @@ -5639,7 +5643,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", - "tokio-rustls 0.26.4", + "tokio-rustls", "tokio-util", 
"tower", "tower-http", @@ -5695,16 +5699,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "roaring" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885" -dependencies = [ - "bytemuck", - "byteorder", -] - [[package]] name = "roxmltree" version = "0.21.1" @@ -5716,9 +5710,9 @@ dependencies = [ [[package]] name = "rtrb" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad8388ea1a9e0ea807e442e8263a699e7edcb320ecbcd21b4fa8ff859acce3ba" +checksum = "7204ed6420f698836b76d4d5c2ec5dec7585fd5c3a788fd1cde855d1de598239" [[package]] name = "rust_decimal" @@ -5753,54 +5747,29 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.44" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" -dependencies = [ - "bitflags", - "errno", - "libc", - "linux-raw-sys 0.4.15", - "windows-sys 0.59.0", -] - -[[package]] -name = "rustix" -version = "1.1.3" +version = "1.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "146c9e247ccc180c1f61615433868c99f3de3ae256a30a43b49f67c2d9171f34" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys 0.11.0", + "linux-raw-sys", "windows-sys 0.61.2", ] [[package]] name = "rustls" -version = "0.21.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" -dependencies = [ - "log", - "ring", - "rustls-webpki 0.101.7", - "sct", -] - -[[package]] -name = "rustls" -version = "0.23.36" +version = "0.23.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" +checksum = 
"758025cb5fccfd3bc2fd74708fd4682be41d99e5dff73c377c0646c6012c73a4" dependencies = [ "aws-lc-rs", "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.9", + "rustls-webpki", "subtle", "zeroize", ] @@ -5838,19 +5807,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.101.7" +version = "0.103.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" -dependencies = [ - "ring", - "untrusted", -] - -[[package]] -name = "rustls-webpki" -version = "0.103.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" +checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" dependencies = [ "aws-lc-rs", "ring", @@ -5872,7 +5831,7 @@ checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" [[package]] name = "sail-cache" -version = "0.5.1" +version = "0.5.3" dependencies = [ "chrono", "datafusion", @@ -5885,7 +5844,7 @@ dependencies = [ [[package]] name = "sail-catalog" -version = "0.5.1" +version = "0.5.3" dependencies = [ "async-trait", "datafusion", @@ -5902,7 +5861,7 @@ dependencies = [ [[package]] name = "sail-catalog-glue" -version = "0.5.1" +version = "0.5.3" dependencies = [ "arrow", "async-trait", @@ -5917,7 +5876,7 @@ dependencies = [ [[package]] name = "sail-catalog-iceberg" -version = "0.5.1" +version = "0.5.3" dependencies = [ "arrow", "async-trait", @@ -5938,7 +5897,7 @@ dependencies = [ [[package]] name = "sail-catalog-memory" -version = "0.5.1" +version = "0.5.3" dependencies = [ "async-trait", "dashmap", @@ -5948,7 +5907,7 @@ dependencies = [ [[package]] name = "sail-catalog-onelake" -version = "0.5.1" +version = "0.5.3" dependencies = [ "arrow", "async-trait", @@ -5961,7 +5920,7 @@ dependencies = [ [[package]] name = "sail-catalog-system" -version = "0.5.1" +version = "0.5.3" dependencies = [ "async-trait", 
"datafusion", @@ -5975,7 +5934,7 @@ dependencies = [ [[package]] name = "sail-catalog-unity" -version = "0.5.1" +version = "0.5.3" dependencies = [ "arrow", "async-trait", @@ -6000,13 +5959,13 @@ dependencies = [ [[package]] name = "sail-cli" -version = "0.5.1" +version = "0.5.3" dependencies = [ "clap", "log", "mimalloc", "pyo3", - "rustls 0.23.36", + "rustls", "sail-common", "sail-execution", "sail-session", @@ -6017,7 +5976,7 @@ dependencies = [ [[package]] name = "sail-common" -version = "0.5.1" +version = "0.5.3" dependencies = [ "arrow", "arrow-buffer", @@ -6034,11 +5993,12 @@ dependencies = [ "serde_yaml", "thiserror", "tokio", + "toml", ] [[package]] name = "sail-common-datafusion" -version = "0.5.1" +version = "0.5.3" dependencies = [ "async-trait", "chrono", @@ -6069,7 +6029,7 @@ dependencies = [ [[package]] name = "sail-data-source" -version = "0.5.1" +version = "0.5.3" dependencies = [ "arrow", "arrow-pyarrow", @@ -6109,7 +6069,7 @@ dependencies = [ [[package]] name = "sail-delta-lake" -version = "0.5.1" +version = "0.5.3" dependencies = [ "arrow-schema", "async-trait", @@ -6119,13 +6079,14 @@ dependencies = [ "datafusion-common", "datafusion-functions-nested", "datafusion-physical-expr", - "delta_kernel", "educe", "futures", "indexmap 2.13.0", "itertools", "log", + "moka", "object_store", + "once_cell", "parquet", "percent-encoding", "regex", @@ -6144,7 +6105,7 @@ dependencies = [ [[package]] name = "sail-execution" -version = "0.5.1" +version = "0.5.3" dependencies = [ "arrow-flight", "chrono", @@ -6160,8 +6121,9 @@ dependencies = [ "log", "prost", "prost-build", - "rand 0.9.2", + "rand 0.10.0", "readonly", + "sail-catalog", "sail-catalog-system", "sail-common", "sail-common-datafusion", @@ -6187,7 +6149,7 @@ dependencies = [ [[package]] name = "sail-function" -version = "0.5.1" +version = "0.5.3" dependencies = [ "aes", "aes-gcm", @@ -6208,21 +6170,22 @@ dependencies = [ "num", "ordered-float 5.1.0", "percent-encoding", - "rand 0.9.2", - "rand_chacha 
0.9.0", + "rand 0.10.0", + "rand_chacha 0.10.0", "rand_distr", "regex", "sail-common", "sail-common-datafusion", "sail-sql-analyzer", "serde_json", + "thiserror", "twox-hash", "url", ] [[package]] name = "sail-gold-test" -version = "0.5.1" +version = "0.5.3" dependencies = [ "clap", "lazy_static", @@ -6234,7 +6197,7 @@ dependencies = [ [[package]] name = "sail-iceberg" -version = "0.5.1" +version = "0.5.3" dependencies = [ "apache-avro", "async-trait", @@ -6262,14 +6225,14 @@ dependencies = [ [[package]] name = "sail-logical-optimizer" -version = "0.5.1" +version = "0.5.3" dependencies = [ "datafusion", ] [[package]] name = "sail-logical-plan" -version = "0.5.1" +version = "0.5.3" dependencies = [ "comfy-table", "datafusion", @@ -6284,7 +6247,7 @@ dependencies = [ [[package]] name = "sail-object-store" -version = "0.5.1" +version = "0.5.3" dependencies = [ "async-stream", "async-trait", @@ -6318,7 +6281,7 @@ dependencies = [ [[package]] name = "sail-physical-optimizer" -version = "0.5.1" +version = "0.5.3" dependencies = [ "datafusion", "datafusion-physical-expr", @@ -6328,13 +6291,14 @@ dependencies = [ [[package]] name = "sail-physical-plan" -version = "0.5.1" +version = "0.5.3" dependencies = [ "datafusion", "datafusion-common", "datafusion-expr", "futures", "lazy_static", + "sail-catalog", "sail-common-datafusion", "sail-logical-plan", "tokio-stream", @@ -6342,7 +6306,7 @@ dependencies = [ [[package]] name = "sail-plan" -version = "0.5.1" +version = "0.5.3" dependencies = [ "arrow", "async-recursion", @@ -6362,10 +6326,11 @@ dependencies = [ "lazy_static", "log", "object_store", - "rand 0.9.2", + "rand 0.10.0", "regex", "ryu", "sail-catalog", + "sail-catalog-memory", "sail-common", "sail-common-datafusion", "sail-function", @@ -6383,7 +6348,7 @@ dependencies = [ [[package]] name = "sail-plan-lakehouse" -version = "0.5.1" +version = "0.5.3" dependencies = [ "async-trait", "datafusion", @@ -6399,7 +6364,7 @@ dependencies = [ [[package]] name = "sail-python" 
-version = "0.5.1" +version = "0.5.3" dependencies = [ "fastrace", "log", @@ -6413,7 +6378,7 @@ dependencies = [ [[package]] name = "sail-python-udf" -version = "0.5.1" +version = "0.5.3" dependencies = [ "arrow-pyarrow", "datafusion", @@ -6433,7 +6398,7 @@ dependencies = [ [[package]] name = "sail-server" -version = "0.5.1" +version = "0.5.3" dependencies = [ "axum", "fastrace", @@ -6449,7 +6414,7 @@ dependencies = [ [[package]] name = "sail-session" -version = "0.5.1" +version = "0.5.3" dependencies = [ "async-trait", "chrono", @@ -6494,7 +6459,7 @@ dependencies = [ [[package]] name = "sail-spark-connect" -version = "0.5.1" +version = "0.5.3" dependencies = [ "async-stream", "datafusion", @@ -6538,7 +6503,7 @@ dependencies = [ [[package]] name = "sail-sql-analyzer" -version = "0.5.1" +version = "0.5.3" dependencies = [ "chrono", "chumsky", @@ -6552,7 +6517,7 @@ dependencies = [ [[package]] name = "sail-sql-macro" -version = "0.5.1" +version = "0.5.3" dependencies = [ "proc-macro2", "quote", @@ -6561,7 +6526,7 @@ dependencies = [ [[package]] name = "sail-sql-parser" -version = "0.5.1" +version = "0.5.3" dependencies = [ "chumsky", "either", @@ -6578,7 +6543,7 @@ dependencies = [ [[package]] name = "sail-telemetry" -version = "0.5.1" +version = "0.5.3" dependencies = [ "datafusion", "env_logger", @@ -6621,9 +6586,9 @@ dependencies = [ [[package]] name = "schannel" -version = "0.1.28" +version = "0.1.29" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "891d81b926048e76efe18581bf793546b4c0eaf8448d72be8de2bbee5fd166e1" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" dependencies = [ "windows-sys 0.61.2", ] @@ -6656,9 +6621,9 @@ dependencies = [ [[package]] name = "schemars" -version = "1.2.0" +version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54e910108742c57a770f492731f99be216a52fadd361b06c8fb59d74ccc267d2" +checksum = 
"a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" dependencies = [ "dyn-clone", "ref-cast", @@ -6684,16 +6649,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "sct" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" -dependencies = [ - "ring", - "untrusted", -] - [[package]] name = "seahash" version = "4.1.0" @@ -6712,9 +6667,9 @@ dependencies = [ [[package]] name = "security-framework" -version = "3.5.1" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ "bitflags", "core-foundation 0.10.1", @@ -6725,9 +6680,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.15.0" +version = "2.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc1f0cbffaac4852523ce30d8bd3c5cdc873501d96ff467ca09b6767bb8cd5c0" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" dependencies = [ "core-foundation-sys", "libc", @@ -6771,9 +6726,9 @@ dependencies = [ [[package]] name = "serde_arrow" -version = "0.13.7" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "038967a6dda16f5c6ca5b6e1afec9cd2361d39f0db681ca338ac5f0ccece6469" +checksum = "2784e59a0315568e850cb01ddadf458f8c09e28d8cfc4880c2cc08f5dc3444e0" dependencies = [ "arrow-array", "arrow-schema", @@ -6871,9 +6826,9 @@ dependencies = [ [[package]] name = "serde_tokenstream" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"64060d864397305347a78851c51588fd283767e7e7589829e8121d65512340f1" +checksum = "d7c49585c52c01f13c5c2ebb333f14f6885d76daa768d8a037d28017ec538c69" dependencies = [ "proc-macro2", "quote", @@ -6895,9 +6850,9 @@ dependencies = [ [[package]] name = "serde_with" -version = "3.16.1" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fa237f2807440d238e0364a218270b98f767a00d3dada77b1c53ae88940e2e7" +checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" dependencies = [ "base64 0.22.1", "chrono", @@ -6905,7 +6860,7 @@ dependencies = [ "indexmap 1.9.3", "indexmap 2.13.0", "schemars 0.9.0", - "schemars 1.2.0", + "schemars 1.2.1", "serde_core", "serde_json", "serde_with_macros", @@ -6914,11 +6869,11 @@ dependencies = [ [[package]] name = "serde_with_macros" -version = "3.16.1" +version = "3.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52a8e3ca0ca629121f70ab50f95249e5a6f925cc0f6ffe8256c45b728875706c" +checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" dependencies = [ - "darling 0.21.3", + "darling", "proc-macro2", "quote", "syn 2.0.117", @@ -6944,7 +6899,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -6955,7 +6910,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] @@ -6989,15 +6944,15 @@ checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" [[package]] name = "siphasher" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56199f7ddabf13fe5074ce809e7d3f42b42ae711800501b5b16ea82ad029c39d" +checksum = 
"b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" [[package]] name = "slab" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a2ae44ef20feb57a68b23d846850f861394c2e02dc425a50098ae8c90267589" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" [[package]] name = "small_ctor" @@ -7019,39 +6974,30 @@ checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" [[package]] name = "socket2" -version = "0.5.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" -dependencies = [ - "libc", - "windows-sys 0.52.0", -] - -[[package]] -name = "socket2" -version = "0.6.2" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0" +checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] name = "sqlparser" -version = "0.59.0" +version = "0.61.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4591acadbcf52f0af60eafbb2c003232b2b4cd8de5f0e9437cb8b1b59046cc0f" +checksum = "dbf5ea8d4d7c808e1af1cbabebca9a2abe603bcefc22294c5b95018d53200cb7" dependencies = [ "log", + "recursive", "sqlparser_derive", ] [[package]] name = "sqlparser_derive" -version = "0.3.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +checksum = "a6dd45d8fc1c79299bfbb7190e42ccbbdf6a5f52e4a6ad98d92357ea965bd289" dependencies = [ "proc-macro2", "quote", @@ -7066,9 +7012,9 @@ checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" [[package]] name = "stacker" -version = "0.1.22" +version = "0.1.23" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "e1f8b29fb42aafcea4edeeb6b2f2d7ecd0d969c48b4cf0d2e64aafc471dd6e59" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" dependencies = [ "cc", "cfg-if", @@ -7111,9 +7057,6 @@ name = "strum" version = "0.27.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af23d6f6c1a224baef9d3f61e287d2761385a5b88fdab4eb4c6f11aeb54c4bcf" -dependencies = [ - "strum_macros", -] [[package]] name = "strum_macros" @@ -7177,9 +7120,9 @@ dependencies = [ [[package]] name = "system-configuration" -version = "0.6.1" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" dependencies = [ "bitflags", "core-foundation 0.9.4", @@ -7210,20 +7153,20 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "target-lexicon" -version = "0.13.4" +version = "0.13.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1dd07eb858a2067e2f3c7155d54e929265c264e6f37efe3ee7a8d1b5a1dd0ba" +checksum = "adb6935a6f5c20170eeceb1a3835a49e12e19d792f6dd344ccc76a985ca5a6ca" [[package]] name = "tempfile" -version = "3.25.0" +version = "3.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0136791f7c95b1f6dd99f9cc786b91bb81c3800b639b3478e561ddb7be95e5f1" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.4.1", + "getrandom 0.4.2", "once_cell", - "rustix 1.1.3", + "rustix", "windows-sys 0.61.2", ] @@ -7340,9 +7283,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.10.0" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" 
+checksum = "3e61e67053d25a4e82c844e8424039d9745781b3fc4f32b8d55ed50f5f667ef3" dependencies = [ "tinyvec_macros", ] @@ -7355,9 +7298,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.49.0" +version = "1.50.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" +checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" dependencies = [ "bytes", "libc", @@ -7365,39 +7308,29 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2 0.6.2", + "socket2", "tokio-macros", "windows-sys 0.61.2", ] [[package]] name = "tokio-macros" -version = "2.6.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" dependencies = [ "proc-macro2", "quote", "syn 2.0.117", ] -[[package]] -name = "tokio-rustls" -version = "0.24.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" -dependencies = [ - "rustls 0.21.12", - "tokio", -] - [[package]] name = "tokio-rustls" version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ - "rustls 0.23.36", + "rustls", "tokio", ] @@ -7449,9 +7382,9 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "0.7.5+spec-1.1.0" +version = "1.0.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92e1cfed4a3038bc5a127e35a2d360f145e1f4b971b551a2ba5fd7aedf7e1347" +checksum = "9b320e741db58cac564e26c607d3cc1fdc4a88fd36c879568c07856ed83ff3e9" dependencies = [ "serde_core", ] @@ -7467,28 +7400,28 @@ 
dependencies = [ "serde_spanned", "toml_datetime 0.6.11", "toml_write", - "winnow", + "winnow 0.7.15", ] [[package]] name = "toml_edit" -version = "0.23.10+spec-1.0.0" +version = "0.25.5+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "84c8b9f757e028cee9fa244aea147aab2a9ec09d5325a9b01e0a49730c2b5269" +checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" dependencies = [ "indexmap 2.13.0", - "toml_datetime 0.7.5+spec-1.1.0", + "toml_datetime 1.0.1+spec-1.1.0", "toml_parser", - "winnow", + "winnow 1.0.0", ] [[package]] name = "toml_parser" -version = "1.0.6+spec-1.1.0" +version = "1.0.10+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3198b4b0a8e11f09dd03e133c0280504d0801269e9afa46362ffde1cbeebf44" +checksum = "7df25b4befd31c4816df190124375d5a20c6b6921e2cad937316de3fccd63420" dependencies = [ - "winnow", + "winnow 1.0.0", ] [[package]] @@ -7508,20 +7441,20 @@ dependencies = [ "base64 0.22.1", "bytes", "flate2", - "h2 0.4.13", + "h2", "http 1.4.0", "http-body 1.0.1", "http-body-util", - "hyper 1.8.1", + "hyper", "hyper-timeout", "hyper-util", "percent-encoding", "pin-project", "rustls-native-certs", - "socket2 0.6.2", + "socket2", "sync_wrapper", "tokio", - "tokio-rustls 0.26.4", + "tokio-rustls", "tokio-stream", "tower", "tower-layer", @@ -7783,9 +7716,9 @@ checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" [[package]] name = "unicode-ident" -version = "1.0.22" +version = "1.0.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9312f7c4f6ff9069b165498234ce8be658059c6728633667c526e27dc2cf1df5" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" [[package]] name = "unicode-segmentation" @@ -7805,12 +7738,6 @@ version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" -[[package]] 
-name = "unindent" -version = "0.2.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7264e107f553ccae879d21fbea1d6724ac785e8c3bfc762137959b5802826ef3" - [[package]] name = "universal-hash" version = "0.5.1" @@ -7835,25 +7762,24 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "3.1.4" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d39cb1dbab692d82a977c0392ffac19e188bd9186a9f32806f0aaa859d75585a" +checksum = "dea7109cdcd5864d4eeb1b58a1648dc9bf520360d7af16ec26d0a9354bafcfc0" dependencies = [ "base64 0.22.1", "log", "percent-encoding", - "rustls 0.23.36", + "rustls", "rustls-pki-types", "ureq-proto", - "utf-8", - "webpki-roots", + "utf8-zero", ] [[package]] name = "ureq-proto" -version = "0.5.3" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d81f9efa9df032be5934a46a068815a10a042b494b6a58cb0a1a97bb5467ed6f" +checksum = "e994ba84b0bd1b1b0cf92878b7ef898a5c1760108fe7b6010327e274917a808c" dependencies = [ "base64 0.22.1", "http 1.4.0", @@ -7880,18 +7806,18 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" -[[package]] -name = "utf-8" -version = "0.7.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" - [[package]] name = "utf8-width" version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1292c0d970b54115d14f2492fe0170adf21d68a1de108eebc51c1df4f346a091" +[[package]] +name = "utf8-zero" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8c0a043c9540bae7c578c88f91dda8bd82e59ae27c21baca69c8b191aaf5a6e" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -7906,13 +7832,12 @@ checksum = 
"06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.21.0" +version = "1.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b672338555252d43fd2240c714dc444b8c6fb0a5c5335e65a07bba7742735ddb" +checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" dependencies = [ - "getrandom 0.4.1", + "getrandom 0.4.2", "js-sys", - "rand 0.9.2", "serde_core", "wasm-bindgen", ] @@ -7980,9 +7905,9 @@ checksum = "b8dad83b4f25e74f184f64c43b150b91efe7647395b42289f38e50566d82855b" [[package]] name = "wasm-bindgen" -version = "0.2.108" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" +checksum = "6532f9a5c1ece3798cb1c2cfdba640b9b3ba884f5db45973a6f442510a87d38e" dependencies = [ "cfg-if", "once_cell", @@ -7993,9 +7918,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.58" +version = "0.4.64" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" +checksum = "e9c5522b3a28661442748e09d40924dfb9ca614b21c00d3fd135720e48b67db8" dependencies = [ "cfg-if", "futures-util", @@ -8007,9 +7932,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.108" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" +checksum = "18a2d50fcf105fb33bb15f00e7a77b772945a2ee45dcf454961fd843e74c18e6" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -8017,9 +7942,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.108" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" +checksum = 
"03ce4caeaac547cdf713d280eda22a730824dd11e6b8c3ca9e42247b25c631e3" dependencies = [ "bumpalo", "proc-macro2", @@ -8030,9 +7955,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.108" +version = "0.2.114" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +checksum = "75a326b8c223ee17883a4251907455a2431acc2791c98c26279376490c378c16" dependencies = [ "unicode-ident", ] @@ -8086,9 +8011,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.85" +version = "0.3.91" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" +checksum = "854ba17bb104abfb26ba36da9729addc7ce7f06f5c0f90f3c391f8461cca21f9" dependencies = [ "js-sys", "wasm-bindgen", @@ -8106,9 +8031,9 @@ dependencies = [ [[package]] name = "webpki-roots" -version = "1.0.5" +version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12bed680863276c63889429bfd6cab3b99943659923822de1c8a39c49e4d722c" +checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" dependencies = [ "rustls-pki-types", ] @@ -8392,9 +8317,18 @@ checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" [[package]] name = "winnow" -version = "0.7.14" +version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a5364e9d77fcdeeaa6062ced926ee3381faa2ee02d3eb83a5c27a8825540829" +checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" +dependencies = [ + "memchr", +] + +[[package]] +name = "winnow" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" dependencies = [ "memchr", ] @@ -8411,7 +8345,7 @@ dependencies = [ "futures", "http 1.4.0", "http-body-util", - "hyper 1.8.1", + "hyper", 
"hyper-util", "log", "once_cell", @@ -8532,7 +8466,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" dependencies = [ "libc", - "rustix 1.1.3", + "rustix", ] [[package]] @@ -8570,26 +8504,20 @@ dependencies = [ "synstructure", ] -[[package]] -name = "z85" -version = "3.0.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b3a41ce106832b4da1c065baa4c31cf640cf965fa1483816402b7f6b96f0a64" - [[package]] name = "zerocopy" -version = "0.8.33" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "668f5168d10b9ee831de31933dc111a459c97ec93225beb307aed970d1372dfd" +checksum = "efbb2a062be311f2ba113ce66f697a4dc589f85e78a4aea276200804cea0ed87" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.8.33" +version = "0.8.47" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c7962b26b0a8685668b671ee4b54d007a67d4eaf05fda79ac0ecf41e32270f1" +checksum = "0e8bc7269b54418e7aeeef514aa68f8690b8c0489a06b0136e5f57c4c5ccab89" dependencies = [ "proc-macro2", "quote", @@ -8658,15 +8586,15 @@ dependencies = [ [[package]] name = "zlib-rs" -version = "0.5.5" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" +checksum = "3be3d40e40a133f9c916ee3f9f4fa2d9d63435b5fbe1bfc6d9dae0aa0ada1513" [[package]] name = "zmij" -version = "1.0.16" +version = "1.0.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index b037acd0a8..319f0511a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,14 +5,14 @@ members = [ resolver = "2" 
[workspace.package] -version = "0.5.1" +version = "0.5.3" authors = ["LakeSail "] edition = "2021" homepage = "https://lakesail.com" license = "Apache-2.0" readme = "README.md" repository = "https://github.com/lakehq/sail" -rust-version = "1.88.0" +rust-version = "1.91.0" [workspace.lints.clippy] allow_attributes = "deny" @@ -24,14 +24,14 @@ todo = "deny" [workspace.dependencies] thiserror = { version = "2.0.18" } -tokio = { version = "1.49.0", features = ["full"] } +tokio = { version = "1.50.0", features = ["full"] } tokio-stream = { version = "0.1.18", features = ["time", "io-util"] } pbjson = "0.9.0" pbjson-types = "0.9.0" pbjson-build = "0.9.0" educe = "0.6" tower = { version = "0.5.3", features = ["full"] } -uuid = { version = "1.21.0", features = ["serde", "v4"] } +uuid = { version = "1.22.0", features = ["serde", "v4"] } async-trait = "0.1.89" async-recursion = "1.1.1" async-stream = "0.3.6" @@ -41,17 +41,17 @@ serde = { version = "1.0.228", features = ["derive"] } serde_json = "1.0.149" # TODO: `serde_yaml` is deprecated. No satisfactory alternative exists yet, we may need to build our own in the future. 
serde_yaml = "0.9.34" -serde_with = { version = "3.16.1", default-features = false, features = ["base64", "std", "macros"] } +serde_with = { version = "3.18.0", default-features = false, features = ["base64", "std", "macros"] } monostate = "1.0.2" regex = "1.12.3" glob = "0.3.3" chrono = { version = "0.4.44", features = ["serde"] } chrono-tz = "0.10.4" futures = "0.3.32" -comfy-table = "=7.1.4" +comfy-table = "=7.2.2" html-escape = "0.2.13" syn = "2.0.117" -quote = "1.0.43" +quote = "1.0.45" paste = "1.0.15" proc-macro2 = "1.0.105" prettyplease = "0.2.37" @@ -60,19 +60,19 @@ ryu = "1.0.23" either = "1.15.0" num-bigint = "0.4.6" mimalloc = { version = "0.1.48", default-features = false } -rand = "0.9.2" -rand_chacha = "0.9.0" -rand_distr = "0.5.1" +rand = "0.10.0" +rand_chacha = "0.10.0" +rand_distr = "0.6.0" url = "2.5.8" lexical-core = { version = "1.0.6", default-features = false, features = ["write-integers", "write-floats", "parse-integers", "parse-floats"] } -aws-config = "1.8.14" +aws-config = "1.8.15" aws-credential-types = "1.2.13" -aws-sdk-glue = "1.139.0" +aws-sdk-glue = { version = "1.142.0", default-features = false, features = ["default-https-client", "rt-tokio"] } aws-smithy-runtime-api = "1.11.3" aws-smithy-types = "1.3.3" aws-smithy-async = "1.2.6" clap = { version = "4.5.60", features = ["derive"] } -num_enum = "0.7.4" +num_enum = "0.7.6" num-traits = "0.2.19" log = "0.4.29" env_logger = "0.11.9" @@ -84,8 +84,8 @@ opentelemetry_sdk = { version = "0.31.0", features = ["testing"] } opentelemetry-otlp = { version = "0.31.0", features = ["tls", "tls-roots", "grpc-tonic"] } opentelemetry-appender-log = "0.31.0" figment = { version = "0.10.19", features = ["toml", "env"] } -kube = "2.0.1" -k8s-openapi = { version = "0.26.1", features = ["latest"] } +kube = "3.0.1" +k8s-openapi = { version = "0.27.1", features = ["latest"] } twox-hash = "2.1.2" murmur3 = "0.5.2" aes = "0.8.4" @@ -102,13 +102,13 @@ num = "0.4.3" hf-hub = { version = "0.4.3", 
default-features = false, features = ["tokio"] } reqwest = { version = "0.12.27", default-features = false, features = ["rustls-tls", "json", "multipart", "stream"] } percent-encoding = "2.3.2" -rustls = "0.23.36" +rustls = "0.23.37" dashmap = "6.1.0" itertools = "0.14.0" -moka = { version = "0.12.13", features = ["sync"] } +moka = { version = "0.12.14", features = ["sync", "future"] } bytes = "1.11.1" indexmap = "2.13.0" -pin-project-lite = "0.2.16" +pin-project-lite = "0.2.17" ordered-float = { version = "5.1.0", features = ["serde"] } apache-avro = { version = "0.21.0" } rust_decimal = "1.40.0" @@ -117,8 +117,9 @@ testcontainers = "0.26.2" progenitor = "0.11.2" progenitor-client = "0.11.2" secrecy = { version = "0.10.3", features = ["serde"] } -tempfile = "3.25.0" +tempfile = "3.26.0" readonly = "0.2.13" +toml = "0.8" ###### # The versions of the following dependencies are managed manually. @@ -141,7 +142,7 @@ prost-types = "0.14" # The `axum` version must match the one used in `tonic` (replace `RELEASE` with the release we are using): # https://github.com/hyperium/tonic/blob/vRELEASE/tonic/Cargo.toml axum = "0.8.8" -datafusion = { version = "52.1.0", default-features = false, features = [ +datafusion = { version = "53.0.0", default-features = false, features = [ "nested_expressions", "crypto_expressions", "datetime_expressions", @@ -152,39 +153,36 @@ datafusion = { version = "52.1.0", default-features = false, features = [ "compression", "parquet", "serde", + "recursive_protection", ] } -datafusion-common = { version = "52.1.0", features = ["object_store", "avro", "recursive_protection"] } -datafusion-datasource = { version = "52.1.0" } -datafusion-datasource-avro = { version = "52.1.0" } -datafusion-expr = { version = "52.1.0", default-features = false, features = ["recursive_protection"] } -datafusion-expr-common = { version = "52.1.0" } -datafusion-optimizer = { version = "52.1.0", features = ["recursive_protection"] } -datafusion-physical-optimizer = { 
version = "52.1.0", features = ["recursive_protection"] } -datafusion-proto = { version = "52.1.0" } -datafusion-functions = { version = "52.1.0" } -datafusion-functions-nested = { version = "52.1.0", default-features = false } -datafusion-physical-expr = { version = "52.1.0", features = ["recursive_protection"] } -datafusion-session = { version = "52.1.0" } -datafusion-spark = { version = "52.1.0" } +datafusion-common = { version = "53.0.0", features = ["object_store", "avro"] } +datafusion-datasource = { version = "53.0.0" } +datafusion-datasource-avro = { version = "53.0.0" } +datafusion-expr = { version = "53.0.0", default-features = false } +datafusion-expr-common = { version = "53.0.0" } +datafusion-proto = { version = "53.0.0" } +datafusion-functions = { version = "53.0.0" } +datafusion-functions-nested = { version = "53.0.0", default-features = false } +datafusion-physical-expr = { version = "53.0.0" } +datafusion-session = { version = "53.0.0" } +datafusion-spark = { version = "53.0.0", features = ["core"] } # The `pyo3` version must match the one used in `arrow-pyarrow` (replace `RELEASE` with the release we are using): # https://github.com/apache/arrow-rs/blob/RELEASE/arrow-pyarrow/Cargo.toml -pyo3 = { version = "0.26.0", features = ["serde"] } +pyo3 = { version = "0.28.0", features = ["serde"] } # Jiter has a dependency on pyo3, which needs to match the version used in Sail. 
-# https://github.com/pydantic/jiter/blob/v0.11.1/Cargo.toml -jiter = { version = "0.11.1" } -arrow = { version = "57.1.0", features = ["chrono-tz"] } -arrow-buffer = { version = "57.1.0" } -arrow-schema = { version = "57.1.0", features = ["serde"] } -arrow-flight = { version = "57.1.0" } -arrow-pyarrow = { version = "57.1.0" } -parquet = { version = "57.3.0" } -serde_arrow = { version = "0.13.7", features = ["arrow-57"] } +# https://github.com/pydantic/jiter/blob/v0.13.0/Cargo.toml +jiter = { version = "0.13.0" } +arrow = { version = "58.0.0", features = ["chrono-tz"] } +arrow-buffer = { version = "58.0.0" } +arrow-schema = { version = "58.0.0", features = ["serde"] } +arrow-flight = { version = "58.0.0" } +arrow-pyarrow = { version = "58.0.0" } +parquet = { version = "58.0.0" } +serde_arrow = { version = "0.14.0", features = ["arrow-58"] } # The `object_store` version must match the one used in DataFusion. -object_store = { version = "0.12.4", features = ["aws", "gcp", "azure", "http"] } -hdfs-native-object-store = "0.15.0" +object_store = { version = "0.13.2", features = ["aws", "gcp", "azure", "http"] } +hdfs-native-object-store = "0.16.0" -# Lakehouse -delta_kernel = { version = "0.18.2", features = ["arrow-57", "default-engine-rustls", "internal-api"] } ###### # This is the end of the manually managed dependencies. # Do not add more dependencies below. diff --git a/clippy.toml b/clippy.toml index 44a9563e2c..f55f3f43d2 100644 --- a/clippy.toml +++ b/clippy.toml @@ -2,12 +2,6 @@ # since we are developing an application rather than a library. 
avoid-breaking-exported-api = false too-many-arguments-threshold = 8 -# FIXME: On Rust 1.87, tonic and DataFusion trigger: -# - `result_large_err`: https://rust-lang.github.io/rust-clippy/master/index.html#result_large_err -# - `large_enum_variant`: https://rust-lang.github.io/rust-clippy/master/index.html#large_enum_variant -# https://github.com/hyperium/tonic/issues/2253 -# https://github.com/apache/datafusion/issues/16061 -large-error-threshold = 264 disallowed-types = [ { path = "datafusion_catalog::table::TableProviderFactory" }, { path = "datafusion_datasource::file_format::FileFormatFactory" }, diff --git a/compose.yml b/compose.yml index 2b79d6d9d0..3cb79fdfcd 100644 --- a/compose.yml +++ b/compose.yml @@ -115,31 +115,8 @@ services: profiles: - telemetry - postgres: - image: postgres:16-alpine - container_name: sail-postgres-dev - environment: - POSTGRES_USER: testuser - POSTGRES_PASSWORD: testpass - POSTGRES_DB: testdb - ports: - - "5432:5432" - volumes: - - postgres-data:/var/lib/postgresql/data - - ./python/pysail/tests/datasources/jdbc/init.sql:/docker-entrypoint-initdb.d/init.sql - healthcheck: - test: ["CMD-SHELL", "pg_isready -U testuser -d testdb"] - start_period: 10s - interval: 5s - timeout: 5s - retries: 5 - profiles: - - datasources - volumes: minio-data: driver: local azurite-data: driver: local - postgres-data: - driver: local diff --git a/crates/sail-cache/src/file_listing_cache.rs b/crates/sail-cache/src/file_listing_cache.rs index a2419b4874..6ccdf8007f 100644 --- a/crates/sail-cache/src/file_listing_cache.rs +++ b/crates/sail-cache/src/file_listing_cache.rs @@ -4,15 +4,14 @@ use std::sync::Arc; use std::time::Duration; use datafusion::common::{Result as DataFusionResult, TableReference}; -use datafusion::execution::cache::cache_manager::ListFilesCache; +use datafusion::execution::cache::cache_manager::{CachedFileList, ListFilesCache}; use datafusion::execution::cache::{CacheAccessor, ListFilesEntry, TableScopedPath}; use log::debug; use 
moka::sync::Cache; -use object_store::path::Path; use object_store::ObjectMeta; pub struct MokaFileListingCache { - objects: Cache>>, + objects: Cache, ttl: Option, max_entries: Option, } @@ -58,63 +57,17 @@ fn meta_heap_bytes(object_meta: &ObjectMeta) -> usize { size } -impl CacheAccessor>> for MokaFileListingCache { - type Extra = Option; - - fn get(&self, k: &TableScopedPath) -> Option>> { - self.get_with_extra(k, &None) +impl CacheAccessor for MokaFileListingCache { + fn get(&self, k: &TableScopedPath) -> Option { + self.objects.get(k) } - fn get_with_extra( - &self, - k: &TableScopedPath, - prefix: &Self::Extra, - ) -> Option>> { - let objects = self.objects.get(k)?; - - let Some(prefix) = prefix else { - return Some(objects); - }; - - // Build full prefix: table_base/prefix - let table_base = &k.path; - let mut parts: Vec<_> = table_base.parts().collect(); - parts.extend(prefix.parts()); - let full_prefix = Path::from_iter(parts); - let full_prefix_str = full_prefix.as_ref(); - - let filtered = objects - .iter() - .filter(|meta| meta.location.as_ref().starts_with(full_prefix_str)) - .cloned() - .collect::>(); - - if filtered.is_empty() { - None - } else { - Some(Arc::new(filtered)) - } - } - - fn put( - &self, - key: &TableScopedPath, - value: Arc>, - ) -> Option>> { + fn put(&self, key: &TableScopedPath, value: CachedFileList) -> Option { self.objects.insert(key.clone(), value); None } - fn put_with_extra( - &self, - key: &TableScopedPath, - value: Arc>, - _e: &Self::Extra, - ) -> Option>> { - self.put(key, value) - } - - fn remove(&self, k: &TableScopedPath) -> Option>> { + fn remove(&self, k: &TableScopedPath) -> Option { self.objects.remove(k) } @@ -157,14 +110,14 @@ impl ListFilesCache for MokaFileListingCache { fn list_entries(&self) -> HashMap { self.objects .iter() - .map(|(table_scoped_path, metas)| { - let metas = Arc::clone(&metas); + .map(|(table_scoped_path, cached)| { + let metas = Arc::clone(&cached.files); let size_bytes = (metas.capacity() * 
size_of::()) + metas.iter().map(meta_heap_bytes).sum::(); ( (*table_scoped_path).clone(), ListFilesEntry { - metas, + metas: cached.clone(), size_bytes, // moka handles expiration; we don't have per-entry expiration time expires: None, @@ -200,7 +153,7 @@ mod tests { #[test] fn test_file_listing_cache() { let meta = ObjectMeta { - location: Path::from("test"), + location: object_store::path::Path::from("test"), last_modified: DateTime::parse_from_rfc3339("2022-09-27T22:36:00+02:00") .unwrap() .into(), @@ -216,9 +169,9 @@ mod tests { }; assert!(cache.get(&key).is_none()); - cache.put(&key, vec![meta.clone()].into()); + cache.put(&key, CachedFileList::new(vec![meta.clone()])); assert_eq!( - cache.get(&key).unwrap().first().unwrap().clone(), + cache.get(&key).unwrap().files.first().unwrap().clone(), meta.clone() ); } diff --git a/crates/sail-cache/src/file_metadata_cache.rs b/crates/sail-cache/src/file_metadata_cache.rs index edb88bad1e..a1059b01b7 100644 --- a/crates/sail-cache/src/file_metadata_cache.rs +++ b/crates/sail-cache/src/file_metadata_cache.rs @@ -1,20 +1,18 @@ use std::collections::HashMap; -use std::sync::Arc; use std::time::Duration; use datafusion::execution::cache::cache_manager::{ - FileMetadata, FileMetadataCache, FileMetadataCacheEntry, + CachedFileMetadataEntry, FileMetadataCache, FileMetadataCacheEntry, }; use datafusion::execution::cache::CacheAccessor; use log::debug; use moka::policy::EvictionPolicy; use moka::sync::Cache; use object_store::path::Path; -use object_store::ObjectMeta; pub struct MokaFileMetadataCache { size_limit: Option, - metadata: Cache)>, + metadata: Cache, } impl MokaFileMetadataCache { @@ -34,11 +32,9 @@ impl MokaFileMetadataCache { Self::NAME ); builder = builder - .weigher( - |_key: &Path, (_, meta): &(ObjectMeta, Arc)| -> u32 { - meta.memory_size() as u32 - }, - ) + .weigher(|_key: &Path, entry: &CachedFileMetadataEntry| -> u32 { + entry.file_metadata.memory_size() as u32 + }) .max_capacity(size_limit); } else { 
debug!("No size limit set for {}", Self::NAME); @@ -65,15 +61,15 @@ impl FileMetadataCache for MokaFileMetadataCache { fn list_entries(&self) -> HashMap { self.metadata .iter() - .map(|(path, (object_meta, meta))| { + .map(|(path, entry)| { ( path.as_ref().clone(), FileMetadataCacheEntry { - object_meta, - size_bytes: meta.memory_size(), + object_meta: entry.meta.clone(), + size_bytes: entry.file_metadata.memory_size(), // TODO: get hits from the cache hits: 0, - extra: meta.extra_info(), + extra: entry.file_metadata.extra_info(), }, ) }) @@ -81,51 +77,22 @@ impl FileMetadataCache for MokaFileMetadataCache { } } -impl CacheAccessor> for MokaFileMetadataCache { - type Extra = ObjectMeta; - - fn get(&self, k: &ObjectMeta) -> Option> { - self.metadata - .get(&k.location) - .and_then(|(extra, metadata)| { - if extra.size == k.size && extra.last_modified == k.last_modified { - Some(Arc::clone(&metadata)) - } else { - None - } - }) - } - - fn get_with_extra(&self, k: &ObjectMeta, _e: &Self::Extra) -> Option> { - self.get(k) +impl CacheAccessor for MokaFileMetadataCache { + fn get(&self, k: &Path) -> Option { + self.metadata.get(k) } - fn put(&self, key: &ObjectMeta, value: Arc) -> Option> { - self.metadata - .insert(key.location.clone(), (key.clone(), value)); + fn put(&self, key: &Path, value: CachedFileMetadataEntry) -> Option { + self.metadata.insert(key.clone(), value); None } - fn put_with_extra( - &self, - key: &ObjectMeta, - value: Arc, - _e: &Self::Extra, - ) -> Option> { - self.put(key, value) + fn remove(&self, k: &Path) -> Option { + self.metadata.remove(k) } - fn remove(&self, k: &ObjectMeta) -> Option> { - self.metadata - .remove(&k.location) - .map(|(_, metadata)| metadata) - } - - fn contains_key(&self, k: &ObjectMeta) -> bool { - self.metadata - .get(&k.location) - .map(|(extra, _)| extra.size == k.size && extra.last_modified == k.last_modified) - .unwrap_or(false) + fn contains_key(&self, k: &Path) -> bool { + self.metadata.contains_key(k) } fn 
len(&self) -> usize { @@ -184,47 +151,36 @@ mod tests { version: None, }; - let metadata: Arc = Arc::new(TestFileMetadata { + let file_metadata: Arc = Arc::new(TestFileMetadata { metadata: "retrieved_metadata".to_owned(), }); + let entry = CachedFileMetadataEntry::new(object_meta.clone(), Arc::clone(&file_metadata)); let cache = MokaFileMetadataCache::new(None, None); - assert!(cache.get(&object_meta).is_none()); + assert!(cache.get(&object_meta.location).is_none()); // put - cache.put(&object_meta, metadata); + cache.put(&object_meta.location, entry.clone()); // get and contains of a valid entry - assert!(cache.contains_key(&object_meta)); - let value = cache.get(&object_meta); + assert!(cache.contains_key(&object_meta.location)); + let value = cache.get(&object_meta.location); assert!(value.is_some()); - let test_file_metadata = Arc::downcast::(value.unwrap()); - assert!(test_file_metadata.is_ok()); - assert_eq!(test_file_metadata.unwrap().metadata, "retrieved_metadata"); - - // file size changed - let mut object_meta2 = object_meta.clone(); - object_meta2.size = 2048; - assert!(cache.get(&object_meta2).is_none()); - assert!(!cache.contains_key(&object_meta2)); - - // file last_modified changed - let mut object_meta2 = object_meta.clone(); - object_meta2.last_modified = DateTime::parse_from_rfc3339("2025-07-29T13:13:13+00:00") - .unwrap() - .into(); - assert!(cache.get(&object_meta2).is_none()); - assert!(!cache.contains_key(&object_meta2)); + let cached_entry = value.unwrap(); + assert!(cached_entry.is_valid_for(&object_meta)); + let test_meta = Arc::downcast::(cached_entry.file_metadata); + assert!(test_meta.is_ok()); + assert_eq!(test_meta.unwrap().metadata, "retrieved_metadata"); // different file let mut object_meta2 = object_meta.clone(); object_meta2.location = Path::from("test2"); - assert!(cache.get(&object_meta2).is_none()); - assert!(!cache.contains_key(&object_meta2)); + assert!(cache.get(&object_meta2.location).is_none()); + 
assert!(!cache.contains_key(&object_meta2.location)); // remove - cache.remove(&object_meta); - assert!(cache.get(&object_meta).is_none()); - assert!(!cache.contains_key(&object_meta)); + cache.remove(&object_meta.location); + assert!(cache.get(&object_meta.location).is_none()); + assert!(!cache.contains_key(&object_meta.location)); } } diff --git a/crates/sail-cache/src/file_statistics_cache.rs b/crates/sail-cache/src/file_statistics_cache.rs index 12333f7e80..1410005b0f 100644 --- a/crates/sail-cache/src/file_statistics_cache.rs +++ b/crates/sail-cache/src/file_statistics_cache.rs @@ -1,17 +1,16 @@ use std::collections::HashMap; -use std::sync::Arc; use std::time::Duration; -use datafusion::common::Statistics; -use datafusion::execution::cache::cache_manager::{FileStatisticsCache, FileStatisticsCacheEntry}; +use datafusion::execution::cache::cache_manager::{ + CachedFileMetadata, FileStatisticsCache, FileStatisticsCacheEntry, +}; use datafusion::execution::cache::CacheAccessor; -use log::{debug, error}; +use log::debug; use moka::sync::Cache; use object_store::path::Path; -use object_store::ObjectMeta; pub struct MokaFileStatisticsCache { - statistics: Cache)>, + statistics: Cache, } impl MokaFileStatisticsCache { @@ -39,42 +38,18 @@ impl MokaFileStatisticsCache { } } -impl CacheAccessor> for MokaFileStatisticsCache { - type Extra = ObjectMeta; - - fn get(&self, k: &Path) -> Option> { - self.statistics - .get(k) - .map(|(_saved_meta, statistics)| statistics) - } - - fn get_with_extra(&self, k: &Path, e: &Self::Extra) -> Option> { - self.statistics.get(k).and_then(|(saved_meta, statistics)| { - if saved_meta.size == e.size && saved_meta.last_modified == e.last_modified { - Some(Arc::clone(&statistics)) - } else { - None - } - }) +impl CacheAccessor for MokaFileStatisticsCache { + fn get(&self, k: &Path) -> Option { + self.statistics.get(k) } - fn put(&self, _key: &Path, _value: Arc) -> Option> { - error!("Put cache in {} without Extra is not supported", 
Self::NAME); + fn put(&self, key: &Path, value: CachedFileMetadata) -> Option { + self.statistics.insert(key.clone(), value); None } - fn put_with_extra( - &self, - key: &Path, - value: Arc, - e: &Self::Extra, - ) -> Option> { - self.statistics.insert(key.clone(), (e.clone(), value)); - None - } - - fn remove(&self, k: &Path) -> Option> { - self.statistics.remove(k).map(|(_, statistics)| statistics) + fn remove(&self, k: &Path) -> Option { + self.statistics.remove(k) } fn contains_key(&self, k: &Path) -> bool { @@ -88,6 +63,7 @@ impl CacheAccessor> for MokaFileStatisticsCache { fn clear(&self) { self.statistics.invalidate_all(); } + fn name(&self) -> String { Self::NAME.to_string() } @@ -97,15 +73,16 @@ impl FileStatisticsCache for MokaFileStatisticsCache { fn list_entries(&self) -> HashMap { self.statistics .iter() - .map(|(path, (object_meta, stats))| { + .map(|(path, cached)| { ( path.as_ref().clone(), FileStatisticsCacheEntry { - object_meta, - num_rows: stats.num_rows, - num_columns: stats.column_statistics.len(), - table_size_bytes: stats.total_byte_size, + object_meta: cached.meta.clone(), + num_rows: cached.statistics.num_rows, + num_columns: cached.statistics.column_statistics.len(), + table_size_bytes: cached.statistics.total_byte_size, statistics_size_bytes: 0, // TODO: set to the real size in the future + has_ordering: cached.ordering.is_some(), }, ) }) @@ -116,8 +93,11 @@ impl FileStatisticsCache for MokaFileStatisticsCache { #[expect(clippy::unwrap_used)] #[cfg(test)] mod tests { + use std::sync::Arc; + use chrono::DateTime; use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; + use datafusion::common::Statistics; use object_store::path::Path; use object_store::ObjectMeta; @@ -135,35 +115,30 @@ mod tests { version: None, }; let cache = MokaFileStatisticsCache::new(None, None); - assert!(cache.get_with_extra(&meta.location, &meta).is_none()); - - cache.put_with_extra( - &meta.location, - 
Statistics::new_unknown(&Schema::new(vec![Field::new( - "test_column", - DataType::Timestamp(TimeUnit::Second, None), - false, - )])) - .into(), - &meta, - ); - assert!(cache.get_with_extra(&meta.location, &meta).is_some()); + assert!(cache.get(&meta.location).is_none()); + + let stats = Arc::new(Statistics::new_unknown(&Schema::new(vec![Field::new( + "test_column", + DataType::Timestamp(TimeUnit::Second, None), + false, + )]))); + let cached = CachedFileMetadata::new(meta.clone(), Arc::clone(&stats), None); + cache.put(&meta.location, cached); + let cached = cache.get(&meta.location); + assert!(cached.is_some()); + assert!(cached.unwrap().is_valid_for(&meta)); // file size changed let mut meta2 = meta.clone(); meta2.size = 2048; - assert!(cache.get_with_extra(&meta2.location, &meta2).is_none()); - - // file last_modified changed - let mut meta2 = meta.clone(); - meta2.last_modified = DateTime::parse_from_rfc3339("2022-09-27T22:40:00+02:00") - .unwrap() - .into(); - assert!(cache.get_with_extra(&meta2.location, &meta2).is_none()); + assert!(!cache + .get(&meta2.location) + .map(|c| c.is_valid_for(&meta2)) + .unwrap_or(false)); // different file let mut meta2 = meta; meta2.location = Path::from("test2"); - assert!(cache.get_with_extra(&meta2.location, &meta2).is_none()); + assert!(cache.get(&meta2.location).is_none()); } } diff --git a/crates/sail-catalog-glue/src/format.rs b/crates/sail-catalog-glue/src/format.rs index 90842992e4..58ba79492b 100644 --- a/crates/sail-catalog-glue/src/format.rs +++ b/crates/sail-catalog-glue/src/format.rs @@ -1,3 +1,5 @@ +use std::collections::HashMap; + /// Storage format information for Glue tables. 
/// /// Maps user-friendly format names to Hive/Hadoop Java class names for InputFormat, @@ -89,4 +91,13 @@ impl GlueStorageFormat { _ => "unknown".to_string(), } } + + pub fn detect_iceberg_format(properties: Option<&HashMap>) -> Option { + if let Some(properties) = properties { + if properties.get("table_type").is_some_and(|x| x == "iceberg") { + return Some("iceberg".to_string()); + } + } + None + } } diff --git a/crates/sail-catalog-glue/src/hive.rs b/crates/sail-catalog-glue/src/hive.rs index 348b4f6414..e9a46fceac 100644 --- a/crates/sail-catalog-glue/src/hive.rs +++ b/crates/sail-catalog-glue/src/hive.rs @@ -32,7 +32,7 @@ pub(crate) async fn create_hive_table( table: &str, options: CreateTableOptions, ) -> CatalogResult { - let db_name = database.to_string(); + let database_name = GlueCatalogProvider::database_name(database)?; let ValidatedHiveOptions { columns, @@ -61,7 +61,7 @@ pub(crate) async fn create_hive_table( let result = client .create_table() - .database_name(&db_name) + .database_name(&database_name) .table_input(table_input) .send() .await; diff --git a/crates/sail-catalog-glue/src/iceberg.rs b/crates/sail-catalog-glue/src/iceberg.rs index c85d0e10f9..8fd5026ebb 100644 --- a/crates/sail-catalog-glue/src/iceberg.rs +++ b/crates/sail-catalog-glue/src/iceberg.rs @@ -37,7 +37,7 @@ pub(crate) async fn create_iceberg_table( table: &str, options: CreateTableOptions, ) -> CatalogResult { - let db_name = database.to_string(); + let database_name = GlueCatalogProvider::database_name(database)?; let ValidatedIcebergOptions { columns, @@ -87,7 +87,7 @@ pub(crate) async fn create_iceberg_table( let result = client .create_table() - .database_name(&db_name) + .database_name(&database_name) .name(table) .open_table_format_input(open_format_input) .send() diff --git a/crates/sail-catalog-glue/src/provider.rs b/crates/sail-catalog-glue/src/provider.rs index a964bfe544..fbb3896efb 100644 --- a/crates/sail-catalog-glue/src/provider.rs +++ 
b/crates/sail-catalog-glue/src/provider.rs @@ -11,7 +11,10 @@ use sail_catalog::provider::{ CatalogProvider, CreateDatabaseOptions, CreateTableOptions, CreateViewColumnOptions, CreateViewOptions, DropDatabaseOptions, DropTableOptions, DropViewOptions, Namespace, }; -use sail_common_datafusion::catalog::{DatabaseStatus, TableColumnStatus, TableKind, TableStatus}; +use sail_catalog::utils::quote_namespace_if_needed; +use sail_common_datafusion::catalog::{ + identity_partition_fields, DatabaseStatus, TableColumnStatus, TableKind, TableStatus, +}; use tokio::sync::OnceCell; use crate::data_type::{arrow_to_glue_type, glue_type_to_arrow}; @@ -43,7 +46,7 @@ impl GlueCatalogProvider { } } - pub(crate) async fn get_client(&self) -> CatalogResult<&Client> { + pub(super) async fn get_client(&self) -> CatalogResult<&Client> { self.client .get_or_try_init(|| async { let mut config_loader = aws_config::defaults(BehaviorVersion::latest()); @@ -62,6 +65,17 @@ impl GlueCatalogProvider { .await } + pub(super) fn database_name(database: &Namespace) -> CatalogResult { + if database.tail.is_empty() { + Ok(database.head.to_string()) + } else { + Err(CatalogError::InvalidArgument(format!( + "Glue catalog does not support multi-level database names: {}", + quote_namespace_if_needed(database) + ))) + } + } + fn database_to_status( &self, db: &aws_sdk_glue::types::Database, @@ -95,11 +109,12 @@ impl GlueCatalogProvider { // Extract location let location = storage.and_then(|sd| sd.location()).map(|s| s.to_string()); - // Detect format from serde info + // Detect format from serde info and table parameters let format = storage .and_then(|sd| sd.serde_info()) .and_then(|si| si.serialization_library()) .map(|lib| GlueStorageFormat::detect_format_from_serde(Some(lib))) + .or_else(|| GlueStorageFormat::detect_iceberg_format(table.parameters())) .unwrap_or_else(|| "unknown".to_string()); // Extract columns from storage descriptor @@ -167,7 +182,7 @@ impl GlueCatalogProvider { constraints: vec![], 
location, format, - partition_by: partition_keys, + partition_by: identity_partition_fields(&partition_keys), sort_by: vec![], bucket_by: None, options: vec![], @@ -308,7 +323,7 @@ impl CatalogProvider for GlueCatalogProvider { options: CreateDatabaseOptions, ) -> CatalogResult { let client = self.get_client().await?; - let db_name = database.to_string(); + let database_name = Self::database_name(database)?; let CreateDatabaseOptions { if_not_exists, @@ -323,25 +338,25 @@ impl CatalogProvider for GlueCatalogProvider { Some(properties.into_iter().collect()) }; - let mut db_input = aws_sdk_glue::types::DatabaseInput::builder().name(&db_name); + let mut database_input = aws_sdk_glue::types::DatabaseInput::builder().name(&database_name); if let Some(desc) = &comment { - db_input = db_input.description(desc); + database_input = database_input.description(desc); } if let Some(loc) = &location { - db_input = db_input.location_uri(loc); + database_input = database_input.location_uri(loc); } if let Some(params) = parameters { - db_input = db_input.set_parameters(Some(params)); + database_input = database_input.set_parameters(Some(params)); } - let db_input = db_input.build().map_err(|e| { + let database_input = database_input.build().map_err(|e| { CatalogError::InvalidArgument(format!("Failed to build database input: {e}")) })?; let result = client .create_database() - .database_input(db_input) + .database_input(database_input) .send() .await; @@ -353,7 +368,7 @@ impl CatalogProvider for GlueCatalogProvider { if if_not_exists { self.get_database(database).await } else { - Err(CatalogError::AlreadyExists("database", db_name)) + Err(CatalogError::AlreadyExists("database", database_name)) } } else { Err(CatalogError::External(format!( @@ -366,9 +381,9 @@ impl CatalogProvider for GlueCatalogProvider { async fn get_database(&self, database: &Namespace) -> CatalogResult { let client = self.get_client().await?; - let db_name = database.to_string(); + let database_name = 
Self::database_name(database)?; - let result = client.get_database().name(&db_name).send().await; + let result = client.get_database().name(&database_name).send().await; match result { Ok(output) => { @@ -380,7 +395,7 @@ impl CatalogProvider for GlueCatalogProvider { Err(sdk_err) => { let service_err = sdk_err.into_service_error(); if service_err.is_entity_not_found_exception() { - Err(CatalogError::NotFound("database", db_name)) + Err(CatalogError::NotFound("database", database_name)) } else { Err(CatalogError::External(format!( "Failed to get database: {service_err}" @@ -408,9 +423,9 @@ impl CatalogProvider for GlueCatalogProvider { for db in page.database_list() { let status = self.database_to_status(db)?; if let Some(p) = prefix { - let db_namespace = Namespace::try_from(status.database.clone()) + let database_namespace = Namespace::try_from(status.database.clone()) .map_err(|e| CatalogError::External(format!("Invalid namespace: {e}")))?; - if !p.is_parent_of(&db_namespace) && p != &db_namespace { + if !p.is_parent_of(&database_namespace) && p != &database_namespace { continue; } } @@ -427,14 +442,14 @@ impl CatalogProvider for GlueCatalogProvider { options: DropDatabaseOptions, ) -> CatalogResult<()> { let client = self.get_client().await?; - let db_name = database.to_string(); + let database_name = Self::database_name(database)?; let DropDatabaseOptions { if_exists, cascade: _, // Glue requires database to be empty; cascade not directly supported } = options; - let result = client.delete_database().name(&db_name).send().await; + let result = client.delete_database().name(&database_name).send().await; match result { Ok(_) => Ok(()), @@ -444,7 +459,7 @@ impl CatalogProvider for GlueCatalogProvider { if if_exists { Ok(()) } else { - Err(CatalogError::NotFound("database", db_name)) + Err(CatalogError::NotFound("database", database_name)) } } else { Err(CatalogError::External(format!( @@ -459,11 +474,17 @@ impl CatalogProvider for GlueCatalogProvider { &self, 
database: &Namespace, table: &str, - options: CreateTableOptions, + mut options: CreateTableOptions, ) -> CatalogResult { let client = self.get_client().await?; let format_lower = options.format.to_lowercase(); + // Skip location or path options since the location is available in + // the `location` field in `CreateTableOptions`. + options + .options + .retain(|(k, _)| k != "location" && k != "path"); + if format_lower == "iceberg" { iceberg::create_iceberg_table(self, client, database, table, options).await } else { @@ -473,11 +494,11 @@ impl CatalogProvider for GlueCatalogProvider { async fn get_table(&self, database: &Namespace, table: &str) -> CatalogResult { let client = self.get_client().await?; - let db_name = database.to_string(); + let database_name = Self::database_name(database)?; let result = client .get_table() - .database_name(&db_name) + .database_name(&database_name) .name(table) .send() .await; @@ -510,12 +531,12 @@ impl CatalogProvider for GlueCatalogProvider { async fn list_tables(&self, database: &Namespace) -> CatalogResult> { let client = self.get_client().await?; - let db_name = database.to_string(); + let database_name = Self::database_name(database)?; let mut tables = Vec::new(); let mut paginator = client .get_tables() - .database_name(&db_name) + .database_name(&database_name) .into_paginator() .send(); @@ -552,11 +573,11 @@ impl CatalogProvider for GlueCatalogProvider { } let client = self.get_client().await?; - let db_name = database.to_string(); + let database_name = Self::database_name(database)?; let result = client .delete_table() - .database_name(&db_name) + .database_name(&database_name) .name(table) .send() .await; @@ -585,7 +606,7 @@ impl CatalogProvider for GlueCatalogProvider { options: CreateViewOptions, ) -> CatalogResult { let client = self.get_client().await?; - let db_name = database.to_string(); + let database_name = Self::database_name(database)?; let CreateViewOptions { columns, @@ -613,7 +634,7 @@ impl CatalogProvider 
for GlueCatalogProvider { let result = client .create_table() - .database_name(&db_name) + .database_name(&database_name) .table_input(view_input) .send() .await; @@ -639,11 +660,11 @@ impl CatalogProvider for GlueCatalogProvider { async fn get_view(&self, database: &Namespace, view: &str) -> CatalogResult { let client = self.get_client().await?; - let db_name = database.to_string(); + let database_name = Self::database_name(database)?; let result = client .get_table() - .database_name(&db_name) + .database_name(&database_name) .name(view) .send() .await; @@ -676,12 +697,12 @@ impl CatalogProvider for GlueCatalogProvider { async fn list_views(&self, database: &Namespace) -> CatalogResult> { let client = self.get_client().await?; - let db_name = database.to_string(); + let database_name = Self::database_name(database)?; let mut views = Vec::new(); let mut paginator = client .get_tables() - .database_name(&db_name) + .database_name(&database_name) .into_paginator() .send(); @@ -710,11 +731,11 @@ impl CatalogProvider for GlueCatalogProvider { let DropViewOptions { if_exists } = options; let client = self.get_client().await?; - let db_name = database.to_string(); + let database_name = Self::database_name(database)?; let result = client .delete_table() - .database_name(&db_name) + .database_name(&database_name) .name(view) .send() .await; diff --git a/crates/sail-catalog-glue/tests/table_tests.rs b/crates/sail-catalog-glue/tests/table_tests.rs index 7c222d3bb8..dd1d020d7c 100644 --- a/crates/sail-catalog-glue/tests/table_tests.rs +++ b/crates/sail-catalog-glue/tests/table_tests.rs @@ -98,7 +98,13 @@ async fn test_create_table() { assert_eq!(comment, &Some("Product catalog table".to_string())); assert_eq!(location, &Some("s3://bucket/products".to_string())); assert_eq!(format, "parquet"); - assert_eq!(partition_by, &vec!["category".to_string()]); + assert_eq!( + partition_by, + &vec![CatalogPartitionField { + column: "category".to_string(), + transform: None, + }] + ); 
assert!(properties .iter() .any(|(k, v)| k == "owner" && v == "test_user")); diff --git a/crates/sail-catalog-iceberg/src/provider.rs b/crates/sail-catalog-iceberg/src/provider.rs index 08155114b1..e611b17d02 100644 --- a/crates/sail-catalog-iceberg/src/provider.rs +++ b/crates/sail-catalog-iceberg/src/provider.rs @@ -19,11 +19,12 @@ use sail_catalog::provider::{ CreateTableOptions, CreateViewColumnOptions, CreateViewOptions, DropDatabaseOptions, DropTableOptions, DropViewOptions, Namespace, PartitionTransform, }; -use sail_catalog::utils::get_property; +use sail_catalog::utils::{get_property, quote_name_if_needed, quote_namespace_if_needed}; use sail_common_datafusion::catalog::{ CatalogTableConstraint, CatalogTableSort, DatabaseStatus, TableColumnStatus, TableKind, TableStatus, }; +use sail_iceberg::utils::partition_transform::catalog_partition_field_from_iceberg; use sail_iceberg::{arrow_type_to_iceberg, iceberg_type_to_arrow, NestedField, StructType}; use tokio::sync::OnceCell; @@ -149,6 +150,18 @@ impl IcebergRestCatalogProvider { Ok((client, merged_catalog_config)) } + /// Converts a `Namespace` into a string representation for the REST API URL. + /// The Iceberg REST API separates namespace components with `\x1F`. + fn namespace_string(database: &Namespace) -> String { + // TODO: The separator is actually configurable and we should support it as an option. + let mut result = database.head.to_string(); + for s in &database.tail { + result.push('\x1F'); + result.push_str(s.as_ref()); + } + result + } + /// Converts an Iceberg REST API table load result into a catalog `TableStatus`. 
fn load_table_result_to_status( &self, @@ -205,9 +218,30 @@ impl IcebergRestCatalogProvider { }) .unwrap_or_default(); - let partition_by: Vec = default_partition_spec - .map(|spec| spec.fields.iter().map(|f| f.name.clone()).collect()) - .unwrap_or_default(); + let partition_by = match (current_schema, default_partition_spec) { + (Some(schema), Some(spec)) => spec + .fields + .iter() + .map(|field| { + let source_column = schema + .fields + .iter() + .find(|f| f.id == field.source_id) + .ok_or_else(|| { + CatalogError::External(format!( + "Partition field source id {} not found in schema", + field.source_id + )) + })? + .name + .clone(); + let transform = field.transform.parse().map_err(CatalogError::External)?; + catalog_partition_field_from_iceberg(source_column, transform) + .map_err(CatalogError::External) + }) + .collect::>>()?, + _ => Vec::new(), + }; let columns = if let Some(schema) = current_schema { let mut cols = Vec::new(); @@ -562,12 +596,12 @@ impl CatalogProvider for IcebergRestCatalogProvider { async fn get_database(&self, database: &Namespace) -> CatalogResult { let (client, catalog_config) = self.load_client_and_merged_config().await?; - let namespace_str = database.to_string(); + let namespace = Self::namespace_string(database); let result = client .catalog_api_api() .load_namespace_metadata( - &namespace_str, + &namespace, catalog_config .props .get(REST_CATALOG_PROP_PREFIX) @@ -577,12 +611,18 @@ impl CatalogProvider for IcebergRestCatalogProvider { .map_err(|e| match e { apis::Error::ResponseError(apis::ResponseContent { status, .. 
}) => { if status == 404 { - CatalogError::NotFound("namespace", database.to_string()) + CatalogError::NotFound("namespace", quote_namespace_if_needed(database)) } else { - CatalogError::External(format!("Failed to load namespace {database}: {e}")) + CatalogError::External(format!( + "Failed to load namespace {}: {e}", + quote_namespace_if_needed(database) + )) } } - _ => CatalogError::External(format!("Failed to load namespace {database}: {e}")), + _ => CatalogError::External(format!( + "Failed to load namespace {}: {e}", + quote_namespace_if_needed(database) + )), })?; let comment = result @@ -609,7 +649,7 @@ impl CatalogProvider for IcebergRestCatalogProvider { prefix: Option<&Namespace>, ) -> CatalogResult> { let (client, catalog_config) = self.load_client_and_merged_config().await?; - let parent = prefix.map(|namespace| namespace.to_string()); + let parent = prefix.map(|namespace| Self::namespace_string(namespace)); let result = client .catalog_api_api() @@ -654,7 +694,7 @@ impl CatalogProvider for IcebergRestCatalogProvider { match client .catalog_api_api() .drop_namespace( - &database.to_string(), + &Self::namespace_string(database), catalog_config .props .get(REST_CATALOG_PROP_PREFIX) @@ -776,7 +816,7 @@ impl CatalogProvider for IcebergRestCatalogProvider { let result = client .catalog_api_api() .create_table( - &database.to_string(), + &Self::namespace_string(database), request, None, catalog_config @@ -795,7 +835,7 @@ impl CatalogProvider for IcebergRestCatalogProvider { let result = client .catalog_api_api() .load_table( - &database.to_string(), + &Self::namespace_string(database), table, None, None, @@ -810,11 +850,20 @@ impl CatalogProvider for IcebergRestCatalogProvider { apis::Error::ResponseError(apis::ResponseContent { status, .. 
}) if status == 404 => { - CatalogError::NotFound("table", format!("{database}.{table}")) - } - _ => { - CatalogError::External(format!("Failed to load table {database}.{table}: {e}")) + CatalogError::NotFound( + "table", + format!( + "{}.{}", + quote_namespace_if_needed(database), + quote_name_if_needed(table) + ), + ) } + _ => CatalogError::External(format!( + "Failed to load table {}.{}: {e}", + quote_namespace_if_needed(database), + quote_name_if_needed(table), + )), })?; self.load_table_result_to_status(table, database, result) } @@ -825,7 +874,7 @@ impl CatalogProvider for IcebergRestCatalogProvider { let result = client .catalog_api_api() .list_tables( - &database.to_string(), + &Self::namespace_string(database), None, None, catalog_config @@ -871,7 +920,7 @@ impl CatalogProvider for IcebergRestCatalogProvider { match client .catalog_api_api() .drop_table( - &database.to_string(), + &Self::namespace_string(database), table, Some(purge), catalog_config @@ -996,7 +1045,7 @@ impl CatalogProvider for IcebergRestCatalogProvider { let result = client .catalog_api_api() .create_view( - &database.to_string(), + &Self::namespace_string(database), request, catalog_config .props @@ -1014,7 +1063,7 @@ impl CatalogProvider for IcebergRestCatalogProvider { let result = client .catalog_api_api() .load_view( - &database.to_string(), + &Self::namespace_string(database), view, catalog_config .props @@ -1023,7 +1072,11 @@ impl CatalogProvider for IcebergRestCatalogProvider { ) .await .map_err(|e| { - CatalogError::External(format!("Failed to load view {database}.{view}: {e}")) + CatalogError::External(format!( + "Failed to load view {}.{}: {e}", + quote_namespace_if_needed(database), + quote_name_if_needed(view) + )) })?; self.load_view_result_to_status(view, database, result) } @@ -1034,7 +1087,7 @@ impl CatalogProvider for IcebergRestCatalogProvider { let result = client .catalog_api_api() .list_views( - &database.to_string(), + &Self::namespace_string(database), None, None, 
catalog_config @@ -1074,7 +1127,7 @@ impl CatalogProvider for IcebergRestCatalogProvider { match client .catalog_api_api() .drop_view( - &database.to_string(), + &Self::namespace_string(database), view, catalog_config .props @@ -1858,7 +1911,13 @@ mod tests { assert_eq!(location, Some("s3://bucket/table".to_string())); assert_eq!(format, "iceberg"); - assert_eq!(partition_by, vec!["category".to_string()]); + assert_eq!( + partition_by, + vec![CatalogPartitionField { + column: "category".to_string(), + transform: None, + }] + ); assert_eq!(sort_by.len(), 1); assert_eq!(sort_by[0].column, "id"); diff --git a/crates/sail-catalog-iceberg/tests/rest_integration_test.rs b/crates/sail-catalog-iceberg/tests/rest_integration_test.rs index 8bb880ecff..0dc053abaf 100644 --- a/crates/sail-catalog-iceberg/tests/rest_integration_test.rs +++ b/crates/sail-catalog-iceberg/tests/rest_integration_test.rs @@ -255,7 +255,7 @@ async fn test_get_namespace() { } let get_db = rest_catalog.get_database(&namespace).await.unwrap(); - assert_eq!(get_db.database, vec![namespace.to_string()]); + assert_eq!(get_db.database, Vec::::from(namespace)); for (key, value) in &properties { assert!(get_db .properties @@ -657,7 +657,7 @@ async fn test_create_table() { assert_eq!(properties.len(), 15); assert_eq!(static_properties, expected_properties); assert!(properties.iter().any(|(k, v)| k == "metadata-location" - && v.starts_with("s3://icebergdata/demo/test_create_table.apple.ios/t1/metadata/"))); + && v.starts_with("s3://icebergdata/demo/test_create_table/apple/ios/t1/metadata/"))); assert!(properties .iter() .any(|(k, v)| k == "metadata.last-updated-ms" && !v.is_empty())); @@ -672,10 +672,10 @@ async fn test_create_table() { assert_eq!(constraints, vec![]); assert_eq!( location, - Some("s3://icebergdata/demo/test_create_table.apple.ios/t1".to_string()) + Some("s3://icebergdata/demo/test_create_table/apple/ios/t1".to_string()) ); assert_eq!(format, "iceberg".to_string()); - assert_eq!(partition_by, 
Vec::::new()); + assert_eq!(partition_by, Vec::::new()); assert_eq!(sort_by, vec![]); assert_eq!(bucket_by, None); assert_eq!(options, Vec::<(String, String)>::new()); @@ -834,7 +834,13 @@ async fn test_create_table() { Some("s3://icebergdata/custom/path/meow".to_string()) ); assert_eq!(format, "iceberg".to_string()); - assert_eq!(partition_by, vec!["baz".to_string()]); + assert_eq!( + partition_by, + vec![CatalogPartitionField { + column: "baz".to_string(), + transform: None, + }] + ); assert_eq!(sort_by.len(), 2); assert!(sort_by.contains(&CatalogTableSort { column: "bar".to_string(), @@ -1070,7 +1076,13 @@ async fn test_get_table() { Some("s3://icebergdata/custom/path/meow".to_string()) ); assert_eq!(format, "iceberg".to_string()); - assert_eq!(partition_by, vec!["baz".to_string()]); + assert_eq!( + partition_by, + vec![CatalogPartitionField { + column: "baz".to_string(), + transform: None, + }] + ); assert_eq!(sort_by.len(), 2); assert!(sort_by.contains(&CatalogTableSort { column: "bar".to_string(), @@ -1893,7 +1905,13 @@ async fn test_create_table_partition_identity() { match kind { TableKind::Table { partition_by, .. } => { assert_eq!(partition_by.len(), 1); - assert_eq!(partition_by[0], "id"); + assert_eq!( + partition_by[0], + CatalogPartitionField { + column: "id".to_string(), + transform: None, + } + ); } _ => panic!("Expected Table kind"), } @@ -1932,7 +1950,13 @@ async fn test_create_table_partition_year() { match kind { TableKind::Table { partition_by, .. } => { assert_eq!(partition_by.len(), 1); - assert_eq!(partition_by[0], "ts_year"); + assert_eq!( + partition_by[0], + CatalogPartitionField { + column: "ts".to_string(), + transform: Some(PartitionTransform::Year), + } + ); } _ => panic!("Expected Table kind"), } @@ -1971,7 +1995,13 @@ async fn test_create_table_partition_bucket() { match kind { TableKind::Table { partition_by, .. 
} => { assert_eq!(partition_by.len(), 1); - assert_eq!(partition_by[0], "id_bucket"); + assert_eq!( + partition_by[0], + CatalogPartitionField { + column: "id".to_string(), + transform: Some(PartitionTransform::Bucket(16)), + } + ); } _ => panic!("Expected Table kind"), } @@ -2010,7 +2040,13 @@ async fn test_create_table_partition_truncate() { match kind { TableKind::Table { partition_by, .. } => { assert_eq!(partition_by.len(), 1); - assert_eq!(partition_by[0], "name_trunc"); + assert_eq!( + partition_by[0], + CatalogPartitionField { + column: "name".to_string(), + transform: Some(PartitionTransform::Truncate(10)), + } + ); } _ => panic!("Expected Table kind"), } diff --git a/crates/sail-catalog-memory/src/provider.rs b/crates/sail-catalog-memory/src/provider.rs index 1826b5757b..64a870f14e 100644 --- a/crates/sail-catalog-memory/src/provider.rs +++ b/crates/sail-catalog-memory/src/provider.rs @@ -7,6 +7,7 @@ use sail_catalog::provider::{ CreateViewColumnOptions, CreateViewOptions, DropDatabaseOptions, DropTableOptions, DropViewOptions, Namespace, }; +use sail_catalog::utils::quote_namespace_if_needed; use sail_common_datafusion::catalog::{DatabaseStatus, TableColumnStatus, TableKind, TableStatus}; struct MemoryDatabase { @@ -71,7 +72,7 @@ impl CatalogProvider for MemoryCatalogProvider { } else { Err(CatalogError::AlreadyExists( "database", - database.to_string(), + quote_namespace_if_needed(database), )) } } @@ -98,7 +99,10 @@ impl CatalogProvider for MemoryCatalogProvider { if let Some(db) = self.databases.get(database) { Ok(db.status.clone()) } else { - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } } @@ -133,7 +137,10 @@ impl CatalogProvider for MemoryCatalogProvider { if if_exists { Ok(()) } else { - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } } else 
{ Ok(()) @@ -160,15 +167,16 @@ impl CatalogProvider for MemoryCatalogProvider { options, properties, } = options; - if partition_by.iter().any(|f| f.transform.is_some()) { + if !format.eq_ignore_ascii_case("iceberg") + && partition_by.iter().any(|f| f.transform.is_some()) + { return Err(CatalogError::NotSupported( "partition transforms are not supported by memory catalog".to_string(), )); } - let mut db = self - .databases - .get_mut(database) - .ok_or_else(|| CatalogError::NotFound("database", database.to_string()))?; + let mut db = self.databases.get_mut(database).ok_or_else(|| { + CatalogError::NotFound("database", quote_namespace_if_needed(database)) + })?; if let Some(status) = db.tables.get(table) { if if_not_exists { return Ok(status.clone()); @@ -218,7 +226,7 @@ impl CatalogProvider for MemoryCatalogProvider { constraints, location, format, - partition_by: partition_by.into_iter().map(|f| f.column).collect(), + partition_by, sort_by, bucket_by, options, @@ -242,7 +250,10 @@ impl CatalogProvider for MemoryCatalogProvider { if let Some(db) = self.databases.get(database) { Ok(db.tables.values().cloned().collect()) } else { - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } } @@ -268,7 +279,10 @@ impl CatalogProvider for MemoryCatalogProvider { } else if if_exists { Ok(()) } else { - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } } @@ -286,10 +300,9 @@ impl CatalogProvider for MemoryCatalogProvider { comment, properties, } = options; - let mut db = self - .databases - .get_mut(database) - .ok_or_else(|| CatalogError::NotFound("database", database.to_string()))?; + let mut db = self.databases.get_mut(database).ok_or_else(|| { + CatalogError::NotFound("database", quote_namespace_if_needed(database)) + })?; if let Some(status) = db.views.get(view) { if 
if_not_exists { return Ok(status.clone()); @@ -349,7 +362,10 @@ impl CatalogProvider for MemoryCatalogProvider { if let Some(db) = self.databases.get(database) { Ok(db.views.values().cloned().collect()) } else { - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } } @@ -369,7 +385,10 @@ impl CatalogProvider for MemoryCatalogProvider { } else if if_exists { Ok(()) } else { - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } } } diff --git a/crates/sail-catalog-onelake/src/provider.rs b/crates/sail-catalog-onelake/src/provider.rs index 9cc53017ef..444065ab0c 100644 --- a/crates/sail-catalog-onelake/src/provider.rs +++ b/crates/sail-catalog-onelake/src/provider.rs @@ -6,6 +6,7 @@ use sail_catalog::provider::{ CatalogProvider, CreateDatabaseOptions, CreateTableOptions, CreateViewOptions, DropDatabaseOptions, DropTableOptions, DropViewOptions, Namespace, }; +use sail_catalog::utils::quote_namespace_if_needed; use sail_common_datafusion::catalog::{DatabaseStatus, TableColumnStatus, TableKind, TableStatus}; use serde::Deserialize; use tokio::sync::OnceCell; @@ -159,6 +160,17 @@ impl OneLakeCatalogProvider { ) } + fn schema_name(database: &Namespace) -> CatalogResult { + if database.tail.is_empty() { + Ok(database.head.to_string()) + } else { + Err(CatalogError::InvalidArgument(format!( + "OneLake catalog does not support multi-level namespaces: {}", + quote_namespace_if_needed(database) + ))) + } + } + fn catalog_name(&self) -> String { format!("{}.{}", self.config.item_name, self.config.item_type) } @@ -327,7 +339,7 @@ impl CatalogProvider for OneLakeCatalogProvider { } async fn get_database(&self, database: &Namespace) -> CatalogResult { - let schema_name = database.head_to_string(); + let schema_name = Self::schema_name(database)?; let client = self.get_client().await?; 
// OneLake API requires full qualified schema name: catalog.schema @@ -421,7 +433,7 @@ impl CatalogProvider for OneLakeCatalogProvider { } async fn get_table(&self, database: &Namespace, table: &str) -> CatalogResult { - let schema_name = database.head_to_string(); + let schema_name = Self::schema_name(database)?; let client = self.get_client().await?; let full_table_name = format!("{}.{}.{}", self.catalog_name(), schema_name, table); @@ -457,7 +469,7 @@ impl CatalogProvider for OneLakeCatalogProvider { } async fn list_tables(&self, database: &Namespace) -> CatalogResult> { - let schema_name = database.head_to_string(); + let schema_name = Self::schema_name(database)?; let client = self.get_client().await?; let url = format!( diff --git a/crates/sail-catalog-system/src/physical_plan.rs b/crates/sail-catalog-system/src/physical_plan.rs index 033d15d8d9..f924991ffb 100644 --- a/crates/sail-catalog-system/src/physical_plan.rs +++ b/crates/sail-catalog-system/src/physical_plan.rs @@ -21,7 +21,7 @@ pub struct SystemTableExec { projection: Option>, filters: Vec>, fetch: Option, - properties: PlanProperties, + properties: Arc, } impl SystemTableExec { @@ -36,12 +36,12 @@ impl SystemTableExec { } else { table.schema() }; - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(1), EmissionType::Final, Boundedness::Bounded, - ); + )); Ok(Self { table, projection, @@ -90,7 +90,7 @@ impl ExecutionPlan for SystemTableExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-catalog-system/src/provider.rs b/crates/sail-catalog-system/src/provider.rs index d1d335caf9..d4011437cc 100644 --- a/crates/sail-catalog-system/src/provider.rs +++ b/crates/sail-catalog-system/src/provider.rs @@ -7,6 +7,7 @@ use sail_catalog::provider::{ CatalogProvider, CreateDatabaseOptions, CreateTableOptions, 
CreateViewOptions, DropDatabaseOptions, DropTableOptions, DropViewOptions, Namespace, }; +use sail_catalog::utils::quote_namespace_if_needed; use sail_common_datafusion::catalog::{DatabaseStatus, TableColumnStatus, TableKind, TableStatus}; use sail_common_datafusion::system::catalog::{SystemCatalog, SystemDatabase, SystemTable}; @@ -100,7 +101,10 @@ impl CatalogProvider for SystemCatalogProvider { return Self::get_database_status(database, &db); } } - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } async fn list_databases( @@ -154,7 +158,10 @@ impl CatalogProvider for SystemCatalogProvider { return Err(CatalogError::NotFound("table", table.to_string())); } } - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } async fn list_tables(&self, database: &Namespace) -> CatalogResult> { @@ -169,7 +176,10 @@ impl CatalogProvider for SystemCatalogProvider { return Ok(result); } } - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } async fn drop_table( @@ -199,7 +209,10 @@ impl CatalogProvider for SystemCatalogProvider { if tail.is_empty() && SystemDatabase::get(head).is_some() { return Err(CatalogError::NotFound("view", view.to_string())); } - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } async fn list_views(&self, database: &Namespace) -> CatalogResult> { @@ -207,7 +220,10 @@ impl CatalogProvider for SystemCatalogProvider { if tail.is_empty() && SystemDatabase::get(head).is_some() { return Ok(vec![]); } - Err(CatalogError::NotFound("database", database.to_string())) + Err(CatalogError::NotFound( + "database", + quote_namespace_if_needed(database), + )) } async fn 
drop_view( diff --git a/crates/sail-catalog-system/src/service.rs b/crates/sail-catalog-system/src/service.rs index d1a020b7ce..d847371ac0 100644 --- a/crates/sail-catalog-system/src/service.rs +++ b/crates/sail-catalog-system/src/service.rs @@ -91,6 +91,21 @@ impl SystemTableService { ) .await? } + SystemTable::Options => { + let key = filters + .extract("key")? + .unwrap_or_else(Predicates::always_true); + filters.finalize()?; + self.observe_system_manager( + |tx| SessionManagerObserver::Options { + key, + fetch, + result: tx, + }, + schema, + ) + .await? + } SystemTable::Sessions => { let session_id = filters .extract("session_id")? diff --git a/crates/sail-catalog-system/src/table_source.rs b/crates/sail-catalog-system/src/table_source.rs index ea76aacbf4..988441b03e 100644 --- a/crates/sail-catalog-system/src/table_source.rs +++ b/crates/sail-catalog-system/src/table_source.rs @@ -40,6 +40,7 @@ impl TableSource for SystemTableSource { SystemTable::Jobs | SystemTable::Stages | SystemTable::Tasks => { &["session_id", "job_id"] } + SystemTable::Options => &["key"], SystemTable::Sessions => &["session_id"], SystemTable::Workers => &["session_id", "worker_id"], }; diff --git a/crates/sail-catalog-unity/src/provider.rs b/crates/sail-catalog-unity/src/provider.rs index bd5a311a32..762fae35c9 100644 --- a/crates/sail-catalog-unity/src/provider.rs +++ b/crates/sail-catalog-unity/src/provider.rs @@ -20,8 +20,10 @@ use sail_catalog::provider::{ CatalogProvider, CreateDatabaseOptions, CreateTableOptions, CreateViewOptions, DropDatabaseOptions, DropTableOptions, DropViewOptions, Namespace, }; -use sail_catalog::utils::{get_property, quote_name_if_needed}; -use sail_common_datafusion::catalog::{DatabaseStatus, TableColumnStatus, TableKind, TableStatus}; +use sail_catalog::utils::{get_property, quote_namespace_if_needed}; +use sail_common_datafusion::catalog::{ + identity_partition_fields, DatabaseStatus, TableColumnStatus, TableKind, TableStatus, +}; use 
secrecy::SecretString; use tokio::sync::OnceCell; @@ -113,35 +115,32 @@ impl UnityCatalogProvider { Ok(client) } - fn get_catalog_and_schema_name(&self, namespace: &Namespace) -> (String, String) { - match namespace.tail.len() { - 0 => ( + fn get_catalog_and_schema_name( + &self, + namespace: &Namespace, + ) -> CatalogResult<(String, String)> { + match namespace.tail.as_slice() { + [] => Ok(( self.catalog_config.default_catalog.to_string(), - namespace.head_to_string(), - ), - _ => (namespace.head_to_string(), namespace.tail_to_string()), + namespace.head.to_string(), + )), + [x] => Ok((namespace.head.to_string(), x.to_string())), + _ => Err(CatalogError::InvalidArgument(format!( + "Unity Catalog does not support multi-level schema name: {}", + quote_namespace_if_needed(namespace) + ))), } } - fn get_full_schema_name(&self, namespace: &Namespace) -> String { - match namespace.tail.len() { - 0 => { - format!( - "{}.{}", - self.catalog_config.default_catalog, - namespace.head_to_string() - ) - } - _ => namespace.to_string(), - } + // Unity Catalog does not quote names in full names even if they contain special characters, + // so we only concatenate the names with dot ('.'). 
+ + fn get_full_schema_name(catalog_name: &str, schema_name: &str) -> String { + format!("{catalog_name}.{schema_name}") } - fn get_full_table_name(&self, database: &Namespace, table: &str) -> String { - format!( - "{}.{}", - self.get_full_schema_name(database), - quote_name_if_needed(table) - ) + fn get_full_table_name(catalog_name: &str, schema_name: &str, table_name: &str) -> String { + format!("{catalog_name}.{schema_name}.{table_name}") } fn schema_info_to_database_status( @@ -192,7 +191,7 @@ impl UnityCatalogProvider { } DatabaseStatus { - catalog: catalog_name, + catalog: self.name.clone(), database, comment: schema_info.comment, location: get_property(&properties, "location"), @@ -319,7 +318,7 @@ impl UnityCatalogProvider { let properties: Vec<_> = properties.into_iter().collect(); Ok(TableStatus { - catalog: Some(catalog), + catalog: Some(self.name.clone()), database, name, kind: TableKind::Table { @@ -328,7 +327,7 @@ impl UnityCatalogProvider { constraints: vec![], location: storage_location, format, - partition_by, + partition_by: identity_partition_fields(&partition_by), sort_by: vec![], bucket_by: None, options, @@ -369,7 +368,7 @@ impl CatalogProvider for UnityCatalogProvider { props.insert("location".to_string(), l); } - let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database); + let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database)?; let request = types::CreateSchema::builder() .catalog_name(&catalog_name) @@ -409,8 +408,9 @@ impl CatalogProvider for UnityCatalogProvider { .await .map_err(|e| CatalogError::External(format!("Failed to load config: {e}")))?; - let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database); - let full_name = self.get_full_schema_name(database); + let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database)?; + let full_name = Self::get_full_schema_name(&catalog_name, &schema_name); + let result = 
client.get_schema().full_name(&full_name).send().await; match result { @@ -440,9 +440,16 @@ impl CatalogProvider for UnityCatalogProvider { .await .map_err(|e| CatalogError::External(format!("Failed to load config: {e}")))?; - let catalog_name = prefix - .map(|namespace| namespace.to_string()) - .unwrap_or(self.catalog_config.default_catalog.to_string()); + let catalog_name = match prefix { + None => self.catalog_config.default_catalog.to_string(), + Some(Namespace { head, tail }) if tail.is_empty() => head.to_string(), + Some(x) => { + return Err(CatalogError::InvalidArgument(format!( + "invalid prefix: {}", + quote_namespace_if_needed(x) + ))) + } + }; let result = client .list_schemas() .catalog_name(&catalog_name) @@ -478,11 +485,12 @@ impl CatalogProvider for UnityCatalogProvider { .await .map_err(|e| CatalogError::External(format!("Failed to load config: {e}")))?; - let full_name = self.get_full_schema_name(database); + let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database)?; + let full_name = Self::get_full_schema_name(&catalog_name, &schema_name); let result = client .delete_schema() - .full_name(&full_name) + .full_name(full_name) .force(cascade) .send() .await; @@ -562,7 +570,7 @@ impl CatalogProvider for UnityCatalogProvider { .await .map_err(|e| CatalogError::External(format!("Failed to load client: {e}")))?; - let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database); + let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database)?; let data_source_format = types::DataSourceFormat::from_str(&format.trim().to_uppercase()) .map_err(|e| { @@ -672,8 +680,8 @@ impl CatalogProvider for UnityCatalogProvider { .await .map_err(|e| CatalogError::External(format!("Failed to load config: {e}")))?; - let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database); - let full_name = format!("{catalog_name}.{schema_name}.{table}"); + let (catalog_name, schema_name) = 
self.get_catalog_and_schema_name(database)?; + let full_name = Self::get_full_table_name(&catalog_name, &schema_name, table); let result = client.get_table().full_name(&full_name).send().await; @@ -697,7 +705,7 @@ impl CatalogProvider for UnityCatalogProvider { .await .map_err(|e| CatalogError::External(format!("Failed to load config: {e}")))?; - let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database); + let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database)?; let result = client .list_tables() @@ -742,7 +750,8 @@ impl CatalogProvider for UnityCatalogProvider { .await .map_err(|e| CatalogError::External(format!("Failed to load config: {e}")))?; - let full_name = self.get_full_table_name(database, table); + let (catalog_name, schema_name) = self.get_catalog_and_schema_name(database)?; + let full_name = Self::get_full_table_name(&catalog_name, &schema_name, table); let result = client.delete_table().full_name(&full_name).send().await; diff --git a/crates/sail-catalog-unity/tests/rest_integration_test.rs b/crates/sail-catalog-unity/tests/rest_integration_test.rs index b0a592b7f3..87ae0cd823 100644 --- a/crates/sail-catalog-unity/tests/rest_integration_test.rs +++ b/crates/sail-catalog-unity/tests/rest_integration_test.rs @@ -197,7 +197,7 @@ async fn test_create_schema() { .iter() .any(|(k, v)| k == "created_at" && !v.is_empty())); - assert_eq!(catalog, DEFAULT_CATALOG.to_string()); + assert_eq!(catalog, "sail".to_string()); assert_eq!(database, Vec::::from(full_namespace.clone())); assert_eq!(comment, Some("test comment".to_string())); assert_eq!(location, Some("s3://bucket/path".to_string())); @@ -667,7 +667,7 @@ async fn test_create_table() { assert_eq!(properties.get("table_type"), Some(&"EXTERNAL".to_string())); assert_eq!(table.name, "t1".to_string()); - assert_eq!(table.catalog, Some("sail_test_catalog".to_string())); + assert_eq!(table.catalog, Some("sail".to_string())); assert_eq!(table.database, 
Vec::::from(full_ns.clone())); assert_eq!(comment, Some("peow".to_string())); assert_eq!(constraints, vec![]); @@ -676,7 +676,7 @@ async fn test_create_table() { Some("s3://deltadata/custom/path/meow".to_string()) ); assert_eq!(format, "delta".to_string()); - assert_eq!(partition_by, Vec::::new()); + assert_eq!(partition_by, Vec::::new()); assert_eq!(sort_by, vec![]); assert_eq!(bucket_by, None); assert_eq!(options, Vec::<(String, String)>::new()); @@ -869,7 +869,7 @@ async fn test_create_table() { }; assert_eq!(table.name, "t2".to_string()); - assert_eq!(table.catalog, Some("sail_test_catalog".to_string())); + assert_eq!(table.catalog, Some("sail".to_string())); assert_eq!(table.database, Vec::::from(full_ns.clone())); assert_eq!(comment, Some("test table".to_string())); assert!(constraints.is_empty()); @@ -878,7 +878,13 @@ async fn test_create_table() { Some("s3://deltadata/custom/path/meow2".to_string()) ); assert_eq!(format, "delta".to_string()); - assert_eq!(partition_by, vec!["baz".to_string()]); + assert_eq!( + partition_by, + vec![CatalogPartitionField { + column: "baz".to_string(), + transform: None, + }] + ); assert!(sort_by.is_empty()); assert_eq!(bucket_by, None); assert_eq!(options, vec![("key1".to_string(), "value1".to_string())]); @@ -1042,7 +1048,7 @@ async fn test_get_table() { assert_eq!(properties.get("team"), Some(&"data-eng".to_string())); assert_eq!(table_ns.name, "t2".to_string()); - assert_eq!(table_ns.catalog, Some("sail_test_catalog".to_string())); + assert_eq!(table_ns.catalog, Some("sail".to_string())); assert_eq!(table_ns.database, Vec::::from(full_ns.clone())); assert_eq!(comment, Some("test table".to_string())); assert!(constraints.is_empty()); @@ -1051,7 +1057,13 @@ async fn test_get_table() { Some("s3://deltadata/custom/path/meow2".to_string()) ); assert_eq!(format, "delta".to_string()); - assert_eq!(partition_by, vec!["baz".to_string()]); + assert_eq!( + partition_by, + vec![CatalogPartitionField { + column: "baz".to_string(), + 
transform: None, + }] + ); assert!(sort_by.is_empty()); assert_eq!(bucket_by, None); assert_eq!(options, vec![("key1".to_string(), "value1".to_string())]); @@ -1191,7 +1203,7 @@ async fn test_list_tables() { let TableKind::Table { format, .. } = &table.kind else { panic!("Expected TableKind::Table"); }; - assert_eq!(table.catalog, Some("sail_test_catalog".to_string())); + assert_eq!(table.catalog, Some("sail".to_string())); assert_eq!(table.database, Vec::::from(full_ns.clone())); assert_eq!(format, "delta"); } diff --git a/crates/sail-catalog/src/command.rs b/crates/sail-catalog/src/command.rs index bb38821e8e..07bddbf6bd 100644 --- a/crates/sail-catalog/src/command.rs +++ b/crates/sail-catalog/src/command.rs @@ -1,13 +1,12 @@ use datafusion::arrow::array::RecordBatch; use datafusion::arrow::datatypes::SchemaRef; -use datafusion::prelude::SessionContext; -use datafusion_expr::ScalarUDF; use sail_common_datafusion::array::serde::ArrowSerializer; use sail_common_datafusion::extension::SessionExtensionAccessor; use sail_common_datafusion::session::plan::PlanService; use serde::{Deserialize, Serialize}; use crate::error::{CatalogError, CatalogResult}; +use crate::manager::tracker::{CatalogFunctionId, CatalogLogicalPlanId}; use crate::manager::CatalogManager; use crate::provider::{ CreateDatabaseOptions, CreateTableOptions, CreateTemporaryViewOptions, CreateViewOptions, @@ -15,7 +14,7 @@ use crate::provider::{ }; use crate::utils::quote_namespace_if_needed; -#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash)] +#[derive(Debug, Clone, Eq, PartialEq, PartialOrd, Hash, Serialize, Deserialize)] pub enum CatalogCommand { CurrentCatalog, SetCurrentCatalog { @@ -87,7 +86,7 @@ pub enum CatalogCommand { is_temporary: bool, }, RegisterFunction { - udf: ScalarUDF, + udf: CatalogFunctionId, }, RegisterTableFunction { name: String, @@ -107,7 +106,7 @@ pub enum CatalogCommand { CreateTemporaryView { view: String, is_global: bool, - options: CreateTemporaryViewOptions, + options: 
CreateTemporaryViewOptions, }, CreateView { view: Vec, @@ -119,7 +118,7 @@ pub enum CatalogCommand { }, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub enum CatalogTableFunction { // We do not support any kind of table functions yet. // PySpark UDTF is registered as a scalar UDF. @@ -159,7 +158,7 @@ impl CatalogCommand { } } - pub fn schema(&self, ctx: &SessionContext) -> CatalogResult { + pub fn schema(&self, ctx: &C) -> CatalogResult { let service = ctx.extension::()?; let display = service.catalog_display(); let schema = match self { @@ -200,9 +199,9 @@ impl CatalogCommand { Ok(schema) } - pub async fn execute( + pub async fn execute( self, - ctx: &SessionContext, + ctx: &C, manager: &CatalogManager, ) -> CatalogResult { // TODO: make sure we return the same schema as Spark for each command @@ -375,23 +374,23 @@ impl CatalogCommand { CatalogCommand::ListFunctions { .. } => { return Err(CatalogError::NotSupported("list functions".to_string())); } - // TODO: `ctx` will not be needed if `CatalogManager` manages functions internally. CatalogCommand::DropFunction { function, if_exists, is_temporary, } => { manager - .deregister_function(ctx, &function, if_exists, is_temporary) + .deregister_function(&function, if_exists, is_temporary) .await?; display.bools().to_record_batch(vec![true])? } CatalogCommand::RegisterFunction { udf } => { - manager.register_function(ctx, udf)?; + let udf = manager.get_tracked_function(udf)?; + manager.register_function(udf)?; display.empty().to_record_batch(vec![])? } CatalogCommand::RegisterTableFunction { name, udtf } => { - manager.register_table_function(ctx, name, udtf)?; + manager.register_table_function(name, udtf)?; display.empty().to_record_batch(vec![])? 
} CatalogCommand::DropTemporaryView { @@ -415,6 +414,15 @@ impl CatalogCommand { is_global, options, } => { + let input = manager.get_tracked_logical_plan(options.input)?; + let options = CreateTemporaryViewOptions { + input, + columns: options.columns, + if_not_exists: options.if_not_exists, + replace: options.replace, + comment: options.comment, + properties: options.properties, + }; if is_global { manager.create_global_temporary_view(&view, options).await?; } else { diff --git a/crates/sail-catalog/src/manager/database.rs b/crates/sail-catalog/src/manager/database.rs index ee54cbf464..08b905b1d9 100644 --- a/crates/sail-catalog/src/manager/database.rs +++ b/crates/sail-catalog/src/manager/database.rs @@ -53,6 +53,17 @@ impl CatalogManager { provider.get_database(&database).await } + /// Gets the database status for a given qualifier. + /// The qualifier is the table name parts except the last part (the table name itself). + /// If the qualifier is empty, the default database is used. 
+ pub async fn get_database_by_qualifier>( + &self, + qualifier: &[T], + ) -> CatalogResult { + let (provider, database) = self.resolve_database_by_qualifier(qualifier)?; + provider.get_database(&database).await + } + pub async fn list_databases>( &self, qualifier: &[T], diff --git a/crates/sail-catalog/src/manager/function.rs b/crates/sail-catalog/src/manager/function.rs index c73787111a..c6852d5d2c 100644 --- a/crates/sail-catalog/src/manager/function.rs +++ b/crates/sail-catalog/src/manager/function.rs @@ -1,8 +1,6 @@ use std::sync::Arc; use datafusion::catalog::TableFunctionImpl; -use datafusion::prelude::SessionContext; -use datafusion_expr::registry::FunctionRegistry; use datafusion_expr::ScalarUDF; use crate::command::CatalogTableFunction; @@ -10,28 +8,35 @@ use crate::error::{CatalogError, CatalogResult}; use crate::manager::CatalogManager; impl CatalogManager { - pub fn register_function(&self, ctx: &SessionContext, udf: ScalarUDF) -> CatalogResult<()> { - ctx.register_udf(udf); + fn canonical_function_name(name: &str) -> Arc { + name.to_ascii_lowercase().into() + } + + pub fn register_function(&self, udf: ScalarUDF) -> CatalogResult<()> { + let mut state = self.state()?; + let name = Self::canonical_function_name(udf.name()); + state.functions.insert(name, udf); Ok(()) } + pub fn get_function>(&self, name: T) -> CatalogResult> { + let state = self.state()?; + let name = Self::canonical_function_name(name.as_ref()); + Ok(state.functions.get(&name).cloned()) + } + pub fn register_table_function( &self, - _ctx: &SessionContext, _name: String, udtf: CatalogTableFunction, ) -> CatalogResult<()> { let _function: Arc = match udtf {}; #[expect(unreachable_code)] - { - _ctx.register_udtf(_name.as_str(), _function); - Ok(()) - } + Ok(()) } pub async fn deregister_function>( &self, - ctx: &SessionContext, function: &[T], if_exists: bool, _is_temporary: bool, @@ -41,17 +46,11 @@ impl CatalogManager { "qualified function name".to_string(), )); }; - let found = ctx - 
.state_ref() - .write() - .deregister_udf(name.as_ref()) - .map_err(|e| CatalogError::Internal(e.to_string()))? - .is_some(); + let mut state = self.state()?; + let name = Self::canonical_function_name(name.as_ref()); + let found = state.functions.remove(&name).is_some(); if !found && !if_exists { - return Err(CatalogError::NotFound( - "function", - name.as_ref().to_string(), - )); + return Err(CatalogError::NotFound("function", name.to_string())); } Ok(()) } diff --git a/crates/sail-catalog/src/manager/mod.rs b/crates/sail-catalog/src/manager/mod.rs index 7c491a156e..91b4440ef5 100644 --- a/crates/sail-catalog/src/manager/mod.rs +++ b/crates/sail-catalog/src/manager/mod.rs @@ -1,9 +1,11 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex, MutexGuard}; +use datafusion_expr::{LogicalPlan, ScalarUDF}; use sail_common_datafusion::extension::SessionExtension; use crate::error::{CatalogError, CatalogResult}; +use crate::manager::tracker::{CatalogFunctionId, CatalogLogicalPlanId, CatalogObjectTracker}; use crate::provider::{CatalogProvider, Namespace}; use crate::temp_view::TemporaryViewManager; @@ -11,6 +13,7 @@ pub mod catalog; pub mod database; pub mod function; pub mod table; +pub mod tracker; pub mod view; /// A manager for all catalogs registered with the session. 
@@ -18,10 +21,12 @@ pub mod view; pub struct CatalogManager { state: Arc>, pub(super) temporary_views: TemporaryViewManager, + pub(super) tracker: CatalogObjectTracker, } pub(super) struct CatalogManagerState { pub(super) catalogs: HashMap, Arc>, + pub(super) functions: HashMap, datafusion_expr::ScalarUDF>, pub(super) default_catalog: Arc, pub(super) default_database: Namespace, pub(super) global_temporary_database: Namespace, @@ -35,7 +40,7 @@ pub struct CatalogManagerOptions { } impl CatalogManager { - pub fn new(options: CatalogManagerOptions) -> CatalogResult { + pub fn try_new(options: CatalogManagerOptions) -> CatalogResult { let catalogs = options .catalogs .into_iter() @@ -52,6 +57,7 @@ impl CatalogManager { // Even if the default database is valid now, it may be dropped externally later. let state = CatalogManagerState { catalogs, + functions: HashMap::new(), default_catalog: options.default_catalog.into(), default_database: options.default_database.try_into()?, global_temporary_database: options.global_temporary_database.try_into()?, @@ -59,6 +65,7 @@ impl CatalogManager { Ok(CatalogManager { state: Arc::new(Mutex::new(state)), temporary_views: Default::default(), + tracker: Default::default(), }) } @@ -86,6 +93,21 @@ impl CatalogManager { Ok((state.get_catalog(&catalog)?, database)) } + pub(super) fn resolve_database_by_qualifier>( + &self, + qualifier: &[T], + ) -> CatalogResult<(Arc, Namespace)> { + let state = self.state()?; + if qualifier.is_empty() { + let catalog = state.default_catalog.clone(); + let database = state.default_database.clone(); + Ok((state.get_catalog(&catalog)?, database)) + } else { + let (catalog, database) = state.resolve_database_reference(qualifier)?; + Ok((state.get_catalog(&catalog)?, database)) + } + } + pub(super) fn resolve_optional_database>( &self, database: &[T], @@ -103,6 +125,28 @@ impl CatalogManager { let (catalog, database, table) = state.resolve_object_reference(object)?; Ok((state.get_catalog(&catalog)?, 
database, table)) } + + pub fn track_function(&self, udf: ScalarUDF) -> CatalogResult { + self.tracker.track_function(udf) + } + + pub fn get_tracked_function(&self, id: CatalogFunctionId) -> CatalogResult { + self.tracker.get_tracked_function(id) + } + + pub fn track_logical_plan( + &self, + plan: Arc, + ) -> CatalogResult { + self.tracker.track_logical_plan(plan) + } + + pub fn get_tracked_logical_plan( + &self, + id: CatalogLogicalPlanId, + ) -> CatalogResult> { + self.tracker.get_tracked_logical_plan(id) + } } impl CatalogManagerState { diff --git a/crates/sail-catalog/src/manager/tracker.rs b/crates/sail-catalog/src/manager/tracker.rs new file mode 100644 index 0000000000..85f32473dd --- /dev/null +++ b/crates/sail-catalog/src/manager/tracker.rs @@ -0,0 +1,78 @@ +use std::collections::HashMap; +use std::sync::{Arc, Mutex, MutexGuard}; + +use datafusion_expr::{LogicalPlan, ScalarUDF}; +use serde::{Deserialize, Serialize}; + +use crate::error::{CatalogError, CatalogResult}; + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] +pub struct CatalogFunctionId(u64); + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] +pub struct CatalogLogicalPlanId(u64); + +#[derive(Default)] +struct CatalogObjectTrackerState { + next_function_id: u64, + next_logical_plan_id: u64, + functions: HashMap, + logical_plans: HashMap>, +} + +/// Tracks in-memory objects (UDFs and logical plans) that cannot be serialized directly, +/// assigning each a unique ID. The ID can then be stored in [`super::super::command::CatalogCommand`] +/// to allow the command itself to be serialized and deserialized, while the actual objects +/// are retrieved from this tracker at execution time. 
+#[derive(Default)] +pub struct CatalogObjectTracker { + state: Mutex, +} + +impl CatalogObjectTracker { + fn state(&self) -> CatalogResult> { + self.state + .lock() + .map_err(|e| CatalogError::Internal(e.to_string())) + } + + pub fn track_function(&self, udf: ScalarUDF) -> CatalogResult { + let mut state = self.state()?; + let id = state.next_function_id; + state.next_function_id += 1; + state.functions.insert(id, udf); + Ok(CatalogFunctionId(id)) + } + + pub fn get_tracked_function(&self, id: CatalogFunctionId) -> CatalogResult { + let state = self.state()?; + state + .functions + .get(&id.0) + .cloned() + .ok_or_else(|| CatalogError::NotFound("function", id.0.to_string())) + } + + pub fn track_logical_plan( + &self, + plan: Arc, + ) -> CatalogResult { + let mut state = self.state()?; + let id = state.next_logical_plan_id; + state.next_logical_plan_id += 1; + state.logical_plans.insert(id, plan); + Ok(CatalogLogicalPlanId(id)) + } + + pub fn get_tracked_logical_plan( + &self, + id: CatalogLogicalPlanId, + ) -> CatalogResult> { + let state = self.state()?; + state + .logical_plans + .get(&id.0) + .cloned() + .ok_or_else(|| CatalogError::NotFound("logical plan", id.0.to_string())) + } +} diff --git a/crates/sail-catalog/src/provider/namespace.rs b/crates/sail-catalog/src/provider/namespace.rs index 6f48c9f2e1..f6eb34ec06 100644 --- a/crates/sail-catalog/src/provider/namespace.rs +++ b/crates/sail-catalog/src/provider/namespace.rs @@ -1,8 +1,6 @@ -use std::fmt; use std::sync::Arc; use crate::error::{CatalogError, CatalogResult}; -use crate::utils::{quote_name_if_needed, quote_namespace_if_needed}; /// A non-empty, multi-level name. /// This is used to refer to a database in the catalog. 
@@ -66,25 +64,7 @@ impl> PartialEq<&[T]> for Namespace { } } -impl fmt::Display for Namespace { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.write_str("e_namespace_if_needed(self)) - } -} - impl Namespace { - pub fn head_to_string(&self) -> String { - quote_name_if_needed(&self.head) - } - - pub fn tail_to_string(&self) -> String { - self.tail - .iter() - .map(|s| quote_name_if_needed(s)) - .collect::>() - .join(".") - } - pub fn is_child_of(&self, other: &Self) -> bool { self.head == other.head && self.tail.len() == other.tail.len() + 1 diff --git a/crates/sail-catalog/src/provider/options.rs b/crates/sail-catalog/src/provider/options.rs index f468858b34..a46a3d1aab 100644 --- a/crates/sail-catalog/src/provider/options.rs +++ b/crates/sail-catalog/src/provider/options.rs @@ -2,12 +2,14 @@ use std::sync::Arc; use datafusion::arrow::datatypes::DataType; use datafusion_expr::LogicalPlan; +pub use sail_common_datafusion::catalog::{CatalogPartitionField, PartitionTransform}; use sail_common_datafusion::catalog::{ CatalogTableBucketBy, CatalogTableConstraint, CatalogTableSort, }; +use serde::{Deserialize, Serialize}; /// Options for creating a database in a catalog. -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct CreateDatabaseOptions { pub comment: Option, pub location: Option, @@ -15,34 +17,13 @@ pub struct CreateDatabaseOptions { pub properties: Vec<(String, String)>, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct DropDatabaseOptions { pub if_exists: bool, pub cascade: bool, } -// TODO: Upstream changes in sail-plan are needed to expose partition transforms to users -// via SQL or DataFrame APIs. 
- -#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, PartialOrd, Default)] -pub enum PartitionTransform { - #[default] - Identity, - Year, - Month, - Day, - Hour, - Bucket(u32), - Truncate(u32), -} - -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] -pub struct CatalogPartitionField { - pub column: String, - pub transform: Option, -} - -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct CreateTableOptions { pub columns: Vec, pub comment: Option, @@ -58,7 +39,7 @@ pub struct CreateTableOptions { pub properties: Vec<(String, String)>, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct CreateTableColumnOptions { pub name: String, pub data_type: DataType, @@ -68,13 +49,13 @@ pub struct CreateTableColumnOptions { pub generated_always_as: Option, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct DropTableOptions { pub if_exists: bool, pub purge: bool, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct CreateViewOptions { pub columns: Vec, pub definition: String, @@ -84,7 +65,7 @@ pub struct CreateViewOptions { pub properties: Vec<(String, String)>, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct CreateViewColumnOptions { pub name: String, pub data_type: DataType, @@ -92,9 +73,9 @@ pub struct CreateViewColumnOptions { pub comment: Option, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] -pub struct CreateTemporaryViewOptions { - pub input: Arc, +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] +pub struct 
CreateTemporaryViewOptions> { + pub input: I, pub columns: Vec, pub if_not_exists: bool, pub replace: bool, @@ -102,17 +83,17 @@ pub struct CreateTemporaryViewOptions { pub properties: Vec<(String, String)>, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct CreateTemporaryViewColumnOptions { pub comment: Option, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct DropViewOptions { pub if_exists: bool, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct DropTemporaryViewOptions { pub if_exists: bool, } diff --git a/crates/sail-cli/src/python.rs b/crates/sail-cli/src/python.rs index 6d7e8733ed..3220aea3fc 100644 --- a/crates/sail-cli/src/python.rs +++ b/crates/sail-cli/src/python.rs @@ -43,6 +43,8 @@ impl Modules { "_sail_cli_spark_shell", include_str!("python/spark_shell.py"), ); + pub const SPARK_RUN: Module<()> = + Module::new("_sail_cli_spark_run", include_str!("python/spark_run.py")); pub const SPARK_MCP_SERVER: Module<()> = Module::new( "_sail_cli_spark_mcp_server", include_str!("python/spark_mcp_server.py"), diff --git a/crates/sail-cli/src/python/spark_run.py b/crates/sail-cli/src/python/spark_run.py new file mode 100644 index 0000000000..b057e3ace2 --- /dev/null +++ b/crates/sail-cli/src/python/spark_run.py @@ -0,0 +1,31 @@ +import sys +from pathlib import Path + +from pyspark.sql import SparkSession + + +def read_script(file: str) -> tuple[str, str]: + if file == "-": + return (sys.stdin.read(), "") + + path = Path(file) + if not path.is_absolute(): + path = Path.cwd() / path + path = path.resolve() + + return (path.read_text(), str(path)) + + +def run_pyspark_script(port: int, file: str): + source, filename = read_script(file) + spark = 
SparkSession.builder.remote(f"sc://localhost:{port}").getOrCreate() + scope = { + "__name__": "__main__", + "__file__": filename, + "__package__": None, + "spark": spark, + } + try: + exec(compile(source, filename, "exec"), scope) # noqa: S102 + finally: + spark.stop() diff --git a/crates/sail-cli/src/python/spark_shell.py b/crates/sail-cli/src/python/spark_shell.py index 73007c96c6..5f98542887 100644 --- a/crates/sail-cli/src/python/spark_shell.py +++ b/crates/sail-cli/src/python/spark_shell.py @@ -9,6 +9,13 @@ def run_pyspark_shell(port: int): spark = SparkSession.builder.remote(f"sc://localhost:{port}").getOrCreate() + try: + _run(f"localhost:{port}", spark) + finally: + spark.stop() + + +def _run(endpoint: str, spark: SparkSession): namespace = {"spark": spark} readline.parse_and_bind("tab: complete") readline.set_completer(Completer(namespace).complete) @@ -24,6 +31,6 @@ def run_pyspark_shell(port: int): /_/ Using Python version {python_version} ({build_number}, {build_date}) -Client connected to the Sail Spark Connect server at localhost:{port} +Client connected to the Sail Spark Connect server at {endpoint} SparkSession available as 'spark'.""" code.interact(local=namespace, banner=banner, exitmsg="") diff --git a/crates/sail-cli/src/runner.rs b/crates/sail-cli/src/runner.rs index 1854795328..66892f6237 100644 --- a/crates/sail-cli/src/runner.rs +++ b/crates/sail-cli/src/runner.rs @@ -1,6 +1,7 @@ use clap::{Parser, Subcommand}; use sail_common::error::CommonError; +use crate::spark::run::run_pyspark_script; use crate::spark::{ run_pyspark_shell, run_spark_connect_server, run_spark_mcp_server, McpSettings, McpTransport, }; @@ -48,6 +49,22 @@ enum SparkCommand { about = "Start the PySpark shell with a Spark Connect server running in the background" )] Shell, + #[command(about = "Run a PySpark script and exit")] + Run { + #[arg( + short = 'f', + long, + help = "The PySpark script file to run, or '-' for stdin", + default_value = "-" + )] + file: String, + 
#[arg( + short = 'C', + long, + help = "The directory to change to before running the script" + )] + directory: Option, + }, #[command(about = "Start the Spark MCP (Model Context Protocol) server")] McpServer { #[arg( @@ -109,6 +126,12 @@ pub fn main(args: Vec) -> Result<(), Box> { // according to the Python multiprocessing resource tracker? run_pyspark_shell() } + SparkCommand::Run { file, directory } => { + if let Some(directory) = directory { + std::env::set_current_dir(directory)?; + } + run_pyspark_script(file) + } SparkCommand::McpServer { host, port, @@ -119,12 +142,14 @@ pub fn main(args: Vec) -> Result<(), Box> { if let Some(directory) = directory { std::env::set_current_dir(directory)?; } - run_spark_mcp_server(McpSettings { - transport, - host, - port, + run_spark_mcp_server( + McpSettings { + transport, + host, + port, + }, spark_remote, - }) + ) } }, } diff --git a/crates/sail-cli/src/spark/mcp_server.rs b/crates/sail-cli/src/spark/mcp_server.rs index 694f0d3db4..2d1f60f909 100644 --- a/crates/sail-cli/src/spark/mcp_server.rs +++ b/crates/sail-cli/src/spark/mcp_server.rs @@ -1,24 +1,17 @@ use std::fmt; use std::fmt::Formatter; -use std::net::Ipv4Addr; +use std::net::{IpAddr, Ipv4Addr}; use std::sync::Arc; use clap::ValueEnum; -use log::info; use pyo3::prelude::PyAnyMethods; use pyo3::{PyResult, Python}; use sail_common::config::AppConfig; -use sail_common::runtime::{RuntimeHandle, RuntimeManager}; -use sail_spark_connect::entrypoint::serve; -use sail_telemetry::telemetry::{init_telemetry, ResourceOptions}; -use tokio::net::TcpListener; +use sail_common::runtime::RuntimeManager; +use tokio::sync::oneshot; use crate::python::Modules; - -async fn shutdown() { - let _ = tokio::signal::ctrl_c().await; - info!("Shutting down the Spark Connect server..."); -} +use crate::spark::server::{telemetry, with_spark_connect_server}; #[derive(Debug, Clone, Copy, ValueEnum)] #[clap(rename_all = "kebab-case")] @@ -41,43 +34,44 @@ pub struct McpSettings { pub transport: 
McpTransport, pub host: String, pub port: u16, - pub spark_remote: Option, -} - -fn run_spark_connect_server( - config: Arc, - runtime: RuntimeHandle, -) -> Result> { - let handle = runtime.clone(); - let (server_port, server_task) = runtime.primary().block_on(async move { - // Listen on only the loopback interface for security. - let listener = TcpListener::bind((Ipv4Addr::new(127, 0, 0, 1), 0)).await?; - let port = listener.local_addr()?.port(); - let task = async move { - info!("Starting the Spark Connect server on port {port}..."); - let _ = serve(listener, shutdown(), config, handle).await; - info!("The Spark Connect server has stopped."); - }; - >>::Ok((port, task)) - })?; - runtime.primary().spawn(server_task); - Ok(format!("sc://127.0.0.1:{server_port}")) } -pub fn run_spark_mcp_server(settings: McpSettings) -> Result<(), Box> { - let config = Arc::new(AppConfig::load()?); - let runtime = RuntimeManager::try_new(&config.runtime)?; +pub fn run_spark_mcp_server( + settings: McpSettings, + spark_remote: Option, +) -> Result<(), Box> { + match spark_remote { + None => { + // We follow the same setup as `run_pyspark_shell`. + // Please refer to the comments in that function for details. 
+ let (tx, rx) = oneshot::channel::<()>(); + let address = (IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 0); + let shutdown = async { + let _ = rx.await; + }; + with_spark_connect_server(address, shutdown, |addr| async move { + let _tx = tx; + _run_mcp_server(settings, format!("sc://127.0.0.1:{}", addr.port())) + }) + } + Some(x) => { + let config = Arc::new(AppConfig::load()?); + let runtime = RuntimeManager::try_new(&config.runtime)?; - runtime.handle().primary().block_on(async { - let resource = ResourceOptions { kind: "server" }; - init_telemetry(&config.telemetry, resource) - })?; + let _telemetry = runtime + .handle() + .primary() + .block_on(async { telemetry::TelemetryGuard::try_new(&config) })?; - let spark_remote = match settings.spark_remote { - None => run_spark_connect_server(Arc::clone(&config), runtime.handle())?, - Some(x) => x, - }; + _run_mcp_server(settings, x.clone()) + } + } +} +fn _run_mcp_server( + settings: McpSettings, + spark_remote: String, +) -> Result<(), Box> { Python::attach(|py| -> PyResult<_> { let _ = Modules::NATIVE_LOGGING.load(py)?; let server = Modules::SPARK_MCP_SERVER.load(py)?; diff --git a/crates/sail-cli/src/spark/mod.rs b/crates/sail-cli/src/spark/mod.rs index 81e48e257c..3f9980d5bf 100644 --- a/crates/sail-cli/src/spark/mod.rs +++ b/crates/sail-cli/src/spark/mod.rs @@ -1,4 +1,5 @@ mod mcp_server; +pub(crate) mod run; mod server; mod shell; diff --git a/crates/sail-cli/src/spark/run.rs b/crates/sail-cli/src/spark/run.rs new file mode 100644 index 0000000000..793eb4df20 --- /dev/null +++ b/crates/sail-cli/src/spark/run.rs @@ -0,0 +1,30 @@ +use std::net::{IpAddr, Ipv4Addr}; + +use pyo3::prelude::PyAnyMethods; +use pyo3::{PyResult, Python}; +use tokio::sync::oneshot; + +use crate::python::Modules; +use crate::spark::server::with_spark_connect_server; + +pub fn run_pyspark_script(file: String) -> Result<(), Box> { + // We follow the same setup as `run_pyspark_shell`. + // Please refer to the comments in that function for details. 
+ let (tx, rx) = oneshot::channel::<()>(); + let address = (IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 0); + let shutdown = async { + let _ = rx.await; + }; + with_spark_connect_server(address, shutdown, |addr| async move { + let _tx = tx; + Python::attach(|py| -> PyResult<_> { + let runner = Modules::SPARK_RUN.load(py)?; + runner + .getattr("run_pyspark_script")? + .call1((addr.port(), file))?; + Ok(()) + })?; + Ok(()) + })?; + Ok(()) +} diff --git a/crates/sail-cli/src/spark/server.rs b/crates/sail-cli/src/spark/server.rs index 9c8f25aa52..c54ac4978c 100644 --- a/crates/sail-cli/src/spark/server.rs +++ b/crates/sail-cli/src/spark/server.rs @@ -1,11 +1,11 @@ -use std::net::IpAddr; +use std::future::Future; +use std::net::{IpAddr, SocketAddr}; use std::sync::Arc; -use log::info; +use log::{error, info}; use sail_common::config::AppConfig; use sail_common::runtime::RuntimeManager; use sail_spark_connect::entrypoint::serve; -use sail_telemetry::telemetry::{init_telemetry, shutdown_telemetry, ResourceOptions}; use tokio::net::TcpListener; /// Handles graceful shutdown by waiting for a `SIGINT` signal in [tokio]. @@ -26,29 +26,100 @@ async fn shutdown() { info!("Shutting down the Spark Connect server..."); } -pub fn run_spark_connect_server(ip: IpAddr, port: u16) -> Result<(), Box> { +pub(super) mod telemetry { + use sail_common::config::AppConfig; + use sail_telemetry::telemetry::{init_telemetry, shutdown_telemetry, ResourceOptions}; + + pub struct TelemetryGuard { + /// A marker to prevent struct creation without calling [`TelemetryGuard::try_new()`]. + _marker: (), + } + + impl TelemetryGuard { + pub fn try_new(config: &AppConfig) -> Result> { + let resource = ResourceOptions { kind: "server" }; + init_telemetry(&config.telemetry, resource)?; + Ok(Self { _marker: () }) + } + } + + impl Drop for TelemetryGuard { + fn drop(&mut self) { + shutdown_telemetry(); + } + } +} + +/// A user-facing error for the Spark Connect server. 
+/// This does not wrap the underlying error but only tracks the error message, +/// so that it can be `Send` from the server task. +#[derive(Debug)] +pub struct ServerError(String); + +impl std::fmt::Display for ServerError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "server error: {}", self.0) + } +} + +impl std::error::Error for ServerError {} + +/// Starts a Spark Connect server and runs the given workload with the server address. +/// This function should be called only once in the entire process since it initializes +/// the telemetry and shuts down the telemetry when the server stops. +pub(super) fn with_spark_connect_server( + address: (IpAddr, u16), + signal: S, + workload: W, +) -> Result<(), Box> +where + S: Future + Send + 'static, + W: FnOnce(SocketAddr) -> F, + F: Future>>, +{ let config = Arc::new(AppConfig::load()?); let runtime = RuntimeManager::try_new(&config.runtime)?; - runtime.handle().primary().block_on(async { - let resource = ResourceOptions { kind: "server" }; - init_telemetry(&config.telemetry, resource) - })?; + let _telemetry = runtime + .handle() + .primary() + .block_on(async { telemetry::TelemetryGuard::try_new(&config) })?; let handle = runtime.handle(); - runtime.handle().primary().block_on(async { + let (server_address, server_task) = runtime.handle().primary().block_on(async { // A secure connection can be handled by a gateway in production. - let listener = TcpListener::bind((ip, port)).await?; - info!( - "Starting the Spark Connect server on {}...", - listener.local_addr()? 
- ); - serve(listener, shutdown(), config, handle).await?; - info!("The Spark Connect server has stopped."); - >>::Ok(()) + let listener = TcpListener::bind(address).await?; + let server_address = listener.local_addr()?; + let server_task = async move { + info!("Starting the Spark Connect server on {server_address}..."); + match serve(listener, signal, config, handle).await { + Ok(()) => { + info!("The Spark Connect server has stopped."); + Ok(()) + } + Err(e) => { + error!("{e}"); + Err(ServerError(e.to_string())) + } + } + }; + >>::Ok((server_address, server_task)) })?; - shutdown_telemetry(); + let server_task = runtime.handle().primary().spawn(server_task); - Ok(()) + runtime.handle().primary().block_on(async move { + let result = workload(server_address).await; + let server_result = server_task.await; + match (result, server_result) { + (Err(e), _) => Err(e), + (Ok(()), Ok(Ok(()))) => Ok(()), + (Ok(()), Ok(Err(e))) => Err(Box::new(e) as Box), + (Ok(()), Err(e)) => Err(Box::new(e) as Box), + } + }) +} + +pub fn run_spark_connect_server(ip: IpAddr, port: u16) -> Result<(), Box> { + with_spark_connect_server((ip, port), shutdown(), |_| async { Ok(()) }) } diff --git a/crates/sail-cli/src/spark/shell.rs b/crates/sail-cli/src/spark/shell.rs index deba92bf5b..2d44adee67 100644 --- a/crates/sail-cli/src/spark/shell.rs +++ b/crates/sail-cli/src/spark/shell.rs @@ -1,43 +1,36 @@ -use std::net::Ipv4Addr; -use std::sync::Arc; +use std::net::{IpAddr, Ipv4Addr}; use pyo3::prelude::PyAnyMethods; use pyo3::{PyResult, Python}; -use sail_common::config::AppConfig; -use sail_common::runtime::RuntimeManager; -use sail_spark_connect::entrypoint::serve; -use tokio::net::TcpListener; use tokio::sync::oneshot; use crate::python::Modules; +use crate::spark::server::with_spark_connect_server; pub fn run_pyspark_shell() -> Result<(), Box> { - let config = Arc::new(AppConfig::load()?); - let runtime = RuntimeManager::try_new(&config.runtime)?; - let (_tx, rx) = oneshot::channel::<()>(); 
- let handle = runtime.handle(); - let (server_port, server_task) = runtime.handle().primary().block_on(async move { - // Listen on only the loopback interface for security. - let listener = TcpListener::bind((Ipv4Addr::new(127, 0, 0, 1), 0)).await?; - let port = listener.local_addr()?.port(); - // We do not capture SIGINT for the server since the user may enter Ctrl+C when - // interacting with the Python interpreter. - // The server will be terminated when the Python interpreter exits. - let shutdown = async { - // Wait on a channel that will never be sent to. - let _ = rx.await; - }; - let task = async { - let _ = serve(listener, shutdown, config, handle).await; - }; - >>::Ok((port, task)) - })?; - runtime.handle().primary().spawn(server_task); - Python::attach(|py| -> PyResult<_> { - let shell = Modules::SPARK_SHELL.load(py)?; - shell - .getattr("run_pyspark_shell")? - .call((server_port,), None)?; + let (tx, rx) = oneshot::channel::<()>(); + // Listen on only the loopback interface for security. + let address = (IpAddr::V4(Ipv4Addr::new(127, 0, 0, 1)), 0); + // We do not capture SIGINT for the server since the user may enter Ctrl+C when + // interacting with the Python interpreter. + // The server will be terminated when the Python interpreter exits. + let shutdown = async { + // The shutdown signal receiver will be notified when `tx` is dropped, + // even if no value is sent through the channel. + let _ = rx.await; + }; + with_spark_connect_server(address, shutdown, |addr| async move { + // Move `tx` to the async block so that it will be dropped after running the PySpark shell, + // which will signal the server to shut down. + // Note: `let _ = tx;` does not work!!! + let _tx = tx; + Python::attach(|py| -> PyResult<_> { + let shell = Modules::SPARK_SHELL.load(py)?; + shell + .getattr("run_pyspark_shell")? 
+ .call((addr.port(),), None)?; + Ok(()) + })?; Ok(()) })?; Ok(()) diff --git a/crates/sail-common-datafusion/data/system/databases.yaml b/crates/sail-common-datafusion/data/system/databases.yaml index cfd9b0169c..776712ac10 100644 --- a/crates/sail-common-datafusion/data/system/databases.yaml +++ b/crates/sail-common-datafusion/data/system/databases.yaml @@ -226,6 +226,23 @@ - name: session description: The database containing user session data. tables: + - name: options + row_name: option + description: The table containing current application configuration options. + columns: + - name: key + description: The configuration key. + rust_type: String + sql_type: STRING + arrow_type: crate::system::types::string() + nullable: false + - name: value + description: The configuration value as a string that can be interpreted as a TOML value. + rust_type: String + sql_type: STRING + arrow_type: crate::system::types::string() + nullable: false + - name: sessions row_name: session description: The table containing user session information. 
diff --git a/crates/sail-common-datafusion/src/array/record_batch.rs b/crates/sail-common-datafusion/src/array/record_batch.rs index d782fbdce9..53944f8cca 100644 --- a/crates/sail-common-datafusion/src/array/record_batch.rs +++ b/crates/sail-common-datafusion/src/array/record_batch.rs @@ -60,6 +60,13 @@ pub fn cast_record_batch_relaxed_tz( batch: &RecordBatch, target: &SchemaRef, ) -> Result { + if target.fields().is_empty() { + return Ok(RecordBatch::try_new_with_options( + target.clone(), + vec![], + &RecordBatchOptions::default().with_row_count(Some(batch.num_rows())), + )?); + } let mut cols: Vec = Vec::with_capacity(target.fields().len()); for field in target.fields() { diff --git a/crates/sail-common-datafusion/src/catalog/mod.rs b/crates/sail-common-datafusion/src/catalog/mod.rs index d006cfa711..e802a04281 100644 --- a/crates/sail-common-datafusion/src/catalog/mod.rs +++ b/crates/sail-common-datafusion/src/catalog/mod.rs @@ -3,11 +3,12 @@ mod status; use datafusion_common::Column; use datafusion_expr::expr; +use serde::{Deserialize, Serialize}; pub use status::*; use crate::datasource::BucketBy; -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub enum CatalogTableConstraint { Unique { name: Option, @@ -19,7 +20,25 @@ pub enum CatalogTableConstraint { }, } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, PartialOrd, Default, Serialize, Deserialize)] +pub enum PartitionTransform { + #[default] + Identity, + Year, + Month, + Day, + Hour, + Bucket(u32), + Truncate(u32), +} + +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] +pub struct CatalogPartitionField { + pub column: String, + pub transform: Option, +} + +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct CatalogTableBucketBy { pub columns: Vec, pub num_buckets: usize, @@ -38,7 +57,7 
@@ impl From for BucketBy { } } -#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd)] +#[derive(Debug, Clone, Eq, PartialEq, Hash, PartialOrd, Serialize, Deserialize)] pub struct CatalogTableSort { pub column: String, pub ascending: bool, diff --git a/crates/sail-common-datafusion/src/catalog/status.rs b/crates/sail-common-datafusion/src/catalog/status.rs index be0ccc09ef..18048c36ce 100644 --- a/crates/sail-common-datafusion/src/catalog/status.rs +++ b/crates/sail-common-datafusion/src/catalog/status.rs @@ -3,7 +3,9 @@ use std::sync::Arc; use datafusion::arrow::datatypes::{DataType, Field}; use datafusion_expr::LogicalPlan; -use crate::catalog::{CatalogTableBucketBy, CatalogTableConstraint, CatalogTableSort}; +use crate::catalog::{ + CatalogPartitionField, CatalogTableBucketBy, CatalogTableConstraint, CatalogTableSort, +}; #[derive(Debug, Clone)] pub struct DatabaseStatus { @@ -30,7 +32,7 @@ pub enum TableKind { constraints: Vec, location: Option, format: String, - partition_by: Vec, + partition_by: Vec, sort_by: Vec, bucket_by: Option, options: Vec<(String, String)>, @@ -187,3 +189,14 @@ impl TableColumnStatus { Field::new(self.name.clone(), self.data_type.clone(), self.nullable) } } + +pub fn identity_partition_fields(columns: &[String]) -> Vec { + columns + .iter() + .cloned() + .map(|column| CatalogPartitionField { + column, + transform: None, + }) + .collect() +} diff --git a/crates/sail-common-datafusion/src/datasource.rs b/crates/sail-common-datafusion/src/datasource.rs index 475247a70d..6b078de4d7 100644 --- a/crates/sail-common-datafusion/src/datasource.rs +++ b/crates/sail-common-datafusion/src/datasource.rs @@ -14,6 +14,7 @@ use datafusion_common::{not_impl_err, plan_err, Constraints, DFSchema, Result}; use datafusion_expr::expr::Sort; use datafusion_expr::TableSource; +use crate::catalog::CatalogPartitionField; use crate::extension::SessionExtension; use crate::logical_expr::ExprWithSource; @@ -67,16 +68,45 @@ pub struct SourceInfo { 
#[derive(Debug, Clone)] pub struct SinkInfo { pub input: Arc, - pub path: String, pub mode: PhysicalSinkMode, - pub partition_by: Vec, + pub partition_by: Vec, pub bucket_by: Option, pub sort_order: Option, + pub table_properties: HashMap, /// The sets of options for the data sink. /// A later set of options can override earlier ones. + /// The path for the sink is stored under the `"path"` key in options. pub options: Vec>, } +impl SinkInfo { + /// Returns the path from options, or an empty string if not set. + /// Checks the `"path"` key first, then `"location"`. + /// Key comparison is case-insensitive. + pub fn path(&self) -> String { + find_option(&self.options, "path") + .or_else(|| find_option(&self.options, "location")) + .unwrap_or_default() + } +} + +/// Searches option sets in reverse order for a case-insensitive key match. +/// Returns the value from the last option set that contains the key, or `None`. +pub fn find_option(options: &[HashMap], key: &str) -> Option { + for set in options.iter().rev() { + if let Some(value) = set.iter().find_map(|(k, v)| { + if k.eq_ignore_ascii_case(key) { + Some(v.clone()) + } else { + None + } + }) { + return Some(value); + } + } + None +} + /// Information required to create a data deleter. #[derive(Debug, Clone)] pub struct DeleteInfo { diff --git a/crates/sail-common-datafusion/src/system/observable.rs b/crates/sail-common-datafusion/src/system/observable.rs index 035ffe5554..7c3d36f757 100644 --- a/crates/sail-common-datafusion/src/system/observable.rs +++ b/crates/sail-common-datafusion/src/system/observable.rs @@ -3,7 +3,7 @@ use datafusion::common::Result; use datafusion_common::DataFusionError; use tokio::sync::oneshot; -use crate::system::catalog::{JobRow, SessionRow, StageRow, TaskRow, WorkerRow}; +use crate::system::catalog::{JobRow, OptionRow, SessionRow, StageRow, TaskRow, WorkerRow}; use crate::system::predicate::Predicate; /// A trait for observing the state of a component. 
@@ -60,6 +60,11 @@ pub enum SessionManagerObserver { fetch: usize, result: oneshot::Sender>>, }, + Options { + key: Predicate, + fetch: usize, + result: oneshot::Sender>>, + }, } impl Observer for SessionManagerObserver { @@ -80,6 +85,9 @@ impl Observer for SessionManagerObserver { SessionManagerObserver::Workers { result, .. } => { let _ = result.send(Ok(vec![])); } + SessionManagerObserver::Options { result, .. } => { + let _ = result.send(Ok(vec![])); + } } } @@ -100,6 +108,9 @@ impl Observer for SessionManagerObserver { SessionManagerObserver::Workers { result, .. } => { let _ = result.send(Err(e)); } + SessionManagerObserver::Options { result, .. } => { + let _ = result.send(Err(e)); + } } } } diff --git a/crates/sail-common/Cargo.toml b/crates/sail-common/Cargo.toml index ad0580d3d4..be8427bdb7 100644 --- a/crates/sail-common/Cargo.toml +++ b/crates/sail-common/Cargo.toml @@ -27,3 +27,4 @@ half = { workspace = true } iana-time-zone = { workspace = true } tokio = { workspace = true } secrecy = { workspace = true } +toml = { workspace = true } diff --git a/crates/sail-common/src/config/application.rs b/crates/sail-common/src/config/application.rs index b0cc862ef4..2d76f16fd7 100644 --- a/crates/sail-common/src/config/application.rs +++ b/crates/sail-common/src/config/application.rs @@ -2,18 +2,21 @@ use figment::providers::Env; use figment::value::{Dict, Empty, Map, Tag, Value}; use figment::{Error, Figment, Metadata, Profile, Provider}; use secrecy::SecretString; -use serde::Deserialize; +use serde::{Deserialize, Serialize}; use crate::config::loader::{ deserialize_non_empty_string, deserialize_non_zero, deserialize_unknown_unit, ConfigDefinition, }; +use crate::config::observer::{ + serialize_non_empty_string, serialize_non_zero, serialize_optional_secret, +}; use crate::error::{CommonError, CommonResult}; const APP_CONFIG: &str = include_str!("application.yaml"); pub const SAIL_ENV_VAR_PREFIX: &str = "SAIL_"; -#[derive(Debug, Clone, Deserialize)] 
+#[derive(Debug, Clone, Serialize, Deserialize)] pub struct AppConfig { pub mode: ExecutionMode, pub runtime: RuntimeConfig, @@ -30,7 +33,7 @@ pub struct AppConfig { /// This field ensures that environment variables with prefix `SAIL_INTERNAL_` /// can only be used for internal configuration. /// Such environment variables are ignored by application configuration. - #[serde(deserialize_with = "deserialize_unknown_unit")] + #[serde(skip_serializing, deserialize_with = "deserialize_unknown_unit")] pub internal: (), } @@ -65,7 +68,7 @@ impl AppConfig { } } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum ExecutionMode { Local, @@ -81,7 +84,7 @@ pub enum ExecutionMode { KubernetesCluster, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct RuntimeConfig { pub stack_size: usize, @@ -90,30 +93,30 @@ pub struct RuntimeConfig { pub temporary_files: TemporaryFilesConfig, } -#[derive(Debug, Clone, Deserialize)] -#[serde(from = "memory_pool::MemoryPool")] +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde(into = "memory_pool::MemoryPool", from = "memory_pool::MemoryPool")] pub enum MemoryPoolConfig { Unbounded, Greedy(GreedyMemoryPoolConfig), Fair(FairMemoryPoolConfig), } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct GreedyMemoryPoolConfig { pub max_size: usize, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct FairMemoryPoolConfig { pub max_size: usize, } mod memory_pool { - use serde::Deserialize; + use serde::{Deserialize, Serialize}; - #[derive(Debug, Clone, Deserialize)] + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum Type { Unbounded, @@ -121,12 +124,12 @@ mod memory_pool { Fair, } - #[derive(Debug, 
Clone, Deserialize)] + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct MemoryPool { - r#type: Type, - greedy: super::GreedyMemoryPoolConfig, - fair: super::FairMemoryPoolConfig, + pub r#type: Type, + pub greedy: super::GreedyMemoryPoolConfig, + pub fair: super::FairMemoryPoolConfig, } impl From for super::MemoryPoolConfig { @@ -138,16 +141,38 @@ mod memory_pool { } } } + + impl From for MemoryPool { + fn from(value: super::MemoryPoolConfig) -> Self { + match value { + super::MemoryPoolConfig::Unbounded => MemoryPool { + r#type: Type::Unbounded, + greedy: super::GreedyMemoryPoolConfig { max_size: 0 }, + fair: super::FairMemoryPoolConfig { max_size: 0 }, + }, + super::MemoryPoolConfig::Greedy(g) => MemoryPool { + r#type: Type::Greedy, + greedy: g, + fair: super::FairMemoryPoolConfig { max_size: 0 }, + }, + super::MemoryPoolConfig::Fair(f) => MemoryPool { + r#type: Type::Fair, + greedy: super::GreedyMemoryPoolConfig { max_size: 0 }, + fair: f, + }, + } + } + } } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct TemporaryFilesConfig { pub paths: Vec, pub max_size: usize, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct ClusterConfig { pub enable_tls: bool, @@ -155,6 +180,7 @@ pub struct ClusterConfig { pub driver_listen_port: u16, pub driver_external_host: String, pub driver_external_port: u16, + #[serde(skip_serializing)] pub worker_id: u64, pub worker_listen_host: String, pub worker_listen_port: u16, @@ -174,21 +200,24 @@ pub struct ClusterConfig { pub rpc_retry_strategy: RetryStrategy, } -#[derive(Debug, Clone, Deserialize)] -#[serde(from = "retry_strategy::RetryStrategy")] +#[derive(Debug, Clone, Serialize, Deserialize)] +#[serde( + into = "retry_strategy::RetryStrategy", + from = "retry_strategy::RetryStrategy" +)] pub enum RetryStrategy { Fixed(FixedRetryStrategy), 
ExponentialBackoff(ExponentialBackoffRetryStrategy), } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct FixedRetryStrategy { pub max_count: usize, pub delay_secs: u64, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct ExponentialBackoffRetryStrategy { pub max_count: usize, @@ -198,9 +227,9 @@ pub struct ExponentialBackoffRetryStrategy { } mod retry_strategy { - use serde::Deserialize; + use serde::{Deserialize, Serialize}; - #[derive(Debug, Clone, Deserialize)] + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum Type { Fixed, @@ -208,12 +237,12 @@ mod retry_strategy { ExponentialBackoff, } - #[derive(Debug, Clone, Deserialize)] + #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct RetryStrategy { - r#type: Type, - fixed: super::FixedRetryStrategy, - exponential_backoff: super::ExponentialBackoffRetryStrategy, + pub r#type: Type, + pub fixed: super::FixedRetryStrategy, + pub exponential_backoff: super::ExponentialBackoffRetryStrategy, } impl From for super::RetryStrategy { @@ -226,9 +255,34 @@ mod retry_strategy { } } } + + impl From for RetryStrategy { + fn from(value: super::RetryStrategy) -> Self { + match value { + super::RetryStrategy::Fixed(f) => RetryStrategy { + r#type: Type::Fixed, + fixed: f, + exponential_backoff: super::ExponentialBackoffRetryStrategy { + max_count: 0, + initial_delay_secs: 0, + max_delay_secs: 0, + factor: 0, + }, + }, + super::RetryStrategy::ExponentialBackoff(e) => RetryStrategy { + r#type: Type::ExponentialBackoff, + fixed: super::FixedRetryStrategy { + max_count: 0, + delay_secs: 0, + }, + exponential_backoff: e, + }, + } + } + } } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct ExecutionConfig { pub batch_size: usize, 
@@ -238,17 +292,23 @@ pub struct ExecutionConfig { pub file_listing_cache: FileListingCacheConfig, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct FileListingCacheConfig { pub r#type: CacheType, - #[serde(deserialize_with = "deserialize_non_zero")] + #[serde( + serialize_with = "serialize_non_zero", + deserialize_with = "deserialize_non_zero" + )] pub ttl: Option, - #[serde(deserialize_with = "deserialize_non_zero")] + #[serde( + serialize_with = "serialize_non_zero", + deserialize_with = "deserialize_non_zero" + )] pub max_entries: Option, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct KubernetesConfig { pub image: String, @@ -260,13 +320,16 @@ pub struct KubernetesConfig { pub worker_pod_template: String, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct ParquetConfig { pub enable_page_index: bool, pub pruning: bool, pub skip_metadata: bool, - #[serde(deserialize_with = "deserialize_non_zero")] + #[serde( + serialize_with = "serialize_non_zero", + deserialize_with = "deserialize_non_zero" + )] pub metadata_size_hint: Option, pub pushdown_filters: bool, pub reorder_filters: bool, @@ -282,12 +345,21 @@ pub struct ParquetConfig { pub dictionary_page_size_limit: usize, pub statistics_enabled: String, pub max_row_group_size: usize, - #[serde(deserialize_with = "deserialize_non_zero")] + #[serde( + serialize_with = "serialize_non_zero", + deserialize_with = "deserialize_non_zero" + )] pub column_index_truncate_length: Option, - #[serde(deserialize_with = "deserialize_non_zero")] + #[serde( + serialize_with = "serialize_non_zero", + deserialize_with = "deserialize_non_zero" + )] pub statistics_truncate_length: Option, pub data_page_row_count_limit: usize, - #[serde(deserialize_with = "deserialize_non_empty_string")] + #[serde( + 
serialize_with = "serialize_non_empty_string", + deserialize_with = "deserialize_non_empty_string" + )] pub encoding: Option, pub bloom_filter_on_read: bool, pub bloom_filter_on_write: bool, @@ -300,27 +372,39 @@ pub struct ParquetConfig { pub file_metadata_cache: FileMetadataCacheConfig, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct FileStatisticsCacheConfig { pub r#type: CacheType, - #[serde(deserialize_with = "deserialize_non_zero")] + #[serde( + serialize_with = "serialize_non_zero", + deserialize_with = "deserialize_non_zero" + )] pub ttl: Option, - #[serde(deserialize_with = "deserialize_non_zero")] + #[serde( + serialize_with = "serialize_non_zero", + deserialize_with = "deserialize_non_zero" + )] pub max_entries: Option, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct FileMetadataCacheConfig { pub r#type: CacheType, - #[serde(deserialize_with = "deserialize_non_zero")] + #[serde( + serialize_with = "serialize_non_zero", + deserialize_with = "deserialize_non_zero" + )] pub ttl: Option, - #[serde(deserialize_with = "deserialize_non_zero")] + #[serde( + serialize_with = "serialize_non_zero", + deserialize_with = "deserialize_non_zero" + )] pub size_limit: Option, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum CacheType { None, @@ -328,29 +412,33 @@ pub enum CacheType { Session, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct CatalogConfig { - #[serde(deserialize_with = "deserialize_non_empty_string")] + #[serde( + serialize_with = "serialize_non_empty_string", + deserialize_with = "deserialize_non_empty_string" + )] pub default_catalog: Option, pub default_database: Vec, pub global_temporary_database: Vec, pub list: Vec, } 
-#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct OptimizerConfig { pub enable_join_reorder: bool, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] #[serde(tag = "type", rename_all = "snake_case")] pub enum CatalogType { Memory { name: String, initial_database: Vec, + #[serde(skip_serializing_if = "Option::is_none")] initial_database_comment: Option, }, #[serde(alias = "iceberg-rest")] @@ -359,38 +447,60 @@ pub enum CatalogType { // https://iceberg.apache.org/docs/nightly/spark-configuration/#catalog-configuration name: String, uri: String, + #[serde(skip_serializing_if = "Option::is_none")] warehouse: Option, + #[serde(skip_serializing_if = "Option::is_none")] prefix: Option, + #[serde( + skip_serializing_if = "Option::is_none", + serialize_with = "serialize_optional_secret" + )] oauth_access_token: Option, + #[serde( + skip_serializing_if = "Option::is_none", + serialize_with = "serialize_optional_secret" + )] bearer_access_token: Option, }, Unity { name: String, + #[serde(skip_serializing_if = "Option::is_none")] uri: Option, + #[serde(skip_serializing_if = "Option::is_none")] default_catalog: Option, + #[serde( + skip_serializing_if = "Option::is_none", + serialize_with = "serialize_optional_secret" + )] token: Option, }, #[serde(alias = "onelake")] OneLake { name: String, url: String, + #[serde( + skip_serializing_if = "Option::is_none", + serialize_with = "serialize_optional_secret" + )] bearer_token: Option, }, Glue { name: String, + #[serde(skip_serializing_if = "Option::is_none")] region: Option, + #[serde(skip_serializing_if = "Option::is_none")] endpoint_url: Option, }, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct SparkConfig { pub session_timeout_secs: u64, pub execution_heartbeat_interval_secs: u64, } -#[derive(Debug, 
Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct PythonConfig { pub data_source_write_channel_capacity: usize, @@ -398,7 +508,7 @@ pub struct PythonConfig { pub data_source_slow_read_warn_ms: u64, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(deny_unknown_fields)] pub struct TelemetryConfig { pub export_traces: bool, @@ -409,12 +519,13 @@ pub struct TelemetryConfig { pub otlp_timeout_secs: u64, pub traces_export_interval_secs: u64, pub metrics_export_interval_secs: u64, + pub metrics_collection_interval_secs: u64, pub logs_export_interval_secs: u64, pub logs_export_max_queue_size: u64, pub logs_export_batch_size: u64, } -#[derive(Debug, Clone, Deserialize)] +#[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum OtlpProtocol { Grpc, diff --git a/crates/sail-common/src/config/application.yaml b/crates/sail-common/src/config/application.yaml index 5ccab0b4cb..c1d80fea9a 100644 --- a/crates/sail-common/src/config/application.yaml +++ b/crates/sail-common/src/config/application.yaml @@ -784,6 +784,12 @@ description: The interval in seconds for exporting metrics. experimental: true +- key: telemetry.metrics_collection_interval_secs + type: number + default: "5" + description: The interval in seconds for collecting metrics. 
+ experimental: true + - key: telemetry.logs_export_interval_secs type: number default: "1" diff --git a/crates/sail-common/src/config/mod.rs b/crates/sail-common/src/config/mod.rs index 9d2767511c..aff82589dd 100644 --- a/crates/sail-common/src/config/mod.rs +++ b/crates/sail-common/src/config/mod.rs @@ -1,6 +1,7 @@ mod application; mod cli; mod loader; +mod observer; // Same default as Spark // https://github.com/apache/spark/blob/9cec3c4f7c1b467023f0eefff69e8b7c5105417d/python/pyspark/sql/connect/client/core.py#L126 diff --git a/crates/sail-common/src/config/observer.rs b/crates/sail-common/src/config/observer.rs new file mode 100644 index 0000000000..07d770f061 --- /dev/null +++ b/crates/sail-common/src/config/observer.rs @@ -0,0 +1,66 @@ +use secrecy::SecretString; + +use super::AppConfig; +use crate::error::{CommonError, CommonResult}; + +impl AppConfig { + /// Returns the current application configuration as a list of `(key, value)` pairs. + /// Keys use dot notation. + /// The `AppConfig` instance is serialized to a TOML table and then recursively walked + /// to produce key-value pairs. Arrays are converted to their TOML string representation. + pub fn raw(&self) -> CommonResult> { + let table = + toml::Table::try_from(self).map_err(|e| CommonError::InvalidArgument(e.to_string()))?; + let mut pairs = Vec::new(); + walk_toml_table(&table, String::new(), &mut pairs); + Ok(pairs) + } +} + +/// Recursively walks a TOML table, emitting `(dot.notation.key, value_string)` pairs +/// for every primitive (string, integer, float, boolean) and array leaf value. 
+fn walk_toml_table(table: &toml::Table, prefix: String, pairs: &mut Vec<(String, String)>) { + for (key, value) in table { + let full_key = if prefix.is_empty() { + key.clone() + } else { + format!("{prefix}.{key}") + }; + match value { + toml::Value::Table(t) => walk_toml_table(t, full_key, pairs), + toml::Value::Array(_) => pairs.push((full_key, value.to_string())), + toml::Value::String(s) => pairs.push((full_key, s.clone())), + toml::Value::Integer(i) => pairs.push((full_key, i.to_string())), + toml::Value::Float(f) => pairs.push((full_key, f.to_string())), + toml::Value::Boolean(b) => pairs.push((full_key, b.to_string())), + toml::Value::Datetime(d) => pairs.push((full_key, d.to_string())), + } + } +} + +pub fn serialize_non_zero(v: &Option, s: S) -> Result +where + S: serde::Serializer, + T: num_traits::Zero + serde::Serialize, +{ + match v { + Some(x) => x.serialize(s), + None => T::zero().serialize(s), + } +} + +pub fn serialize_non_empty_string(v: &Option, s: S) -> Result +where + S: serde::Serializer, +{ + s.serialize_str(v.as_deref().unwrap_or("")) +} + +pub fn serialize_optional_secret(v: &Option, s: S) -> Result +where + S: serde::Serializer, +{ + // Only called when Some (combined with skip_serializing_if = "Option::is_none") + let _ = v; + s.serialize_str("[REDACTED]") +} diff --git a/crates/sail-common/src/spec/data_type.rs b/crates/sail-common/src/spec/data_type.rs index 6799ff027a..67521d4b9b 100644 --- a/crates/sail-common/src/spec/data_type.rs +++ b/crates/sail-common/src/spec/data_type.rs @@ -149,10 +149,10 @@ pub enum DataType { Duration { time_unit: TimeUnit, }, - /// A "calendar" interval which models types that don't necessarily - /// have a precise duration without the context of a base timestamp (e.g. - /// days can differ in length during daylight savings time transitions). - /// Corresponds to [`arrow_schema::DataType::Interval`]. + /// Represents Spark's interval types. 
+ /// `YearMonth` and `MonthDayNano` are resolved to [`arrow_schema::DataType::Interval`]. + /// `DayTime` is resolved to [`arrow_schema::DataType::Duration`] with microsecond + /// precision to match Spark's `DayTimeIntervalType`. Interval { interval_unit: IntervalUnit, start_field: Option, @@ -474,8 +474,9 @@ impl Display for UnionMode { pub enum IntervalUnit { /// Indicates the number of elapsed whole months, stored as 4-byte integers. YearMonth = 0, - /// Indicates the number of elapsed days and milliseconds, - /// stored as 2 contiguous 32-bit integers (days, milliseconds) (8-bytes in total). + /// Represents Spark's `DayTimeIntervalType` with microsecond precision. + /// Resolved to [`arrow_schema::DataType::Duration`] instead of Arrow's + /// `IntervalUnit::DayTime` (which only has millisecond precision). DayTime = 1, /// A triple of the number of elapsed months, days, and nanoseconds. /// The values are stored contiguously in 16 byte blocks. Months and diff --git a/crates/sail-common/src/spec/expression.rs b/crates/sail-common/src/spec/expression.rs index 34151142f9..86a221a678 100644 --- a/crates/sail-common/src/spec/expression.rs +++ b/crates/sail-common/src/spec/expression.rs @@ -137,6 +137,9 @@ pub enum Expr { value: String, timestamp_type: TimestampType, }, + IdentifierClause { + expr: Box, + }, } /// An identifier with only one part. 
diff --git a/crates/sail-common/src/spec/plan.rs b/crates/sail-common/src/spec/plan.rs index 0ec5f8ccd3..2f7014d37e 100644 --- a/crates/sail-common/src/spec/plan.rs +++ b/crates/sail-common/src/spec/plan.rs @@ -601,12 +601,11 @@ pub enum MergeNotMatchedByTargetAction { #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "camelCase", rename_all_fields = "camelCase")] -#[expect(clippy::large_enum_variant)] pub enum ReadType { - // FIXME: Rust 1.87 triggers `clippy::large_enum_variant` warning - NamedTable(ReadNamedTable), - Udtf(ReadUdtf), - DataSource(ReadDataSource), + NamedTable(Box), + Udtf(Box), + DataSource(Box), + DynamicTable(Box), } #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] @@ -618,6 +617,14 @@ pub struct ReadNamedTable { pub options: Vec<(String, String)>, } +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct ReadDynamicTable { + pub name: Expr, + pub sample: Option, + pub options: Vec<(String, String)>, +} + #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "camelCase", rename_all_fields = "camelCase")] pub enum TableTemporal { @@ -857,7 +864,7 @@ pub struct TableDefinition { pub location: Option, pub file_format: Option, pub row_format: Option, - pub partition_by: Vec, + pub partition_by: Vec, pub sort_by: Vec, pub bucket_by: Option, pub cluster_by: Vec, @@ -933,7 +940,7 @@ pub struct Write { pub save_type: SaveType, pub mode: Option, pub sort_columns: Vec, - pub partitioning_columns: Vec, + pub partitioning_columns: Vec, pub clustering_columns: Vec, pub bucket_by: Option, pub options: Vec<(String, String)>, diff --git a/crates/sail-data-source/src/formats/binary/source.rs b/crates/sail-data-source/src/formats/binary/source.rs index a9abdc0331..e583d04d99 100644 --- a/crates/sail-data-source/src/formats/binary/source.rs +++ b/crates/sail-data-source/src/formats/binary/source.rs @@ -10,7 +10,7 @@ use 
datafusion_datasource::file_stream::{FileOpenFuture, FileOpener}; use datafusion_datasource::projection::{ProjectionOpener, SplitProjection}; use datafusion_datasource::{PartitionedFile, TableSchema}; use futures::StreamExt; -use object_store::ObjectStore; +use object_store::{GetResult, ObjectStore, ObjectStoreExt}; use crate::formats::binary::reader::{BinaryFileMetadata, BinaryFileReader}; @@ -146,7 +146,7 @@ impl FileOpener for BinaryOpener { ); Ok(Box::pin(async move { - let get_result = store.get(&location).await?; + let get_result: GetResult = store.get(&location).await?; let content = get_result.bytes().await?; let modification_time = last_modified.timestamp_micros(); let metadata = BinaryFileMetadata { diff --git a/crates/sail-data-source/src/formats/console/mod.rs b/crates/sail-data-source/src/formats/console/mod.rs index a40d298c15..0cdbb75d85 100644 --- a/crates/sail-data-source/src/formats/console/mod.rs +++ b/crates/sail-data-source/src/formats/console/mod.rs @@ -34,13 +34,14 @@ impl TableFormat for ConsoleTableFormat { _ctx: &dyn Session, info: SinkInfo, ) -> Result> { + let path = info.path(); let SinkInfo { input, - path, mode, partition_by, bucket_by, sort_order, + table_properties: _, options, } = info; if !is_flow_event_schema(&input.schema()) { diff --git a/crates/sail-data-source/src/formats/console/writer.rs b/crates/sail-data-source/src/formats/console/writer.rs index a0ab214bda..246e0f73e6 100644 --- a/crates/sail-data-source/src/formats/console/writer.rs +++ b/crates/sail-data-source/src/formats/console/writer.rs @@ -15,12 +15,12 @@ use futures::StreamExt; #[derive(Debug)] pub struct ConsoleSinkExec { input: Arc, - properties: PlanProperties, + properties: Arc, } impl ConsoleSinkExec { pub fn new(input: Arc) -> Self { - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(Arc::new(Schema::empty())), Partitioning::UnknownPartitioning( 
input.properties().output_partitioning().partition_count(), @@ -28,7 +28,7 @@ impl ConsoleSinkExec { EmissionType::Final, // The node returns no data, so it is bounded. Boundedness::Bounded, - ); + )); Self { input, properties } } @@ -56,7 +56,7 @@ impl ExecutionPlan for ConsoleSinkExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-data-source/src/formats/listing.rs b/crates/sail-data-source/src/formats/listing.rs index 51153bbf65..be45dcf115 100644 --- a/crates/sail-data-source/src/formats/listing.rs +++ b/crates/sail-data-source/src/formats/listing.rs @@ -8,7 +8,7 @@ use datafusion::arrow::datatypes::{DataType, Schema}; use datafusion::catalog::{Session, TableProvider}; use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::listing::{ListingOptions, ListingTable, ListingTableConfig}; -use datafusion::datasource::physical_plan::FileSinkConfig; +use datafusion::datasource::physical_plan::{FileOutputMode, FileSinkConfig}; use datafusion::logical_expr::dml::InsertOp; use datafusion::physical_plan::ExecutionPlan; use datafusion_common::parsers::CompressionTypeVariant; @@ -190,14 +190,15 @@ impl TableFormat for ListingTableFormat { ctx: &dyn Session, info: SinkInfo, ) -> Result> { + let path = info.path(); let SinkInfo { input, - path, // TODO: sink mode is ignored since the file formats only support append operation mode: _, partition_by, bucket_by, sort_order, + table_properties: _, options, } = info; if is_flow_event_schema(&input.schema()) { @@ -206,6 +207,9 @@ impl TableFormat for ListingTableFormat { if bucket_by.is_some() { return not_impl_err!("bucketing for writing listing table format"); } + if partition_by.iter().any(|field| field.transform.is_some()) { + return not_impl_err!("partition transforms for writing listing table format"); + } // always write multi-file output let path = if path.ends_with(object_store::path::DELIMITER) { path @@ -223,7 
+227,7 @@ impl TableFormat for ListingTableFormat { // This is how DataFusion handles physical planning for `LogicalPlan::Copy`. let table_partition_cols = partition_by .iter() - .map(|s| (s.clone(), DataType::Null)) + .map(|field| (field.column.clone(), DataType::Null)) .collect::>(); let (format, compression) = self.inner.create_write_format(ctx, options)?; let file_extension = if let Some(file_compression_type) = format.compression_type() { @@ -265,6 +269,7 @@ impl TableFormat for ListingTableFormat { insert_op: InsertOp::Append, keep_partition_by_columns: false, file_extension, + file_output_mode: FileOutputMode::Automatic, }; format .create_writer_physical_plan(input, ctx, conf, sort_order) diff --git a/crates/sail-data-source/src/formats/python/arrow_utils.rs b/crates/sail-data-source/src/formats/python/arrow_utils.rs index 541fab833d..7c8c66e896 100644 --- a/crates/sail-data-source/src/formats/python/arrow_utils.rs +++ b/crates/sail-data-source/src/formats/python/arrow_utils.rs @@ -582,7 +582,7 @@ fn build_array_from_rows( } /// Extract a value from a Python row tuple. -fn extract_value<'py, T: pyo3::FromPyObject<'py>>( +fn extract_value<'py, T: for<'a> pyo3::FromPyObject<'a, 'py>>( row: &Bound<'py, PyAny>, col_idx: usize, ) -> Result> { @@ -592,7 +592,7 @@ fn extract_value<'py, T: pyo3::FromPyObject<'py>>( return Ok(None); } - item.extract::().map(Some).map_err(py_err) + item.extract::().map(Some).map_err(|e| py_err(e.into())) } /// Re-export py_err and import_cloudpickle from error module. diff --git a/crates/sail-data-source/src/formats/python/commit_exec.rs b/crates/sail-data-source/src/formats/python/commit_exec.rs index 55067aa5fe..dd42d1d654 100644 --- a/crates/sail-data-source/src/formats/python/commit_exec.rs +++ b/crates/sail-data-source/src/formats/python/commit_exec.rs @@ -32,7 +32,7 @@ pub struct PythonDataSourceWriteCommitExec { /// Number of partition results expected from the write stage. 
expected_partitions: usize, /// Execution plan properties. - properties: PlanProperties, + properties: Arc, } impl PythonDataSourceWriteCommitExec { @@ -42,12 +42,12 @@ impl PythonDataSourceWriteCommitExec { pickled_writer: Vec, expected_partitions: usize, ) -> Self { - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(Arc::new(Schema::empty())), Partitioning::UnknownPartitioning(1), EmissionType::Final, Boundedness::Bounded, - ); + )); Self { input, @@ -92,7 +92,7 @@ impl ExecutionPlan for PythonDataSourceWriteCommitExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-data-source/src/formats/python/python_datasource.rs b/crates/sail-data-source/src/formats/python/datasource.rs similarity index 82% rename from crates/sail-data-source/src/formats/python/python_datasource.rs rename to crates/sail-data-source/src/formats/python/datasource.rs index f91a307e0b..f19d2864cc 100644 --- a/crates/sail-data-source/src/formats/python/python_datasource.rs +++ b/crates/sail-data-source/src/formats/python/datasource.rs @@ -156,16 +156,21 @@ impl PythonDataSource { .call_method0("schema") .map_err(|e| ctx.wrap_py_error(e))?; - // Schema should be a PyArrow Schema or DDL string + // Schema should be a PyArrow Schema, PySpark DataType, or DDL string // Try PyArrow Schema first if let Ok(schema) = py_schema_to_rust(py, &schema_obj) { return Ok(schema); } + // Try PySpark DataType (StructType) + if let Some(schema) = self.try_pyspark_data_type_schema(py, &schema_obj, &ctx)? 
{ + return Ok(schema); + } + // Try DDL string let schema_str: String = schema_obj.extract().map_err(|e| { PythonDataSourceError::SchemaError(format!( - "[{}::schema] schema() must return PyArrow Schema or DDL string: {}", + "[{}::schema] schema() must return PyArrow Schema, PySpark DataType, or DDL string: {}", self.name, e )) })?; @@ -178,6 +183,72 @@ impl PythonDataSource { } } + fn try_pyspark_data_type_schema( + &self, + py: Python<'_>, + schema_obj: &Bound<'_, PyAny>, + ctx: &PythonDataSourceContext, + ) -> Result> { + let types_module = match py.import("pyspark.sql.types") { + Ok(module) => module, + Err(_) => return Ok(None), + }; + let data_type_class = types_module + .getattr("DataType") + .map_err(|e| ctx.wrap_py_error(e))?; + + let is_data_type = schema_obj + .is_instance(&data_type_class) + .map_err(|e| ctx.wrap_py_error(e))?; + if !is_data_type { + return Ok(None); + } + + let pandas_types = py + .import("pyspark.sql.pandas.types") + .map_err(|e| ctx.wrap_py_error(e))?; + let to_arrow_type = pandas_types + .getattr("to_arrow_type") + .map_err(|e| ctx.wrap_py_error(e))?; + // TODO: pass options such as `prefers_large_types` as specified for the session + let arrow_type = to_arrow_type + .call1((schema_obj,)) + .map_err(|e| ctx.wrap_py_error(e))?; + + let pa_types = py + .import("pyarrow.types") + .map_err(|e| ctx.wrap_py_error(e))?; + let is_struct: bool = pa_types + .getattr("is_struct") + .map_err(|e| ctx.wrap_py_error(e))? + .call1((arrow_type.clone(),)) + .map_err(|e| ctx.wrap_py_error(e))? 
+ .extract() + .map_err(|e| ctx.wrap_py_error(e))?; + + if !is_struct { + return Err(PythonDataSourceError::SchemaError(format!( + "[{}::schema] schema() DataType must be StructType to be used as a schema", + self.name + )) + .into()); + } + + let builtins = py.import("builtins").map_err(|e| ctx.wrap_py_error(e))?; + let list_fn = builtins.getattr("list").map_err(|e| ctx.wrap_py_error(e))?; + let fields_list = list_fn + .call1((arrow_type,)) + .map_err(|e| ctx.wrap_py_error(e))?; + let pa = py.import("pyarrow").map_err(|e| ctx.wrap_py_error(e))?; + let pa_schema = pa + .getattr("schema") + .map_err(|e| ctx.wrap_py_error(e))? + .call1((fields_list,)) + .map_err(|e| ctx.wrap_py_error(e))?; + + Ok(Some(py_schema_to_rust(py, &pa_schema)?)) + } + /// Get the number of partitions for parallel reading. /// /// Calls the Python data source's `partitioning()` method. @@ -197,10 +268,9 @@ impl PythonDataSource { .map_err(|e| ctx.wrap_py_error(e))?; // Convert to list - let partitions_list = - partitions.downcast::().map_err(|e| { - ctx.wrap_error(format!("partitioning() must return a list: {}", e)) - })?; + let partitions_list = partitions.cast::().map_err(|e| { + ctx.wrap_error(format!("partitioning() must return a list: {}", e)) + })?; Ok(partitions_list.len()) }) diff --git a/crates/sail-data-source/src/formats/python/exec.rs b/crates/sail-data-source/src/formats/python/exec.rs index c52ba32c5b..cf876003a9 100644 --- a/crates/sail-data-source/src/formats/python/exec.rs +++ b/crates/sail-data-source/src/formats/python/exec.rs @@ -55,7 +55,7 @@ pub struct PythonDataSourceExec { /// Partitions for parallel reading partitions: Vec, /// Execution plan properties - properties: PlanProperties, + properties: Arc, } impl PythonDataSourceExec { @@ -72,12 +72,12 @@ impl PythonDataSourceExec { partitions: Vec, ) -> Self { let num_partitions = partitions.len().max(1); - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( 
EquivalenceProperties::new(schema.clone()), Partitioning::UnknownPartitioning(num_partitions), EmissionType::Incremental, Boundedness::Bounded, - ); + )); Self { pickled_reader, @@ -122,7 +122,7 @@ impl ExecutionPlan for PythonDataSourceExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-data-source/src/formats/python/executor.rs b/crates/sail-data-source/src/formats/python/executor.rs index 041789ff40..afc427dadb 100644 --- a/crates/sail-data-source/src/formats/python/executor.rs +++ b/crates/sail-data-source/src/formats/python/executor.rs @@ -322,13 +322,33 @@ impl PythonExecutor for InProcessExecutor { } // Now call partitions() on the same reader that has the filters - let partitions = reader - .call_method0("partitions") - .map_err(|e| ctx.wrap_py_error(e))?; + let partitions = match reader.call_method0("partitions") { + Ok(partitions) => partitions, + Err(err) => { + if is_pyspark_not_implemented(py, &err) { + log::debug!( + "[{}::partitions] partitions() not implemented; using single default partition", + ds_name + ); + pyo3::types::PyList::new(py, [py.None()]) + .map_err(|e| { + ctx.wrap_error(format!( + "Failed to create default partitions list: {}", + e + )) + })? 
+ .into_any() + } else { + return Err(datafusion_common::DataFusionError::External(Box::new( + ctx.wrap_py_error(err), + ))); + } + } + }; // Convert Python partitions to Rust let partitions_list = - partitions.downcast::().map_err(|e| { + partitions.cast::().map_err(|e| { ctx.wrap_error(format!("partitions() must return a list: {}", e)) })?; @@ -832,6 +852,19 @@ fn pickle_object(py: pyo3::Python<'_>, obj: &pyo3::Bound<'_, pyo3::PyAny>) -> Re Ok(bytes) } +fn is_pyspark_not_implemented(py: pyo3::Python<'_>, err: &pyo3::PyErr) -> bool { + let errors_module = match py.import("pyspark.errors") { + Ok(module) => module, + Err(_) => return false, + }; + let not_impl = match errors_module.getattr("PySparkNotImplementedError") { + Ok(cls) => cls, + Err(_) => return false, + }; + + err.is_instance(py, &not_impl) +} + /// Re-export py_err from error module. use super::error::py_err; diff --git a/crates/sail-data-source/src/formats/python/mod.rs b/crates/sail-data-source/src/formats/python/mod.rs index 9502604bcc..a426541213 100644 --- a/crates/sail-data-source/src/formats/python/mod.rs +++ b/crates/sail-data-source/src/formats/python/mod.rs @@ -23,20 +23,21 @@ //! 
- `commit_exec`: Single-partition commit/abort execution plan pub mod arrow_utils; mod commit_exec; +mod datasource; mod discovery; mod error; mod exec; mod executor; mod filter; -mod python_datasource; -mod python_table_provider; mod stream; mod table_format; +mod table_provider; mod write_exec; // Public exports - always available // Public exports - require python feature pub use commit_exec::PythonDataSourceWriteCommitExec; +pub use datasource::PythonDataSource; pub use discovery::{ discover_data_sources, validate_datasource_class, DataSourceEntry, PythonDataSourceRegistry, DATA_SOURCE_REGISTRY, @@ -45,8 +46,7 @@ pub use error::PythonDataSourceError; pub use exec::PythonDataSourceExec; pub use executor::{InProcessExecutor, InputPartition, PythonExecutor}; pub use filter::{exprs_to_python_filters, ColumnPath, FilterValue, PythonFilter}; -pub use python_datasource::PythonDataSource; -pub use python_table_provider::PythonTableProvider; pub use stream::{PythonDataSourceStream, RowBatchCollector, DEFAULT_BATCH_SIZE}; pub use table_format::PythonTableFormat; +pub use table_provider::PythonTableProvider; pub use write_exec::PythonDataSourceWriteExec; diff --git a/crates/sail-data-source/src/formats/python/table_format.rs b/crates/sail-data-source/src/formats/python/table_format.rs index df935d5748..7c4d53d897 100644 --- a/crates/sail-data-source/src/formats/python/table_format.rs +++ b/crates/sail-data-source/src/formats/python/table_format.rs @@ -11,10 +11,10 @@ use datafusion::physical_plan::ExecutionPlan; use datafusion_common::Result; use sail_common_datafusion::datasource::{SinkInfo, SourceInfo, TableFormat, TableFormatRegistry}; +use super::datasource::PythonDataSource; use super::discovery::DATA_SOURCE_REGISTRY; use super::executor::InProcessExecutor; -use super::python_datasource::PythonDataSource; -use super::python_table_provider::PythonTableProvider; +use super::table_provider::PythonTableProvider; /// TableFormat implementation for a Python data source. 
/// @@ -174,11 +174,12 @@ impl TableFormat for PythonTableFormat { // Create PythonDataSource from options let datasource = self.create_datasource(&info.options)?; - // Get schema (use provided schema or discover from Python) - let schema = if let Some(schema) = info.schema { - Arc::new(schema) - } else { - datasource.schema()? + // Get schema (use provided schema or discover from Python). + // When a table is created without column definitions (e.g. `CREATE TABLE t USING fmt`), + // the catalog stores an empty schema. Fall back to Python discovery in that case. + let schema = match info.schema { + Some(schema) if !schema.fields().is_empty() => Arc::new(schema), + _ => datasource.schema()?, }; // Create executor (MVP: in-process via PyO3) @@ -199,9 +200,9 @@ impl TableFormat for PythonTableFormat { let SinkInfo { input, - path, mode, partition_by, + table_properties: _, mut options, .. } = info; @@ -215,13 +216,9 @@ impl TableFormat for PythonTableFormat { ); } - // Inject save path into options so the Python DataSource receives it - // via self.options["path"] in __init__ (matches PySpark behavior). - if !path.is_empty() { - let path_option: HashMap = - [("path".to_string(), path)].into_iter().collect(); - options.push(path_option); - } + // The path (if any) is already present in options under the "path" key, + // so it will be forwarded to the Python DataSource via self.options["path"] + // in __init__ (matches PySpark behavior). No additional injection needed. // Map save mode to overwrite bool (PySpark convention). 
// PySpark's DataSource.writer(schema, overwrite) only receives a boolean: diff --git a/crates/sail-data-source/src/formats/python/python_table_provider.rs b/crates/sail-data-source/src/formats/python/table_provider.rs similarity index 100% rename from crates/sail-data-source/src/formats/python/python_table_provider.rs rename to crates/sail-data-source/src/formats/python/table_provider.rs diff --git a/crates/sail-data-source/src/formats/python/write_exec.rs b/crates/sail-data-source/src/formats/python/write_exec.rs index af665a2f8f..68af2bf243 100644 --- a/crates/sail-data-source/src/formats/python/write_exec.rs +++ b/crates/sail-data-source/src/formats/python/write_exec.rs @@ -66,19 +66,19 @@ pub struct PythonDataSourceWriteExec { /// Whether writer is DataSourceArrowWriter (true) or DataSourceWriter (false) is_arrow: bool, /// Execution plan properties - properties: PlanProperties, + properties: Arc, } impl PythonDataSourceWriteExec { /// Create a new distributed write execution plan. 
pub fn new(input: Arc, pickled_writer: Vec, is_arrow: bool) -> Self { let output_partition_count = input.properties().partitioning.partition_count(); - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(write_result_schema()), Partitioning::UnknownPartitioning(output_partition_count), EmissionType::Final, Boundedness::Bounded, - ); + )); Self { input, @@ -124,7 +124,7 @@ impl ExecutionPlan for PythonDataSourceWriteExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-data-source/src/formats/rate/reader.rs b/crates/sail-data-source/src/formats/rate/reader.rs index 9ceb53ae2b..2741e572fa 100644 --- a/crates/sail-data-source/src/formats/rate/reader.rs +++ b/crates/sail-data-source/src/formats/rate/reader.rs @@ -86,7 +86,7 @@ pub struct RateSourceExec { original_schema: SchemaRef, projected_schema: SchemaRef, projection: Vec, - properties: PlanProperties, + properties: Arc, } impl RateSourceExec { @@ -100,14 +100,14 @@ impl RateSourceExec { let time_zone = Self::infer_time_zone(&schema)?; let projected_schema = Arc::new(schema.project(&projection)?); let output_schema = Arc::new(to_flow_event_schema(&projected_schema)); - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(output_schema), Partitioning::UnknownPartitioning(options.num_partitions), EmissionType::Both, Boundedness::Unbounded { requires_infinite_memory: false, }, - ); + )); Ok(Self { options, time_zone, @@ -171,7 +171,7 @@ impl ExecutionPlan for RateSourceExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-data-source/src/formats/socket/reader.rs b/crates/sail-data-source/src/formats/socket/reader.rs index 62a7e6f2cc..ce25356abe 100644 --- a/crates/sail-data-source/src/formats/socket/reader.rs +++ 
b/crates/sail-data-source/src/formats/socket/reader.rs @@ -82,7 +82,7 @@ pub struct SocketSourceExec { original_schema: SchemaRef, projected_schema: SchemaRef, projection: Vec, - properties: PlanProperties, + properties: Arc, } impl SocketSourceExec { @@ -95,14 +95,14 @@ impl SocketSourceExec { ) -> Result { let projected_schema = Arc::new(schema.project(&projection)?); let output_schema = Arc::new(to_flow_event_schema(&projected_schema)); - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(output_schema), Partitioning::UnknownPartitioning(1), EmissionType::Both, Boundedness::Unbounded { requires_infinite_memory: false, }, - ); + )); Ok(Self { options, original_schema: schema, @@ -150,7 +150,7 @@ impl ExecutionPlan for SocketSourceExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-data-source/src/listing.rs b/crates/sail-data-source/src/listing.rs index 3d2923a1f4..89c3f23760 100644 --- a/crates/sail-data-source/src/listing.rs +++ b/crates/sail-data-source/src/listing.rs @@ -5,6 +5,7 @@ use std::sync::Arc; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::catalog::Session; use datafusion::datasource::listing::{ListingOptions, ListingTableConfig, ListingTableUrl}; +use datafusion::execution::cache::cache_manager::CachedFileList; use datafusion::execution::cache::TableScopedPath; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{internal_err, plan_err, DataFusionError, GetExt, Result}; @@ -12,7 +13,7 @@ use datafusion_datasource::file_compression_type::FileCompressionType; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; use log::debug; -use object_store::{ObjectMeta, ObjectStore}; +use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; use crate::formats::listing::{ListingFormat, ListingTableFormat}; @@ -203,11 +204,11 @@ pub async fn 
list_all_files<'a>( }; if let Some(res) = cache.get(&key) { debug!("Hit list all files cache"); - futures::stream::iter(res.as_ref().clone().into_iter().map(Ok)).boxed() + futures::stream::iter(res.files.as_ref().clone().into_iter().map(Ok)).boxed() } else { let list_res = store.list(Some(url.prefix())); let vec = list_res.try_collect::>().await?; - cache.put(&key, Arc::new(vec.clone())); + cache.put(&key, CachedFileList::new(vec.clone())); futures::stream::iter(vec.into_iter().map(Ok)).boxed() } } diff --git a/crates/sail-data-source/src/options/data/delta_read.yaml b/crates/sail-data-source/src/options/data/delta_read.yaml index 96c5ef0e4b..afc916f981 100644 --- a/crates/sail-data-source/src/options/data/delta_read.yaml +++ b/crates/sail-data-source/src/options/data/delta_read.yaml @@ -185,3 +185,34 @@ supported: false rust_type: bool rust_deserialize_with: crate::options::serde::deserialize_bool + +- key: metadata_as_data_read + aliases: + - metadataAsDataRead + description: | + Enable the metadata-as-data read path: avoid eagerly loading file metadata on the driver; + use log replay and discovery in the physical plan instead. When false (default), use the + driver path with pre-loaded file list. + default: "false" + supported: true + rust_type: bool + rust_deserialize_with: crate::options::serde::deserialize_bool + +- key: delta_log_replay_strategy + aliases: + - deltaLogReplayStrategy + description: | + Strategy for Delta log replay in the metadata-as-data path. Values: "auto", "sort", "hash". + default: "auto" + supported: true + rust_type: String + +- key: delta_log_replay_hash_threshold + aliases: + - deltaLogReplayHashThreshold + description: | + Max commit JSON file count to use hash-no-sort replay when strategy is "auto". 
+ default: "100" + supported: true + rust_type: usize + rust_deserialize_with: crate::options::serde::deserialize_usize diff --git a/crates/sail-data-source/src/options/data/delta_write.yaml b/crates/sail-data-source/src/options/data/delta_write.yaml index 235f8dbc73..31d2d82d07 100644 --- a/crates/sail-data-source/src/options/data/delta_write.yaml +++ b/crates/sail-data-source/src/options/data/delta_write.yaml @@ -2,6 +2,11 @@ # References: # - [1] https://github.com/delta-io/delta/blob/master/spark/src/main/scala/org/apache/spark/sql/delta/DeltaOptions.scala # - [2] https://docs.delta.io/latest/delta-batch.html#deltadataframewrites +# +# Note: +# Delta table properties such as `delta.columnMapping.mode` are not modeled here because they +# persist in the Delta log metadata instead of affecting a single write. Those keys are routed +# from write options into table properties before write planning. - key: replace_where aliases: @@ -127,15 +132,21 @@ rust_type: usize rust_deserialize_with: crate::options::serde::deserialize_non_zero_usize -- key: column_mapping_mode +- key: delta_log_replay_strategy aliases: - - columnMappingMode - - column_mapping - # SQL DDL table property (TBLPROPERTIES) for Delta column mapping - - delta.columnMapping.mode - description: | - Column mapping mode for new Delta tables. Supported values: "none", "name", and "id". - When set to "name" or "id", new tables will be created with Delta Column Mapping. - default: "none" + - deltaLogReplayStrategy + description: | + Strategy for Delta log replay in the metadata-as-data path. Values: "auto", "sort", "hash". + default: "auto" supported: true rust_type: String + +- key: delta_log_replay_hash_threshold + aliases: + - deltaLogReplayHashThreshold + description: | + Max commit JSON file count to use hash-no-sort replay when strategy is "auto". 
+ default: "100" + supported: true + rust_type: usize + rust_deserialize_with: crate::options::serde::deserialize_usize diff --git a/crates/sail-data-source/src/url.rs b/crates/sail-data-source/src/url.rs index 4be8f3c17d..f7af7e3037 100644 --- a/crates/sail-data-source/src/url.rs +++ b/crates/sail-data-source/src/url.rs @@ -9,6 +9,7 @@ use datafusion::datasource::listing::ListingTableUrl; use datafusion_common::{not_impl_err, plan_datafusion_err, plan_err, DataFusionError, Result}; use glob::Pattern; use log::debug; +use object_store::ObjectStoreExt; use percent_encoding::percent_decode; use url::Url; diff --git a/crates/sail-delta-lake/Cargo.toml b/crates/sail-delta-lake/Cargo.toml index c787860cc4..6f0ac53517 100644 --- a/crates/sail-delta-lake/Cargo.toml +++ b/crates/sail-delta-lake/Cargo.toml @@ -20,8 +20,6 @@ sail-common = { path = "../sail-common" } sail-common-datafusion = { path = "../sail-common-datafusion" } sail-data-source = { path = "../sail-data-source" } sail-physical-plan = { path = "../sail-physical-plan" } -# Delta Lake -delta_kernel = { workspace = true } # DataFusion dependencies datafusion = { workspace = true } @@ -52,6 +50,8 @@ itertools = { workspace = true } percent-encoding = { workspace = true } regex = { workspace = true } serde_arrow = { workspace = true } +moka = { workspace = true } +once_cell = { workspace = true } [lints] workspace = true diff --git a/crates/sail-delta-lake/src/conversion/mod.rs b/crates/sail-delta-lake/src/conversion/mod.rs index bb7aa04f05..200ca68630 100644 --- a/crates/sail-delta-lake/src/conversion/mod.rs +++ b/crates/sail-delta-lake/src/conversion/mod.rs @@ -1,5 +1,8 @@ pub mod scalar; pub mod type_promotion; -pub use scalar::{ScalarConverter, ScalarExt}; +pub use scalar::{ + parse_optional_partition_value, parse_partition_value, scalar_from_array_opt, + scalar_value_to_array, ScalarConverter, ScalarExt, +}; pub use type_promotion::DeltaTypeConverter; diff --git 
a/crates/sail-delta-lake/src/conversion/scalar.rs b/crates/sail-delta-lake/src/conversion/scalar.rs index f2e038b1d0..f45818c92d 100644 --- a/crates/sail-delta-lake/src/conversion/scalar.rs +++ b/crates/sail-delta-lake/src/conversion/scalar.rs @@ -24,21 +24,18 @@ use std::sync::Arc; use chrono::{DateTime, TimeZone, Utc}; use datafusion::arrow::array::{ - self, Array, BinaryArray, BooleanArray, Date32Array, Decimal128Array, Float32Array, - Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, StringArray, - TimestampMicrosecondArray, + Array, ArrayRef, BooleanArray, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, + Int8Array, LargeStringArray, RecordBatch, StringArray, UInt16Array, UInt32Array, UInt64Array, + UInt8Array, }; -use datafusion::arrow::compute::{cast_with_options, CastOptions}; +use datafusion::arrow::compute::{cast, cast_with_options, CastOptions}; use datafusion::arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; use datafusion::common::scalar::ScalarValue; use datafusion::common::Result as DataFusionResult; -use delta_kernel::engine::arrow_conversion::TryIntoKernel as _; -use delta_kernel::expressions::{Scalar, StructData}; -use delta_kernel::schema::{DataType, PrimitiveType, StructField}; use percent_encoding::{utf8_percent_encode, AsciiSet, NON_ALPHANUMERIC}; use serde_json::Value; -use crate::kernel::{DeltaResult as DeltaResultLocal, DeltaTableError}; +use crate::spec::{DeltaError as DeltaTableError, DeltaResult as DeltaResultLocal, StatValue}; pub const NULL_PARTITION_VALUE_DATA_PATH: &str = "__HIVE_DEFAULT_PARTITION__"; @@ -52,145 +49,246 @@ const RFC3986_PART: &AsciiSet = &NON_ALPHANUMERIC pub struct ScalarConverter; impl ScalarConverter { - pub fn json_to_arrow_scalar_value( - stat_val: &serde_json::Value, + pub fn stat_values_to_array( + values: &[Option<&StatValue>], field_dt: &ArrowDataType, - ) -> DataFusionResult> { - match stat_val { - serde_json::Value::Array(_) | serde_json::Value::Object(_) => Ok(None), - 
serde_json::Value::Null => Ok(Some(ScalarValue::try_new_null(field_dt)?)), - serde_json::Value::String(value) => { - Ok(Some(Self::string_to_arrow_scalar_value(value, field_dt)?)) - } - other => { - let owned = other.to_string(); - Ok(Some(Self::string_to_arrow_scalar_value(&owned, field_dt)?)) - } + ) -> DataFusionResult> { + macro_rules! typed_array { + ($array_ty:ty, $extract:expr) => {{ + let mut out = Vec::with_capacity(values.len()); + for value in values { + match value { + None => out.push(None), + Some(StatValue::Null) => out.push(None), + Some(value) => { + let Some(converted) = $extract(value) else { + return Ok(None); + }; + out.push(Some(converted)); + } + } + } + Ok(Some(Arc::new(<$array_ty>::from(out)) as ArrayRef)) + }}; } - } - pub fn string_to_arrow_scalar_value( - value: &str, - field_dt: &ArrowDataType, - ) -> DataFusionResult { match field_dt { - ArrowDataType::Timestamp(_, _) => Self::parse_timestamp_str(value, field_dt), - ArrowDataType::Date32 => Self::parse_date_str(value, field_dt), - _ => ScalarValue::try_from_string(value.to_string(), field_dt), - } - } - - pub fn scalars_to_arrow_array( - field: &StructField, - values: &[Scalar], - ) -> DeltaResultLocal> { - let array: Arc = match field.data_type() { - DataType::Primitive(PrimitiveType::String) => { - Arc::new(StringArray::from_iter(values.iter().map(|v| match v { - Scalar::String(s) => Some(s.clone()), - Scalar::Null(_) => None, + ArrowDataType::Boolean => typed_array!(BooleanArray, |value: &StatValue| match value { + StatValue::Boolean(value) => Some(*value), + _ => None, + }), + ArrowDataType::Int8 => { + typed_array!(Int8Array, |value: &StatValue| match value { + StatValue::Number(value) => value.as_i64().and_then(|v| i8::try_from(v).ok()), _ => None, - }))) + }) } - DataType::Primitive(PrimitiveType::Long) => { - Arc::new(Int64Array::from_iter(values.iter().map(|v| match v { - Scalar::Long(i) => Some(*i), - Scalar::Null(_) => None, + ArrowDataType::Int16 => { + 
typed_array!(Int16Array, |value: &StatValue| match value { + StatValue::Number(value) => value.as_i64().and_then(|v| i16::try_from(v).ok()), _ => None, - }))) + }) } - DataType::Primitive(PrimitiveType::Integer) => { - Arc::new(Int32Array::from_iter(values.iter().map(|v| match v { - Scalar::Integer(i) => Some(*i), - Scalar::Null(_) => None, + ArrowDataType::Int32 => { + typed_array!(Int32Array, |value: &StatValue| match value { + StatValue::Number(value) => value.as_i64().and_then(|v| i32::try_from(v).ok()), _ => None, - }))) + }) } - DataType::Primitive(PrimitiveType::Short) => { - Arc::new(Int16Array::from_iter(values.iter().map(|v| match v { - Scalar::Short(i) => Some(*i), - Scalar::Null(_) => None, + ArrowDataType::Int64 => typed_array!(Int64Array, |value: &StatValue| match value { + StatValue::Number(value) => value.as_i64(), + _ => None, + }), + ArrowDataType::UInt8 => { + typed_array!(UInt8Array, |value: &StatValue| match value { + StatValue::Number(value) => value.as_u64().and_then(|v| u8::try_from(v).ok()), _ => None, - }))) + }) } - DataType::Primitive(PrimitiveType::Byte) => { - Arc::new(Int8Array::from_iter(values.iter().map(|v| match v { - Scalar::Byte(i) => Some(*i), - Scalar::Null(_) => None, + ArrowDataType::UInt16 => { + typed_array!(UInt16Array, |value: &StatValue| match value { + StatValue::Number(value) => value.as_u64().and_then(|v| u16::try_from(v).ok()), _ => None, - }))) + }) } - DataType::Primitive(PrimitiveType::Float) => { - Arc::new(Float32Array::from_iter(values.iter().map(|v| match v { - Scalar::Float(f) => Some(*f), - Scalar::Null(_) => None, + ArrowDataType::UInt32 => { + typed_array!(UInt32Array, |value: &StatValue| match value { + StatValue::Number(value) => value.as_u64().and_then(|v| u32::try_from(v).ok()), _ => None, - }))) + }) } - DataType::Primitive(PrimitiveType::Double) => { - Arc::new(Float64Array::from_iter(values.iter().map(|v| match v { - Scalar::Double(f) => Some(*f), - Scalar::Null(_) => None, + ArrowDataType::UInt64 
=> typed_array!(UInt64Array, |value: &StatValue| match value { + StatValue::Number(value) => value.as_u64(), + _ => None, + }), + ArrowDataType::Float32 => { + typed_array!(Float32Array, |value: &StatValue| match value { + StatValue::Number(value) => value.as_f64().map(|v| v as f32), _ => None, - }))) + }) } - DataType::Primitive(PrimitiveType::Boolean) => { - Arc::new(BooleanArray::from_iter(values.iter().map(|v| match v { - Scalar::Boolean(b) => Some(*b), - Scalar::Null(_) => None, + ArrowDataType::Float64 => typed_array!(Float64Array, |value: &StatValue| match value { + StatValue::Number(value) => value.as_f64(), + _ => None, + }), + ArrowDataType::Utf8 => typed_array!(StringArray, |value: &StatValue| match value { + StatValue::String(value) => Some(value.clone()), + _ => None, + }), + ArrowDataType::LargeUtf8 => { + typed_array!(LargeStringArray, |value: &StatValue| match value { + StatValue::String(value) => Some(value.clone()), _ => None, - }))) - } - DataType::Primitive(PrimitiveType::Binary) => { - Arc::new(BinaryArray::from_iter(values.iter().map(|v| match v { - Scalar::Binary(b) => Some(b.clone()), - Scalar::Null(_) => None, - _ => None, - }))) - } - DataType::Primitive(PrimitiveType::Date) => { - Arc::new(Date32Array::from_iter(values.iter().map(|v| match v { - Scalar::Date(d) => Some(*d), - Scalar::Null(_) => None, - _ => None, - }))) - } - DataType::Primitive(PrimitiveType::Timestamp) => Arc::new( - TimestampMicrosecondArray::from_iter(values.iter().map(|v| match v { - Scalar::Timestamp(ts) => Some(*ts), - Scalar::Null(_) => None, - _ => None, - })) - .with_timezone("UTC"), - ), - DataType::Primitive(PrimitiveType::TimestampNtz) => Arc::new( - TimestampMicrosecondArray::from_iter(values.iter().map(|v| match v { - Scalar::TimestampNtz(ts) => Some(*ts), - Scalar::Null(_) => None, - _ => None, - })), - ), - DataType::Primitive(PrimitiveType::Decimal(decimal)) => { - let array = Decimal128Array::from_iter(values.iter().map(|v| match v { - 
Scalar::Decimal(d) => Some(d.bits()), - Scalar::Null(_) => None, - _ => None, - })); - let array = array - .with_precision_and_scale(decimal.precision(), decimal.scale() as i8) - .map_err(|e| { - DeltaTableError::generic(format!("Decimal precision error: {e}")) - })?; - Arc::new(array) - } - _ => { - return Err(DeltaTableError::generic( - "complex partition values are not supported", - )) + }) } + _ => Ok(None), + } + } + + pub fn stat_value_to_arrow_scalar_value( + stat_val: &StatValue, + field_dt: &ArrowDataType, + ) -> DataFusionResult> { + match stat_val { + StatValue::Null => Ok(Some(ScalarValue::try_new_null(field_dt)?)), + StatValue::Boolean(value) => Self::bool_to_arrow_scalar_value(*value, field_dt), + StatValue::Number(value) => Self::number_to_arrow_scalar_value(value, field_dt), + StatValue::String(value) => Self::string_json_to_arrow_scalar_value(value, field_dt), + } + } + + pub fn string_values_to_array( + values: &[Option<&str>], + field_dt: &ArrowDataType, + ) -> DataFusionResult { + let utf8_array: ArrayRef = Arc::new(StringArray::from( + values + .iter() + .map(|value| value.map(ToOwned::to_owned)) + .collect::>(), + )); + + match field_dt { + ArrowDataType::Utf8 => Ok(utf8_array), + ArrowDataType::LargeUtf8 => Ok(Arc::new(LargeStringArray::from( + values + .iter() + .map(|value| value.map(ToOwned::to_owned)) + .collect::>(), + ))), + _ => Ok(cast(&utf8_array, field_dt)?), + } + } + + pub fn json_to_arrow_scalar_value( + stat_val: &serde_json::Value, + field_dt: &ArrowDataType, + ) -> DataFusionResult> { + let Some(stat_val) = (match stat_val { + serde_json::Value::Array(_) | serde_json::Value::Object(_) => None, + serde_json::Value::Null => Some(StatValue::Null), + serde_json::Value::Bool(value) => Some(StatValue::Boolean(*value)), + serde_json::Value::Number(value) => Some(StatValue::Number(value.clone())), + serde_json::Value::String(value) => Some(StatValue::String(value.clone())), + }) else { + return Ok(None); }; + 
Self::stat_value_to_arrow_scalar_value(&stat_val, field_dt) + } - Ok(array) + fn string_json_to_arrow_scalar_value( + value: &str, + field_dt: &ArrowDataType, + ) -> DataFusionResult> { + match field_dt { + ArrowDataType::Utf8 => Ok(Some(ScalarValue::Utf8(Some(value.to_string())))), + ArrowDataType::LargeUtf8 => Ok(Some(ScalarValue::LargeUtf8(Some(value.to_string())))), + ArrowDataType::Utf8View => Ok(Some(ScalarValue::Utf8View(Some(value.to_string())))), + _ => Ok(Some(Self::string_to_arrow_scalar_value(value, field_dt)?)), + } + } + + fn bool_to_arrow_scalar_value( + value: bool, + field_dt: &ArrowDataType, + ) -> DataFusionResult> { + match field_dt { + ArrowDataType::Boolean => Ok(Some(ScalarValue::Boolean(Some(value)))), + _ => Ok(Some(Self::string_to_arrow_scalar_value( + if value { "true" } else { "false" }, + field_dt, + )?)), + } + } + + fn number_to_arrow_scalar_value( + value: &serde_json::Number, + field_dt: &ArrowDataType, + ) -> DataFusionResult> { + let scalar = match field_dt { + ArrowDataType::Int8 => value + .as_i64() + .and_then(|v| i8::try_from(v).ok()) + .map(|v| ScalarValue::Int8(Some(v))), + ArrowDataType::Int16 => value + .as_i64() + .and_then(|v| i16::try_from(v).ok()) + .map(|v| ScalarValue::Int16(Some(v))), + ArrowDataType::Int32 => value + .as_i64() + .and_then(|v| i32::try_from(v).ok()) + .map(|v| ScalarValue::Int32(Some(v))), + ArrowDataType::Int64 => value.as_i64().map(|v| ScalarValue::Int64(Some(v))), + ArrowDataType::UInt8 => value + .as_u64() + .and_then(|v| u8::try_from(v).ok()) + .map(|v| ScalarValue::UInt8(Some(v))), + ArrowDataType::UInt16 => value + .as_u64() + .and_then(|v| u16::try_from(v).ok()) + .map(|v| ScalarValue::UInt16(Some(v))), + ArrowDataType::UInt32 => value + .as_u64() + .and_then(|v| u32::try_from(v).ok()) + .map(|v| ScalarValue::UInt32(Some(v))), + ArrowDataType::UInt64 => value.as_u64().map(|v| ScalarValue::UInt64(Some(v))), + ArrowDataType::Float32 => value.as_f64().map(|v| ScalarValue::Float32(Some(v as 
f32))), + ArrowDataType::Float64 => value.as_f64().map(|v| ScalarValue::Float64(Some(v))), + _ => None, + }; + + match scalar { + Some(scalar) => Ok(Some(scalar)), + None => Ok(Some(Self::string_to_arrow_scalar_value( + &value.to_string(), + field_dt, + )?)), + } + } + + pub fn string_to_arrow_scalar_value( + value: &str, + field_dt: &ArrowDataType, + ) -> DataFusionResult { + match field_dt { + ArrowDataType::Timestamp(_, _) => Self::parse_timestamp_str(value, field_dt), + ArrowDataType::Date32 => Self::parse_date_str(value, field_dt), + _ => ScalarValue::try_from_string(value.to_string(), field_dt), + } + } + + /// Convert a column from a `RecordBatch` into a `Vec` for partition value use. + pub fn column_to_scalar_values( + batch: &RecordBatch, + col_idx: usize, + ) -> DeltaResultLocal> { + let col = batch.column(col_idx); + (0..col.len()) + .map(|i| { + ScalarValue::try_from_array(col.as_ref(), i) + .map_err(|e| DeltaTableError::generic(format!("Failed to read scalar: {e}"))) + }) + .collect() } fn parse_date_str(date_str: &str, field_dt: &ArrowDataType) -> DataFusionResult { @@ -231,32 +329,50 @@ fn encode_partition_value(value: &str) -> String { utf8_percent_encode(value, RFC3986_PART).to_string() } +/// Extension trait providing Delta-specific serialization and extraction for `ScalarValue`. pub trait ScalarExt: Sized { + /// Serialize to a partition value string (Delta log format). fn serialize(&self) -> Cow<'_, str>; + /// Serialize with percent-encoding for use in Hive partition paths. fn serialize_encoded(&self) -> String; + /// Extract a scalar from an Arrow array at the given index. fn from_array(arr: &dyn Array, index: usize) -> Option; + /// Convert to a `serde_json::Value`. 
fn to_json(&self) -> Value; } -impl ScalarExt for Scalar { +impl ScalarExt for ScalarValue { fn serialize(&self) -> Cow<'_, str> { match self { - Self::String(value) => Cow::Borrowed(value), - Self::Byte(value) => Cow::Owned(value.to_string()), - Self::Short(value) => Cow::Owned(value.to_string()), - Self::Integer(value) => Cow::Owned(value.to_string()), - Self::Long(value) => Cow::Owned(value.to_string()), - Self::Float(value) => Cow::Owned(value.to_string()), - Self::Double(value) => Cow::Owned(value.to_string()), - Self::Boolean(value) => Cow::Owned(value.to_string()), - Self::TimestampNtz(ts) | Self::Timestamp(ts) => Cow::Owned(format_timestamp(*ts)), - Self::Date(days) => Cow::Owned(format_date(*days)), - Self::Decimal(decimal) => { - Cow::Owned(serialize_decimal(decimal.bits(), decimal.scale() as i8)) + ScalarValue::Utf8(Some(v)) + | ScalarValue::LargeUtf8(Some(v)) + | ScalarValue::Utf8View(Some(v)) => Cow::Borrowed(v.as_str()), + ScalarValue::Int8(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::Int16(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::Int32(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::Int64(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::UInt8(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::UInt16(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::UInt32(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::UInt64(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::Float32(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::Float64(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::Boolean(Some(v)) => Cow::Owned(v.to_string()), + ScalarValue::TimestampMicrosecond(Some(ts), _) => Cow::Owned(format_timestamp(*ts)), + ScalarValue::Date32(Some(days)) => Cow::Owned(format_date(*days)), + ScalarValue::Decimal128(Some(bits), _, scale) => { + Cow::Owned(serialize_decimal(*bits, *scale)) + } + ScalarValue::Binary(Some(bytes)) + | ScalarValue::LargeBinary(Some(bytes)) + | ScalarValue::BinaryView(Some(bytes)) => { 
+ Cow::Owned(create_escaped_binary_string(bytes.as_slice())) + } + ScalarValue::FixedSizeBinary(_, Some(bytes)) => { + Cow::Owned(create_escaped_binary_string(bytes.as_slice())) } - Self::Binary(bytes) => Cow::Owned(create_escaped_binary_string(bytes.as_slice())), - Self::Null(_) => Cow::Borrowed("null"), - Self::Struct(_) | Self::Array(_) | Self::Map(_) => Cow::Owned(self.to_string()), + _ if self.is_null() => Cow::Borrowed("null"), + other => Cow::Owned(other.to_string()), } } @@ -271,57 +387,64 @@ impl ScalarExt for Scalar { if arr.len() <= index { return None; } - if arr.is_null(index) { - return Some(Self::Null(arr.data_type().try_into_kernel().ok()?)); - } - - ScalarValue::try_from_array(arr, index) - .ok() - .and_then(kernel_scalar_from_datafusion) + ScalarValue::try_from_array(arr, index).ok() } fn to_json(&self) -> Value { match self { - Self::String(value) => Value::String(value.to_owned()), - Self::Byte(value) => Value::Number((*value).into()), - Self::Short(value) => Value::Number((*value).into()), - Self::Integer(value) => Value::Number((*value).into()), - Self::Long(value) => Value::Number((*value).into()), - Self::Float(value) => number_from_f64(*value as f64), - Self::Double(value) => number_from_f64(*value), - Self::Boolean(value) => Value::Bool(*value), - Self::TimestampNtz(ts) | Self::Timestamp(ts) => Value::String(format_timestamp(*ts)), - Self::Date(days) => Value::String(format_date(*days)), - Self::Decimal(decimal) => { - Value::String(serialize_decimal(decimal.bits(), decimal.scale() as i8)) + ScalarValue::Utf8(Some(v)) + | ScalarValue::LargeUtf8(Some(v)) + | ScalarValue::Utf8View(Some(v)) => Value::String(v.clone()), + ScalarValue::Int8(Some(v)) => Value::Number((*v).into()), + ScalarValue::Int16(Some(v)) => Value::Number((*v).into()), + ScalarValue::Int32(Some(v)) => Value::Number((*v).into()), + ScalarValue::Int64(Some(v)) => Value::Number((*v).into()), + ScalarValue::UInt8(Some(v)) => Value::Number((*v).into()), + 
ScalarValue::UInt16(Some(v)) => Value::Number((*v).into()), + ScalarValue::UInt32(Some(v)) => Value::Number((*v).into()), + ScalarValue::UInt64(Some(v)) => Value::Number((*v).into()), + ScalarValue::Float32(Some(v)) => number_from_f64(*v as f64), + ScalarValue::Float64(Some(v)) => number_from_f64(*v), + ScalarValue::Boolean(Some(v)) => Value::Bool(*v), + ScalarValue::TimestampMicrosecond(Some(ts), _) => Value::String(format_timestamp(*ts)), + ScalarValue::Date32(Some(days)) => Value::String(format_date(*days)), + ScalarValue::Decimal128(Some(bits), _, scale) => { + Value::String(serialize_decimal(*bits, *scale)) + } + ScalarValue::Binary(Some(bytes)) + | ScalarValue::LargeBinary(Some(bytes)) + | ScalarValue::BinaryView(Some(bytes)) => { + Value::String(create_escaped_binary_string(bytes.as_slice())) } - Self::Binary(bytes) => Value::String(create_escaped_binary_string(bytes.as_slice())), - Self::Null(_) => Value::Null, - Self::Struct(data) => { - let map: serde_json::Map = data - .fields() + ScalarValue::FixedSizeBinary(_, Some(bytes)) => { + Value::String(create_escaped_binary_string(bytes.as_slice())) + } + ScalarValue::Struct(struct_array) => { + let fields = struct_array.fields(); + let map: serde_json::Map = fields .iter() - .zip(data.values().iter()) - .map(|(field, value)| (field.name.clone(), value.to_json())) + .enumerate() + .map(|(i, field)| { + let col = struct_array.column(i); + let sv = ScalarValue::try_from_array(col.as_ref(), 0) + .unwrap_or(ScalarValue::Null); + (field.name().clone(), sv.to_json()) + }) .collect(); Value::Object(map) } - Self::Array(array_data) => { - let values: Vec = array_data - .array_elements() - .iter() - .map(|value| value.to_json()) + ScalarValue::List(list_array) => { + let values: Vec = (0..list_array.len()) + .map(|i| { + ScalarValue::try_from_array(list_array.as_ref(), i) + .map(|sv| sv.to_json()) + .unwrap_or(Value::Null) + }) .collect(); Value::Array(values) } - Self::Map(map_data) => { - let map: serde_json::Map = 
map_data - .pairs() - .iter() - .map(|(key, value)| (key.to_string(), value.to_json())) - .collect(); - Value::Object(map) - } + _ if self.is_null() => Value::Null, + other => Value::String(other.to_string()), } } } @@ -383,61 +506,73 @@ fn number_from_f64(value: f64) -> Value { .unwrap_or_else(|| Value::String(value.to_string())) } -fn kernel_scalar_from_datafusion(value: ScalarValue) -> Option { - match value { - ScalarValue::Utf8(Some(v)) - | ScalarValue::LargeUtf8(Some(v)) - | ScalarValue::Utf8View(Some(v)) => Some(Scalar::String(v)), - ScalarValue::Boolean(Some(v)) => Some(Scalar::Boolean(v)), - ScalarValue::Binary(Some(bytes)) - | ScalarValue::LargeBinary(Some(bytes)) - | ScalarValue::BinaryView(Some(bytes)) - | ScalarValue::FixedSizeBinary(_, Some(bytes)) => Some(Scalar::Binary(bytes)), - ScalarValue::Int8(Some(v)) => Some(Scalar::Byte(v)), - ScalarValue::Int16(Some(v)) => Some(Scalar::Short(v)), - ScalarValue::Int32(Some(v)) => Some(Scalar::Integer(v)), - ScalarValue::Int64(Some(v)) => Some(Scalar::Long(v)), - ScalarValue::UInt8(Some(v)) => Some(Scalar::Byte(v as i8)), - ScalarValue::UInt16(Some(v)) => Some(Scalar::Short(v as i16)), - ScalarValue::UInt32(Some(v)) => Some(Scalar::Integer(v as i32)), - ScalarValue::UInt64(Some(v)) => Some(Scalar::Long(v as i64)), - ScalarValue::Float32(Some(v)) => Some(Scalar::Float(v)), - ScalarValue::Float64(Some(v)) => Some(Scalar::Double(v)), - ScalarValue::Decimal128(Some(bits), precision, scale) => { - let scale = u8::try_from(scale).ok()?; - Scalar::decimal(bits, precision, scale).ok() - } - ScalarValue::Date32(Some(days)) => Some(Scalar::Date(days)), - ScalarValue::TimestampMicrosecond(Some(value), None) => Some(Scalar::TimestampNtz(value)), - ScalarValue::TimestampMicrosecond(Some(value), Some(tz)) - if tz.eq_ignore_ascii_case("utc") => - { - Some(Scalar::Timestamp(value)) - } - ScalarValue::Struct(struct_array) => { - struct_data_from_array(struct_array.as_ref()).map(Scalar::Struct) - } - _ => None, +/// Parse a 
partition value string into a `ScalarValue` for the given Arrow data type. +/// +/// This implements Delta-specific parsing rules for partition values stored in the log. +pub fn parse_partition_value(raw: &str, field_dt: &ArrowDataType) -> DeltaResultLocal { + if raw.is_empty() || raw == NULL_PARTITION_VALUE_DATA_PATH { + return ScalarValue::try_new_null(field_dt) + .map_err(|e| DeltaTableError::generic(format!("Failed to create null scalar: {e}"))); } + ScalarConverter::string_to_arrow_scalar_value(raw, field_dt) + .map_err(|e| DeltaTableError::generic(format!("Failed to parse partition value: {e}"))) } -fn struct_data_from_array(struct_array: &array::StructArray) -> Option { - let fields = struct_array.fields(); - let columns = struct_array.columns(); +pub fn parse_optional_partition_value( + raw: Option<&str>, + field_dt: &ArrowDataType, +) -> DeltaResultLocal { + match raw { + Some(raw) => parse_partition_value(raw, field_dt), + None => ScalarValue::try_new_null(field_dt) + .map_err(|e| DeltaTableError::generic(format!("Failed to create null scalar: {e}"))), + } +} - if fields.len() != columns.len() { +/// Build a `ScalarValue` from an Arrow array at the given index, returning `None` for nulls. +/// +/// Returns `None` if the value is null or if extraction fails. +pub fn scalar_from_array_opt(arr: &dyn Array, index: usize) -> Option { + if arr.len() <= index || arr.is_null(index) { return None; } + ScalarValue::try_from_array(arr, index).ok() +} - let mut struct_fields = Vec::with_capacity(fields.len()); - let mut values = Vec::with_capacity(columns.len()); +/// Convert a `ScalarValue` to an `Arc` suitable for use as a partition column. 
+pub fn scalar_value_to_array( + value: &ScalarValue, + len: usize, +) -> DeltaResultLocal> { + value + .to_array_of_size(len) + .map_err(|e| DeltaTableError::generic(format!("Failed to convert scalar to array: {e}"))) +} + +#[cfg(test)] +mod tests { + use datafusion::arrow::datatypes::DataType as ArrowDataType; + use datafusion::common::ScalarValue; + + use super::{ + parse_optional_partition_value, parse_partition_value, NULL_PARTITION_VALUE_DATA_PATH, + }; - for (field, column) in fields.iter().zip(columns.iter()) { - let kernel_field = field.as_ref().try_into_kernel().ok()?; - let value = Scalar::from_array(column.as_ref(), 0)?; - struct_fields.push(kernel_field); - values.push(value); + #[test] + fn test_parse_partition_value_treats_hive_default_partition_as_null_for_strings() { + #[expect(clippy::expect_used)] + let value = parse_partition_value(NULL_PARTITION_VALUE_DATA_PATH, &ArrowDataType::Utf8) + .expect("partition value should parse"); + + assert_eq!(value, ScalarValue::Utf8(None)); } - StructData::try_new(struct_fields, values).ok() + #[test] + fn test_parse_optional_partition_value_none_returns_typed_null() { + #[expect(clippy::expect_used)] + let value = parse_optional_partition_value(None, &ArrowDataType::Utf8) + .expect("partition value should parse"); + + assert_eq!(value, ScalarValue::Utf8(None)); + } } diff --git a/crates/sail-delta-lake/src/datasource/actions.rs b/crates/sail-delta-lake/src/datasource/actions.rs index 1505a522ea..423352fffe 100644 --- a/crates/sail-delta-lake/src/datasource/actions.rs +++ b/crates/sail-delta-lake/src/datasource/actions.rs @@ -23,9 +23,8 @@ use datafusion::datasource::listing::PartitionedFile; use object_store::ObjectMeta; /// [Credit]: -use crate::conversion::ScalarConverter; -use crate::kernel::models::{Add, Remove}; -use crate::kernel::{DeltaResult, DeltaTableError}; +use crate::conversion::parse_optional_partition_value; +use crate::spec::{Add, DeltaError as DeltaTableError, DeltaResult, Remove}; /// Convert 
an Add action to a PartitionedFile for DataFusion scanning pub fn partitioned_file_from_action( @@ -46,13 +45,9 @@ pub fn partitioned_file_from_action( .get(physical_name) .or_else(|| action.partition_values.get(logical_name)) .and_then(|value| value.as_ref()) - .map(|value| { - ScalarConverter::string_to_arrow_scalar_value(value, field.data_type()) - .unwrap_or(ScalarValue::Null) - }) - .unwrap_or_else(|| { - ScalarValue::try_new_null(field.data_type()).unwrap_or(ScalarValue::Null) - }) + .map(|value| parse_optional_partition_value(Some(value), field.data_type())) + .unwrap_or_else(|| parse_optional_partition_value(None, field.data_type())) + .unwrap_or(ScalarValue::Null) }) .collect::>(); @@ -77,6 +72,7 @@ pub fn partitioned_file_from_action( extensions: None, range: None, statistics: None, + ordering: None, metadata_size_hint: None, }) } diff --git a/crates/sail-delta-lake/src/datasource/expressions.rs b/crates/sail-delta-lake/src/datasource/expressions.rs index 142fd7496a..7f9a479afb 100644 --- a/crates/sail-delta-lake/src/datasource/expressions.rs +++ b/crates/sail-delta-lake/src/datasource/expressions.rs @@ -22,14 +22,13 @@ use std::sync::Arc; use datafusion::catalog::Session; use datafusion::common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion::common::{Column, DFSchema, Result}; -use datafusion::logical_expr::execution_props::ExecutionProps; use datafusion::logical_expr::simplify::SimplifyContext; use datafusion::logical_expr::{BinaryExpr, Expr, Operator, TableProviderFilterPushDown}; use datafusion::optimizer::simplify_expressions::ExprSimplifier; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::expressions::Column as PhysicalColumn; -use crate::kernel::DeltaResult; +use crate::spec::DeltaResult; // [Credit]: @@ -39,8 +38,7 @@ pub fn simplify_expr( df_schema: &DFSchema, expr: Expr, ) -> Result> { - let props = ExecutionProps::new(); - let simplify_context = 
SimplifyContext::new(&props).with_schema(df_schema.clone().into()); + let simplify_context = SimplifyContext::default().with_schema(df_schema.clone().into()); let simplifier = ExprSimplifier::new(simplify_context).with_max_cycles(10); let simplified = simplifier.simplify(expr)?; diff --git a/crates/sail-delta-lake/src/datasource/mod.rs b/crates/sail-delta-lake/src/datasource/mod.rs index d5d7d9e9d7..f2000d0bae 100644 --- a/crates/sail-delta-lake/src/datasource/mod.rs +++ b/crates/sail-delta-lake/src/datasource/mod.rs @@ -28,9 +28,10 @@ use datafusion::datasource::object_store::ObjectStoreUrl; use serde::{Deserialize, Serialize}; use url::Url; -use crate::kernel::snapshot::LogDataHandler; -use crate::kernel::{DeltaResult, DeltaTableError}; -use crate::table::DeltaTableState; +use crate::kernel::snapshot::SnapshotPruningStats; +use crate::options::{default_delta_log_replay_hash_threshold, DeltaLogReplayStrategyOption}; +use crate::spec::{DeltaError as DeltaTableError, DeltaResult}; +use crate::table::DeltaSnapshot; pub const PATH_COLUMN: &str = "__sail_file_path"; pub const COMMIT_VERSION_COLUMN: &str = "_commit_version"; pub const COMMIT_TIMESTAMP_COLUMN: &str = "_commit_timestamp"; @@ -50,7 +51,7 @@ pub use expressions::{ pub use provider::DeltaTableProvider; pub use pruning::{prune_files, PruningResult}; pub use scan::build_file_scan_config; -pub use schema::{df_logical_schema, DataFusionMixins}; +pub use schema::df_logical_schema; pub(crate) fn create_object_store_url(location: &Url) -> DeltaResult { Ok(ObjectStoreUrl::parse( @@ -58,20 +59,20 @@ pub(crate) fn create_object_store_url(location: &Url) -> DeltaResult) -> Option; -} - -impl DeltaTableStateExt for DeltaTableState { - fn datafusion_table_statistics(&self, mask: Option<&[bool]>) -> Option { +impl DeltaSnapshot { + pub(crate) fn datafusion_table_statistics(&self, mask: Option<&[bool]>) -> Option { + if !self.load_config().require_files { + return None; + } if let Some(mask) = mask { - let es = 
self.snapshot(); + let files = self.files_batch().ok()?; let boolean_array = BooleanArray::from(mask.to_vec()); - let pruned_files = filter_record_batch(&es.files, &boolean_array).ok()?; - LogDataHandler::new(&pruned_files, es.table_configuration()).statistics() + let pruned_files = filter_record_batch(files, &boolean_array).ok()?; + SnapshotPruningStats::try_new(&pruned_files, self) + .ok()? + .statistics() } else { - self.snapshot().log_data().statistics() + self.pruning_stats().ok()?.statistics() } } } @@ -98,6 +99,10 @@ pub struct DeltaScanConfigBuilder { commit_version_column_name: Option, /// Column name that contains the commit timestamp. commit_timestamp_column_name: Option, + /// Strategy for log replay planning. + delta_log_replay_strategy: DeltaLogReplayStrategyOption, + /// Threshold for auto replay strategy. + delta_log_replay_hash_threshold: usize, } impl Default for DeltaScanConfigBuilder { @@ -111,6 +116,8 @@ impl Default for DeltaScanConfigBuilder { include_commit_metadata: false, commit_version_column_name: None, commit_timestamp_column_name: None, + delta_log_replay_strategy: DeltaLogReplayStrategyOption::Auto, + delta_log_replay_hash_threshold: 100, } } } @@ -141,8 +148,23 @@ impl DeltaScanConfigBuilder { self } + /// Configure replay strategy for log replay planning. + pub fn with_delta_log_replay_strategy( + mut self, + strategy: DeltaLogReplayStrategyOption, + ) -> Self { + self.delta_log_replay_strategy = strategy; + self + } + + /// Configure threshold for `Auto` replay strategy. 
+ pub fn with_delta_log_replay_hash_threshold(mut self, threshold: usize) -> Self { + self.delta_log_replay_hash_threshold = threshold; + self + } + /// Build a DeltaScanConfig and ensure no column name conflicts occur during downstream processing - pub fn build(&self, snapshot: &DeltaTableState) -> DeltaResult { + pub fn build(&self, snapshot: &DeltaSnapshot) -> DeltaResult { let file_column_name = if self.include_file_column { let input_schema = snapshot.input_schema()?; let mut column_names: HashSet<&String> = HashSet::new(); @@ -228,6 +250,8 @@ impl DeltaScanConfigBuilder { schema: self.schema.clone(), commit_version_column_name, commit_timestamp_column_name, + delta_log_replay_strategy: self.delta_log_replay_strategy, + delta_log_replay_hash_threshold: self.delta_log_replay_hash_threshold, }) } } @@ -247,4 +271,10 @@ pub struct DeltaScanConfig { pub commit_version_column_name: Option, /// Commit timestamp virtual column name. pub commit_timestamp_column_name: Option, + /// Strategy for log replay planning. + #[serde(default)] + pub delta_log_replay_strategy: DeltaLogReplayStrategyOption, + /// Threshold for `Auto` replay strategy. 
+ #[serde(default = "default_delta_log_replay_hash_threshold")] + pub delta_log_replay_hash_threshold: usize, } diff --git a/crates/sail-delta-lake/src/datasource/provider.rs b/crates/sail-delta-lake/src/datasource/provider.rs index 4644bfa078..5d2c0a9ddb 100644 --- a/crates/sail-delta-lake/src/datasource/provider.rs +++ b/crates/sail-delta-lake/src/datasource/provider.rs @@ -31,18 +31,15 @@ use datafusion::datasource::{TableProvider, TableType}; use datafusion::logical_expr::{Expr, LogicalPlan, TableProviderFilterPushDown}; use datafusion::physical_plan::ExecutionPlan; -use crate::datasource::{ - df_logical_schema, get_pushdown_filters, DeltaScanConfig, DeltaTableStateExt, -}; -use crate::kernel::models::Add; -use crate::kernel::DeltaResult; +use crate::datasource::{df_logical_schema, get_pushdown_filters, DeltaScanConfig}; use crate::physical::scan_planner::plan_delta_scan; +use crate::spec::{Add, DeltaResult}; use crate::storage::LogStoreRef; -use crate::table::DeltaTableState; +use crate::table::DeltaSnapshot; /// A Delta table provider that enables additional metadata columns to be included during the scan pub struct DeltaTableProvider { - snapshot: DeltaTableState, + snapshot: Arc, log_store: LogStoreRef, config: DeltaScanConfig, schema: Arc, @@ -65,13 +62,13 @@ impl std::fmt::Debug for DeltaTableProvider { impl DeltaTableProvider { pub fn try_new( - snapshot: DeltaTableState, + snapshot: Arc, log_store: LogStoreRef, config: DeltaScanConfig, ) -> DeltaResult { Ok(DeltaTableProvider { schema: df_logical_schema( - &snapshot, + snapshot.as_ref(), &config.file_column_name, &config.commit_version_column_name, &config.commit_timestamp_column_name, @@ -89,7 +86,7 @@ impl DeltaTableProvider { self } - pub fn snapshot(&self) -> &DeltaTableState { + pub fn snapshot(&self) -> &Arc { &self.snapshot } @@ -133,7 +130,7 @@ impl TableProvider for DeltaTableProvider { ) -> Result> { plan_delta_scan( session, - &self.snapshot, + self.snapshot.as_ref(), &self.log_store, 
&self.config, self.files.clone(), diff --git a/crates/sail-delta-lake/src/datasource/pruning.rs b/crates/sail-delta-lake/src/datasource/pruning.rs index 01321f6910..0928c10851 100644 --- a/crates/sail-delta-lake/src/datasource/pruning.rs +++ b/crates/sail-delta-lake/src/datasource/pruning.rs @@ -20,6 +20,7 @@ use std::sync::Arc; +use datafusion::arrow::array::{ArrayRef, UInt64Array}; use datafusion::arrow::datatypes::SchemaRef; use datafusion::catalog::Session; use datafusion::common::{Result, ToDFSchema}; @@ -29,13 +30,12 @@ use datafusion::physical_optimizer::pruning::PruningPredicate; use datafusion_common::pruning::PruningStatistics; use datafusion_common::scalar::ScalarValue; use datafusion_common::{Column, DataFusionError}; -use futures::TryStreamExt; -use crate::kernel::models::Add; -use crate::kernel::statistics::{ColumnCountStat, ColumnValueStat, Stats}; -use crate::kernel::DeltaResult; +use crate::conversion::{parse_optional_partition_value, ScalarConverter}; +use crate::spec::statistics::Stats; +use crate::spec::Add; use crate::storage::LogStoreRef; -use crate::table::DeltaTableState; +use crate::table::DeltaSnapshot; /// Result of file pruning operation #[derive(Debug, Clone)] @@ -46,22 +46,10 @@ pub struct PruningResult { pub pruning_mask: Option>, } -async fn collect_add_actions( - snapshot: &DeltaTableState, - log_store: &LogStoreRef, -) -> DeltaResult> { - snapshot - .snapshot() - .files(log_store.as_ref(), None) - .map_ok(|view| view.add_action()) - .try_collect() - .await -} - /// Core file pruning function that filters files based on predicates and limit pub async fn prune_files( - snapshot: &DeltaTableState, - log_store: &LogStoreRef, + snapshot: &DeltaSnapshot, + _log_store: &LogStoreRef, session: &dyn Session, filters: &[Expr], limit: Option, @@ -71,31 +59,32 @@ pub async fn prune_files( // Early return if no filters and no limit if filter_expr.is_none() && limit.is_none() { - let files = collect_add_actions(snapshot, log_store).await?; + let 
files = snapshot.adds().to_vec(); return Ok(PruningResult { files, pruning_mask: None, }); } - let log_data = snapshot.snapshot().log_data(); - let num_containers = log_data.num_containers(); + let all_files = snapshot.adds().to_vec(); + let num_containers = all_files.len(); // Apply predicate-based pruning let files_to_prune = if let Some(predicate) = &filter_expr { - // Convert logical expression to physical expression for pruning let df_schema = logical_schema.clone().to_dfschema()?; let physical_predicate = session.create_physical_expr(predicate.clone(), &df_schema)?; - + let referenced_columns = crate::datasource::collect_physical_columns(&physical_predicate); + let stats = AddStatsPruningStatistics::try_new( + logical_schema.clone(), + all_files.clone(), + referenced_columns, + )?; let pruning_predicate = PruningPredicate::try_new(physical_predicate, logical_schema)?; - pruning_predicate.prune(&log_data)? + pruning_predicate.prune(&stats)? } else { vec![true; num_containers] }; - // Collect all files and apply pruning logic - let all_files = collect_add_actions(snapshot, log_store).await?; - // Apply limit-based pruning with statistics consideration let mut pruned_without_stats = vec![]; let mut rows_collected = 0; @@ -155,12 +144,21 @@ pub(crate) fn prune_adds_by_physical_predicate( pruning_predicate.prune(&stats) } +#[derive(Debug)] +struct MaterializedColumnStats { + min_values: Option, + max_values: Option, + null_counts: Option, + row_counts: Option, +} + #[derive(Debug)] struct AddStatsPruningStatistics { table_schema: SchemaRef, adds: Vec, stats: Vec>, referenced_columns: std::collections::HashSet, + materialized_columns: std::collections::HashMap, } impl AddStatsPruningStatistics { @@ -176,12 +174,36 @@ impl AddStatsPruningStatistics { .map_err(|e| DataFusionError::External(Box::new(e)))?; stats.push(parsed); } - Ok(Self { + let mut out = Self { table_schema, adds, stats, referenced_columns, - }) + materialized_columns: Default::default(), + }; + 
out.materialize_referenced_columns(); + Ok(out) + } + + fn materialize_referenced_columns(&mut self) { + self.materialized_columns = self + .referenced_columns + .iter() + .filter_map(|name| { + let column = Column::from_name(name.clone()); + self.field_for(&column).map(|_| { + ( + name.clone(), + MaterializedColumnStats { + min_values: self.compute_min_values(&column), + max_values: self.compute_max_values(&column), + null_counts: self.compute_null_counts(&column), + row_counts: self.compute_row_counts(&column), + }, + ) + }) + }) + .collect(); } fn field_for(&self, column: &Column) -> Option> { @@ -193,69 +215,137 @@ impl AddStatsPruningStatistics { .map(Arc::new) } - fn null_scalar(dt: &datafusion::arrow::datatypes::DataType) -> ScalarValue { - ScalarValue::try_from(dt).unwrap_or_else(|_| ScalarValue::Null) + fn should_build_stats_for(&self, column: &Column) -> bool { + self.field_for(column) + .is_some_and(|field| self.referenced_columns.contains(field.name())) } - fn scalar_from_json( - dt: &datafusion::arrow::datatypes::DataType, - v: &serde_json::Value, - ) -> Option { - match v { - serde_json::Value::Null => Some(Self::null_scalar(dt)), - serde_json::Value::Bool(b) => ScalarValue::try_from_string(b.to_string(), dt).ok(), - serde_json::Value::Number(n) => ScalarValue::try_from_string(n.to_string(), dt).ok(), - serde_json::Value::String(s) => ScalarValue::try_from_string(s.clone(), dt).ok(), - other => ScalarValue::try_from_string(other.to_string(), dt).ok(), + fn build_json_stat_array( + &self, + column: &Column, + lookup: impl for<'a> Fn(&'a Stats, &'a str) -> Option<&'a crate::spec::StatValue>, + ) -> Option { + if !self.should_build_stats_for(column) { + return None; + } + + let field = self.field_for(column)?; + let name = column.name(); + if self + .adds + .iter() + .any(|add| add.partition_values.contains_key(name)) + { + return None; } + + let mut has_value = false; + let values: Vec> = self + .stats + .iter() + .map(|stats| { + let value = 
stats.as_ref().and_then(|stats| lookup(stats, name)); + has_value |= + value.is_some_and(|value| !matches!(value, crate::spec::StatValue::Null)); + value + }) + .collect(); + + if !has_value { + return None; + } + + ScalarConverter::stat_values_to_array(&values, field.data_type()) + .ok() + .flatten() } - fn scalar_from_partition_value( - dt: &datafusion::arrow::datatypes::DataType, - v: &Option, - ) -> ScalarValue { - match v { - None => Self::null_scalar(dt), - Some(s) => ScalarValue::try_from_string(s.clone(), dt).unwrap_or_else(|_| { - // If we can't parse the partition value into the target type, treat it as unknown. - Self::null_scalar(dt) - }), + fn build_count_array( + &self, + column: &Column, + value_at: impl Fn(&Add, Option<&Stats>) -> Option, + ) -> Option { + if !self.should_build_stats_for(column) { + return None; } + + let mut has_value = false; + let values: Vec> = self + .adds + .iter() + .zip(self.stats.iter()) + .map(|(add, stats)| { + let value = value_at(add, stats.as_ref()); + has_value |= value.is_some(); + value + }) + .collect(); + + has_value.then(|| Arc::new(UInt64Array::from(values)) as ArrayRef) } - fn lookup_value_stat<'a>( - map: &'a std::collections::HashMap, - name: &str, - ) -> Option<&'a serde_json::Value> { - let mut parts = name.split('.'); - let first = parts.next()?; - let mut cur = map.get(first)?; - for p in parts { - cur = cur.as_column()?.get(p)?; + fn build_partition_array(&self, column: &Column) -> Option { + if !self.should_build_stats_for(column) { + return None; } - cur.as_value() + + let field = self.field_for(column)?; + let name = column.name(); + let values: Option>> = self + .adds + .iter() + .map(|add| add.partition_values.get(name).map(|value| value.as_deref())) + .collect(); + let values = values?; + + ScalarConverter::string_values_to_array(&values, field.data_type()).ok() + } + + fn null_scalar(dt: &datafusion::arrow::datatypes::DataType) -> ScalarValue { + 
ScalarValue::try_new_null(dt).unwrap_or(ScalarValue::Null) } - fn lookup_count_stat( - map: &std::collections::HashMap, - name: &str, - ) -> Option { - let mut parts = name.split('.'); - let first = parts.next()?; - let mut cur = map.get(first)?; - for p in parts { - cur = cur.as_column()?.get(p)?; + fn coerce_scalar_to_type( + dt: &datafusion::arrow::datatypes::DataType, + value: ScalarValue, + ) -> ScalarValue { + if value.is_null() { + return Self::null_scalar(dt); + } + + if value.data_type() == *dt { + return value; } - cur.as_value() + + match value.cast_to(dt) { + Ok(casted) if !casted.is_null() => casted, + Ok(_) | Err(_) => Self::null_scalar(dt), + } + } + + fn scalar_from_partition_value( + dt: &datafusion::arrow::datatypes::DataType, + v: &Option, + ) -> ScalarValue { + parse_optional_partition_value(v.as_deref(), dt).unwrap_or_else(|_| { + // If we can't parse the partition value into the target type, treat it as unknown. + Self::null_scalar(dt) + }) } fn build_array( &self, column: &Column, + count_stat: bool, f: impl Fn(&Add, Option<&Stats>, &datafusion::arrow::datatypes::DataType) -> ScalarValue, ) -> Option { let field = self.field_for(column)?; - let dt = field.data_type(); + let field_dt = field.data_type(); + + // DataFusion expects null/row count stats as UInt64 arrays, independent of the + // corresponding column's logical data type. + let count_dt = datafusion::arrow::datatypes::DataType::UInt64; + let target_dt = if count_stat { &count_dt } else { field_dt }; // Only compute arrays for columns that are actually referenced by the predicate. This // reduces repeated stats parsing work in `PruningPredicate`. 
@@ -266,7 +356,19 @@ impl AddStatsPruningStatistics { let mut has_value = false; let mut scalars = Vec::with_capacity(self.adds.len()); for (a, s) in self.adds.iter().zip(self.stats.iter()) { - let sv = f(a, s.as_ref(), dt); + let sv = f(a, s.as_ref(), field_dt); + let sv = Self::coerce_scalar_to_type(target_dt, sv); + + if sv.data_type() == datafusion::arrow::datatypes::DataType::Null + && *target_dt != datafusion::arrow::datatypes::DataType::Null + { + return None; + } + + if !sv.is_null() && sv.data_type() != *target_dt { + return None; + } + has_value |= !sv.is_null(); scalars.push(sv); } @@ -274,70 +376,114 @@ impl AddStatsPruningStatistics { if !has_value { return None; } - ScalarValue::iter_to_array(scalars).ok() + + let array = ScalarValue::iter_to_array(scalars).ok()?; + + if array.data_type() != target_dt { + return datafusion::arrow::compute::cast(&array, target_dt).ok(); + } + + Some(array) } -} -impl PruningStatistics for AddStatsPruningStatistics { - fn min_values(&self, column: &Column) -> Option { - self.build_array(column, |a, s, dt| { + fn compute_min_values(&self, column: &Column) -> Option { + if let Some(array) = self.build_partition_array(column) { + return Some(array); + } + if let Some(array) = self.build_json_stat_array(column, |stats, name| stats.min_value(name)) + { + return Some(array); + } + + self.build_array(column, false, |a, s, dt| { let name = column.name(); if let Some(pv) = a.partition_values.get(name) { return Self::scalar_from_partition_value(dt, pv); } if let Some(s) = s { - if let Some(v) = Self::lookup_value_stat(&s.min_values, name) { - return Self::scalar_from_json(dt, v).unwrap_or_else(|| Self::null_scalar(dt)); + if let Some(v) = s.min_value(name) { + return ScalarConverter::stat_value_to_arrow_scalar_value(v, dt) + .ok() + .flatten() + .unwrap_or_else(|| Self::null_scalar(dt)); } } Self::null_scalar(dt) }) } - fn max_values(&self, column: &Column) -> Option { - self.build_array(column, |a, s, dt| { + fn 
compute_max_values(&self, column: &Column) -> Option { + if let Some(array) = self.build_partition_array(column) { + return Some(array); + } + if let Some(array) = self.build_json_stat_array(column, |stats, name| stats.max_value(name)) + { + return Some(array); + } + + self.build_array(column, false, |a, s, dt| { let name = column.name(); if let Some(pv) = a.partition_values.get(name) { return Self::scalar_from_partition_value(dt, pv); } if let Some(s) = s { - if let Some(v) = Self::lookup_value_stat(&s.max_values, name) { - return Self::scalar_from_json(dt, v).unwrap_or_else(|| Self::null_scalar(dt)); + if let Some(v) = s.max_value(name) { + return ScalarConverter::stat_value_to_arrow_scalar_value(v, dt) + .ok() + .flatten() + .unwrap_or_else(|| Self::null_scalar(dt)); } } Self::null_scalar(dt) }) } - fn num_containers(&self) -> usize { - self.adds.len() - } - - fn null_counts(&self, column: &Column) -> Option { - self.build_array(column, |a, s, _dt| { + fn compute_null_counts(&self, column: &Column) -> Option { + self.build_count_array(column, |a, s| { let name = column.name(); - // Partition columns: all rows in file share same partition value. 
if let Some(pv) = a.partition_values.get(name) { - let n = s.map(|s| s.num_records).unwrap_or(0); - let cnt = if pv.is_none() { n } else { 0 }; - return ScalarValue::UInt64(Some(cnt.max(0) as u64)); - } - if let Some(s) = s { - if let Some(v) = Self::lookup_count_stat(&s.null_count, name) { - return ScalarValue::UInt64(Some(v.max(0) as u64)); + if pv.is_none() { + return s.map(|s| s.num_records.max(0) as u64); } + return Some(0); } - ScalarValue::UInt64(None) + s.and_then(|s| s.null_count_value(name)) + .map(|v| v.max(0) as u64) }) } + fn compute_row_counts(&self, column: &Column) -> Option { + self.build_count_array(column, |_a, s| s.map(|s| s.num_records.max(0) as u64)) + } +} + +impl PruningStatistics for AddStatsPruningStatistics { + fn min_values(&self, column: &Column) -> Option { + self.materialized_columns + .get(column.name()) + .and_then(|stats| stats.min_values.clone()) + } + + fn max_values(&self, column: &Column) -> Option { + self.materialized_columns + .get(column.name()) + .and_then(|stats| stats.max_values.clone()) + } + + fn num_containers(&self) -> usize { + self.adds.len() + } + + fn null_counts(&self, column: &Column) -> Option { + self.materialized_columns + .get(column.name()) + .and_then(|stats| stats.null_counts.clone()) + } + fn row_counts(&self, column: &Column) -> Option { - self.build_array(column, |_a, s, _dt| { - let Some(s) = s else { - return ScalarValue::UInt64(None); - }; - ScalarValue::UInt64(Some(s.num_records.max(0) as u64)) - }) + self.materialized_columns + .get(column.name()) + .and_then(|stats| stats.row_counts.clone()) } fn contained( @@ -348,3 +494,137 @@ impl PruningStatistics for AddStatsPruningStatistics { None } } + +#[cfg(test)] +mod tests { + use std::collections::{HashMap, HashSet}; + use std::sync::Arc; + + use datafusion::arrow::array::UInt64Array; + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::pruning::PruningStatistics; + use datafusion_common::{Column, DataFusionError, 
Result}; + + use super::AddStatsPruningStatistics; + use crate::spec::Add; + + fn add_with_stats(stats_json: &str) -> Add { + Add { + path: "part-00000.parquet".to_string(), + partition_values: HashMap::new(), + size: 1, + modification_time: 0, + data_change: true, + stats: Some(stats_json.to_string()), + tags: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + clustering_provider: None, + commit_version: None, + commit_timestamp: None, + } + } + + fn add_with_partition_value(name: &str, value: Option<&str>) -> Add { + Add { + path: "part-00000.parquet".to_string(), + partition_values: HashMap::from([(name.to_string(), value.map(ToOwned::to_owned))]), + size: 1, + modification_time: 0, + data_change: true, + stats: None, + tags: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + clustering_provider: None, + commit_version: None, + commit_timestamp: None, + } + } + + #[test] + fn row_counts_use_uint64_for_decimal_columns() -> Result<()> { + let table_schema = Arc::new(Schema::new(vec![Field::new( + "dec_col", + DataType::Decimal128(7, 2), + true, + )])); + let adds = vec![add_with_stats(r#"{"numRecords":2382848}"#)]; + let mut referenced_columns = HashSet::new(); + referenced_columns.insert("dec_col".to_string()); + + let stats = AddStatsPruningStatistics::try_new(table_schema, adds, referenced_columns)?; + let array = stats + .row_counts(&Column::from_name("dec_col")) + .ok_or_else(|| { + DataFusionError::Internal("row count stats should be available".to_string()) + })?; + + assert_eq!(array.data_type(), &DataType::UInt64); + let values = array + .as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Internal("array should be UInt64".to_string()))?; + assert_eq!(values.value(0), 2_382_848); + Ok(()) + } + + #[test] + fn null_counts_use_uint64_for_date_columns() -> Result<()> { + let table_schema = Arc::new(Schema::new(vec![Field::new( + "date_col", + DataType::Date32, + true, + 
)])); + let adds = vec![add_with_stats( + r#"{"numRecords":10,"nullCount":{"date_col":0}}"#, + )]; + let mut referenced_columns = HashSet::new(); + referenced_columns.insert("date_col".to_string()); + + let stats = AddStatsPruningStatistics::try_new(table_schema, adds, referenced_columns)?; + let array = stats + .null_counts(&Column::from_name("date_col")) + .ok_or_else(|| { + DataFusionError::Internal("null count stats should be available".to_string()) + })?; + + assert_eq!(array.data_type(), &DataType::UInt64); + let values = array + .as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Internal("array should be UInt64".to_string()))?; + assert_eq!(values.value(0), 0); + Ok(()) + } + + #[test] + fn partition_min_values_build_arrays_without_scalar_roundtrip() -> Result<()> { + let table_schema = Arc::new(Schema::new(vec![Field::new( + "part_col", + DataType::Int32, + true, + )])); + let adds = vec![ + add_with_partition_value("part_col", Some("10")), + add_with_partition_value("part_col", Some("20")), + ]; + let referenced_columns = HashSet::from(["part_col".to_string()]); + + let stats = AddStatsPruningStatistics::try_new(table_schema, adds, referenced_columns)?; + let array = stats + .min_values(&Column::from_name("part_col")) + .ok_or_else(|| DataFusionError::Internal("partition min values missing".to_string()))?; + + assert_eq!(array.data_type(), &DataType::Int32); + let values = array + .as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Internal("array should be Int32".to_string()))?; + assert_eq!(values.value(0), 10); + assert_eq!(values.value(1), 20); + Ok(()) + } +} diff --git a/crates/sail-delta-lake/src/datasource/scan.rs b/crates/sail-delta-lake/src/datasource/scan.rs index fbe6b77eb7..9f37049afa 100644 --- a/crates/sail-delta-lake/src/datasource/scan.rs +++ b/crates/sail-delta-lake/src/datasource/scan.rs @@ -24,7 +24,7 @@ use std::sync::Arc; use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, SchemaRef}; use 
datafusion::catalog::Session; use datafusion::common::stats::{ColumnStatistics, Precision, Statistics}; -use datafusion::common::{DataFusionError, Result}; +use datafusion::common::{DataFusionError, Result, ScalarValue}; use datafusion::config::TableParquetOptions; use datafusion::datasource::listing::PartitionedFile; use datafusion::datasource::physical_plan::{ @@ -35,14 +35,13 @@ use datafusion::datasource::table_schema::TableSchema; use datafusion::physical_expr::{LexOrdering, PhysicalExpr}; use object_store::path::Path; -use crate::datasource::{ - create_object_store_url, partitioned_file_from_action, DataFusionMixins, DeltaScanConfig, - DeltaTableStateExt, -}; -use crate::kernel::models::Add; +use crate::conversion::ScalarConverter; +use crate::datasource::{create_object_store_url, partitioned_file_from_action, DeltaScanConfig}; use crate::physical_plan::DeltaPhysicalExprAdapterFactory; +use crate::schema::arrow_field_physical_name; +use crate::spec::Add; use crate::storage::LogStoreRef; -use crate::table::DeltaTableState; +use crate::table::DeltaSnapshot; /// Parameters for building file scan configuration pub struct FileScanParams<'a> { @@ -51,11 +50,26 @@ pub struct FileScanParams<'a> { pub limit: Option, pub pushdown_filter: Option>, pub sort_order: Option, + /// How to populate table-level statistics for the scan. + /// + /// This is separate from per-file statistics attached to each [`PartitionedFile`]. + pub table_stats_mode: TableStatsMode, +} + +/// Strategy for providing table-level statistics to DataFusion. +#[derive(Debug, Clone, Copy)] +pub enum TableStatsMode { + /// Use snapshot/log-derived statistics (can be expensive for large snapshots). + Snapshot, + /// Aggregate statistics only from the provided `Add` actions (chunk-local). + AddsOnly, + /// Do not compute statistics; return unknown stats. 
+ Unknown, } /// Build a FileScanConfig from pruned files and scan configuration pub fn build_file_scan_config( - snapshot: &DeltaTableState, + snapshot: &DeltaSnapshot, log_store: &LogStoreRef, files: &[Add], scan_config: &DeltaScanConfig, @@ -71,24 +85,15 @@ pub fn build_file_scan_config( let config = scan_config.clone(); let table_partition_cols = snapshot.metadata().partition_columns(); let column_mapping_mode = snapshot.effective_column_mapping_mode(); - let kernel_schema = snapshot.snapshot().schema(); - let partition_columns_mapped: Vec<(String, String)> = table_partition_cols - .iter() - .map(|logical| { - let physical = kernel_schema - .field(logical) - .map(|f| f.physical_name(column_mapping_mode).to_string()) - .unwrap_or_else(|| logical.clone()); - (logical.clone(), physical) - }) - .collect(); + let kernel_schema = snapshot.schema(); + let partition_columns_mapped = snapshot.physical_partition_columns(); let mut physical_to_logical = HashMap::new(); for field in complete_schema.fields() { let logical = field.name().clone(); let physical = kernel_schema - .field(&logical) - .map(|f| f.physical_name(column_mapping_mode).to_string()) - .unwrap_or_else(|| logical.clone()); + .field_with_name(&logical) + .map(|f| arrow_field_physical_name(f, column_mapping_mode).to_string()) + .unwrap_or_else(|_| logical.clone()); physical_to_logical.entry(physical).or_insert(logical); } @@ -98,10 +103,16 @@ pub fn build_file_scan_config( Vec, > = HashMap::new(); + // Collect per-file statistics while building `PartitionedFile`s so we can reuse them to + // produce chunk-local table statistics without re-parsing JSON. + let mut per_file_stats: Vec> = Vec::new(); + for action in files.iter() { let mut part = partitioned_file_from_action(action, &partition_columns_mapped, &complete_schema)?; - if let Some(stats) = stats_for_add(action, &file_schema, &physical_to_logical)? 
{ + let action_stats = stats_for_add(action, &file_schema, &physical_to_logical)?; + if let Some(stats) = action_stats { + per_file_stats.push(Arc::clone(&stats)); part.statistics = Some(stats); } @@ -138,12 +149,10 @@ pub fn build_file_scan_config( // Rewrite file paths with table location prefix file_groups.iter_mut().for_each(|(_, files)| { files.iter_mut().for_each(|file| { - file.object_meta.location = Path::from(format!( - "{}{}{}", - log_store.config().location.path(), - object_store::path::DELIMITER, - file.object_meta.location - )); + file.object_meta.location = rewrite_data_file_location( + Path::from(log_store.config().location.path()), + file.object_meta.location.clone(), + ); }); }); @@ -204,16 +213,37 @@ pub fn build_file_scan_config( }; let table_schema = TableSchema::new(Arc::clone(&file_schema), table_partition_cols_schema); - // Calculate table statistics + // Calculate table statistics. // - // `Statistics::column_statistics` expects the same length as the table - // schema (file schema + partition columns). If this vector is shorter, projection statistics - // can panic when encountering a `Column` referring to a partition column. - let mut stats = snapshot - .datafusion_table_statistics(params.pruning_mask) - .unwrap_or_else(|| { + // `Statistics::column_statistics` expects the same length as the table schema + // (file schema + partition columns + optional virtual columns). If this vector is shorter, + // projection statistics can panic when encountering a `Column` referring to a partition + // column. + let mut stats = match params.table_stats_mode { + TableStatsMode::Snapshot => snapshot + .datafusion_table_statistics(params.pruning_mask) + .unwrap_or_else(|| { + datafusion::common::stats::Statistics::new_unknown( + table_schema.table_schema().as_ref(), + ) + }), + TableStatsMode::AddsOnly => { + // Compute stats only for the current `files` slice to match chunked execution. 
+ // If any file is missing stats, fall back to unknown rather than mixing partial + // aggregates (which can be misleading for the optimizer). + let all_have_stats = per_file_stats.len() == files.len(); + if all_have_stats { + aggregate_table_stats_from_files(&per_file_stats) + } else { + datafusion::common::stats::Statistics::new_unknown( + table_schema.table_schema().as_ref(), + ) + } + } + TableStatsMode::Unknown => { datafusion::common::stats::Statistics::new_unknown(table_schema.table_schema().as_ref()) - }); + } + }; let expected_cols = table_schema.table_schema().fields().len(); if stats.column_statistics.len() < expected_cols { stats.column_statistics.extend( @@ -223,6 +253,9 @@ pub fn build_file_scan_config( } else if stats.column_statistics.len() > expected_cols { stats.column_statistics.truncate(expected_cols); } + + sanitize_statistics_for_schema(table_schema.table_schema(), &mut stats); + let mut parquet_source = ParquetSource::new(table_schema).with_table_parquet_options(parquet_options); @@ -266,6 +299,168 @@ pub fn build_file_scan_config( Ok(file_scan_config) } +fn aggregate_table_stats_from_files(file_stats: &[Arc]) -> Statistics { + let mut num_rows = Precision::Exact(0usize); + let mut column_statistics: Option> = None; + + for s in file_stats { + num_rows = match (num_rows, s.num_rows) { + (Precision::Exact(a), Precision::Exact(b)) => Precision::Exact(a.saturating_add(b)), + _ => Precision::Absent, + }; + + match (&mut column_statistics, s.column_statistics.as_slice()) { + (None, cols) => column_statistics = Some(cols.to_vec()), + (Some(acc), cols) => { + let n = acc.len().min(cols.len()); + for i in 0..n { + acc[i] = add_column_statistics(&acc[i], &cols[i]); + } + } + } + } + + Statistics { + num_rows, + total_byte_size: Precision::Absent, + column_statistics: column_statistics.unwrap_or_default(), + } +} + +fn add_column_statistics(a: &ColumnStatistics, b: &ColumnStatistics) -> ColumnStatistics { + ColumnStatistics { + null_count: 
a.null_count.add(&b.null_count), + max_value: merge_max_bounds(&a.max_value, &b.max_value), + min_value: merge_min_bounds(&a.min_value, &b.min_value), + sum_value: Precision::Absent, + distinct_count: a.distinct_count.add(&b.distinct_count), + byte_size: a.byte_size.add(&b.byte_size), + } +} + +fn sanitize_statistics_for_schema(schema: &SchemaRef, stats: &mut Statistics) { + for (idx, field) in schema.fields().iter().enumerate() { + if let Some(column_stats) = stats.column_statistics.get_mut(idx) { + sanitize_column_statistics_for_field(column_stats, field.name(), field.data_type()); + } + } +} + +fn sanitize_column_statistics_for_field( + column_stats: &mut ColumnStatistics, + _column_name: &str, + data_type: &ArrowDataType, +) { + column_stats.min_value = sanitize_bound_for_type(&column_stats.min_value, data_type); + column_stats.max_value = sanitize_bound_for_type(&column_stats.max_value, data_type); + + let min_type = column_stats + .min_value + .get_value() + .map(ScalarValue::data_type); + let max_type = column_stats + .max_value + .get_value() + .map(ScalarValue::data_type); + if let (Some(min_type), Some(max_type)) = (min_type, max_type) { + if min_type != max_type { + column_stats.min_value = Precision::Absent; + column_stats.max_value = Precision::Absent; + } + } +} + +fn sanitize_bound_for_type( + bound: &Precision, + data_type: &ArrowDataType, +) -> Precision { + let sanitize_value = |value: &ScalarValue| { + if value.is_null() { + return None; + } + if value.data_type() == *data_type { + return Some(value.clone()); + } + value + .cast_to(data_type) + .ok() + .filter(|casted| !casted.is_null()) + }; + + match bound { + Precision::Exact(value) => sanitize_value(value) + .map(Precision::Exact) + .unwrap_or(Precision::Absent), + Precision::Inexact(value) => sanitize_value(value) + .map(Precision::Inexact) + .unwrap_or(Precision::Absent), + Precision::Absent => Precision::Absent, + } +} + +fn merge_max_bounds( + a: &Precision, + b: &Precision, +) -> 
Precision { + if bounds_have_mismatched_types(a, b) { + Precision::Absent + } else { + a.max(b) + } +} + +fn merge_min_bounds( + a: &Precision, + b: &Precision, +) -> Precision { + if bounds_have_mismatched_types(a, b) { + Precision::Absent + } else { + a.min(b) + } +} + +fn bounds_have_mismatched_types(a: &Precision, b: &Precision) -> bool { + let lhs = match a { + Precision::Exact(v) | Precision::Inexact(v) => Some(v), + Precision::Absent => None, + }; + let rhs = match b { + Precision::Exact(v) | Precision::Inexact(v) => Some(v), + Precision::Absent => None, + }; + + match (lhs, rhs) { + (Some(lhs), Some(rhs)) => lhs.data_type() != rhs.data_type(), + _ => false, + } +} + +fn rewrite_data_file_location(table_root: Path, location: Path) -> Path { + let raw = location.as_ref(); + if looks_like_absolute_uri(raw) { + return location; + } + + Path::from(format!( + "{}{}{}", + table_root, + object_store::path::DELIMITER, + location + )) +} + +fn looks_like_absolute_uri(path: &str) -> bool { + let Some((scheme, rest)) = path.split_once(':') else { + return false; + }; + !scheme.is_empty() + && scheme + .chars() + .all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.')) + && rest.starts_with('/') +} + fn stats_for_add( action: &Add, file_schema: &SchemaRef, @@ -292,21 +487,29 @@ fn stats_for_add( for name in name_candidates { if min_value == Precision::Absent { - if let Some(value) = lookup_value_stat(&stats.min_values, name) - .and_then(|v| scalar_from_json(field.data_type(), v)) - { - min_value = Precision::Exact(value); + if let Some(value) = stats.min_value(name).and_then(|v| { + ScalarConverter::stat_value_to_arrow_scalar_value(v, field.data_type()) + .ok() + .flatten() + }) { + if !value.is_null() { + min_value = Precision::Exact(value); + } } } if max_value == Precision::Absent { - if let Some(value) = lookup_value_stat(&stats.max_values, name) - .and_then(|v| scalar_from_json(field.data_type(), v)) - { - max_value = Precision::Exact(value); + if let 
Some(value) = stats.max_value(name).and_then(|v| { + ScalarConverter::stat_value_to_arrow_scalar_value(v, field.data_type()) + .ok() + .flatten() + }) { + if !value.is_null() { + max_value = Precision::Exact(value); + } } } if null_count == Precision::Absent { - if let Some(value) = lookup_count_stat(&stats.null_count, name) { + if let Some(value) = stats.null_count_value(name) { null_count = Precision::Exact(value.max(0) as usize); } } @@ -335,50 +538,98 @@ fn stats_for_add( }))) } -fn lookup_value_stat<'a>( - map: &'a std::collections::HashMap, - name: &str, -) -> Option<&'a serde_json::Value> { - let mut parts = name.split('.'); - let first = parts.next()?; - let mut cur = map.get(first)?; - for p in parts { - cur = cur.as_column()?.get(p)?; +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::common::stats::{ColumnStatistics, Precision, Statistics}; + use datafusion::common::ScalarValue; + use object_store::path::Path; + + use super::{ + add_column_statistics, rewrite_data_file_location, sanitize_statistics_for_schema, + }; + use crate::conversion::ScalarConverter; + + #[test] + fn test_scalar_from_json_null_returns_typed_null() { + #[expect(clippy::unwrap_used)] + let value = + ScalarConverter::json_to_arrow_scalar_value(&serde_json::Value::Null, &DataType::Int64) + .unwrap(); + assert_eq!(value, Some(ScalarValue::Int64(None))); } - cur.as_value() -} -fn lookup_count_stat( - map: &std::collections::HashMap, - name: &str, -) -> Option { - let mut parts = name.split('.'); - let first = parts.next()?; - let mut cur = map.get(first)?; - for p in parts { - cur = cur.as_column()?.get(p)?; + #[test] + fn test_add_column_statistics_absents_mismatched_bounds() { + let lhs = ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Null), + min_value: Precision::Exact(ScalarValue::Null), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, 
+ byte_size: Precision::Absent, + }; + let rhs = ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(5))), + min_value: Precision::Exact(ScalarValue::Int64(Some(1))), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Absent, + }; + + let merged = add_column_statistics(&lhs, &rhs); + assert_eq!(merged.max_value, Precision::Absent); + assert_eq!(merged.min_value, Precision::Absent); } - cur.as_value() -} -fn scalar_from_json( - dt: &datafusion::arrow::datatypes::DataType, - v: &serde_json::Value, -) -> Option { - match v { - serde_json::Value::Null => Some( - datafusion::common::ScalarValue::try_from(dt) - .unwrap_or(datafusion::common::ScalarValue::Null), - ), - serde_json::Value::Bool(b) => { - datafusion::common::ScalarValue::try_from_string(b.to_string(), dt).ok() - } - serde_json::Value::Number(n) => { - datafusion::common::ScalarValue::try_from_string(n.to_string(), dt).ok() - } - serde_json::Value::String(s) => { - datafusion::common::ScalarValue::try_from_string(s.clone(), dt).ok() - } - other => datafusion::common::ScalarValue::try_from_string(other.to_string(), dt).ok(), + #[test] + fn test_sanitize_statistics_for_schema_removes_untyped_null_bounds() { + let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, true)])); + let mut stats = Statistics { + num_rows: Precision::Exact(10), + total_byte_size: Precision::Absent, + column_statistics: vec![ColumnStatistics { + null_count: Precision::Absent, + max_value: Precision::Exact(ScalarValue::Int64(Some(5))), + min_value: Precision::Exact(ScalarValue::Null), + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Absent, + }], + }; + + sanitize_statistics_for_schema(&schema, &mut stats); + + assert_eq!( + stats.column_statistics[0].max_value, + Precision::Exact(ScalarValue::Int64(Some(5))) + ); + assert_eq!(stats.column_statistics[0].min_value, Precision::Absent); 
+ } + + #[test] + fn test_rewrite_data_file_location_preserves_absolute_uri_paths() { + let table_root = Path::from("bucket/table"); + let absolute = Path::from("s3://other-bucket/path/part-000.parquet"); + + let rewritten = rewrite_data_file_location(table_root, absolute.clone()); + + assert_eq!(rewritten, absolute); + } + + #[test] + fn test_rewrite_data_file_location_prefixes_relative_paths() { + let rewritten = rewrite_data_file_location( + Path::from("bucket/table"), + Path::from("part=1/part-000.parquet"), + ); + + assert_eq!( + rewritten, + Path::from("bucket/table/part=1/part-000.parquet") + ); } } diff --git a/crates/sail-delta-lake/src/datasource/schema.rs b/crates/sail-delta-lake/src/datasource/schema.rs index de109f7a74..25df5763f2 100644 --- a/crates/sail-delta-lake/src/datasource/schema.rs +++ b/crates/sail-delta-lake/src/datasource/schema.rs @@ -21,81 +21,21 @@ use std::sync::Arc; use datafusion::arrow::datatypes::{ - DataType as ArrowDataType, Field, Schema as ArrowSchema, SchemaRef, SchemaRef as ArrowSchemaRef, + DataType as ArrowDataType, Field, Schema as ArrowSchema, SchemaRef, }; -use delta_kernel::engine::arrow_conversion::TryIntoArrow; -use crate::kernel::snapshot::{EagerSnapshot, LogDataHandler, Snapshot}; -use crate::kernel::{DeltaResult, DeltaTableError}; -use crate::schema::arrow_schema_from_struct_type; -use crate::table::DeltaTableState; - -/// Convenience trait for calling common methods on snapshot hierarchies -pub trait DataFusionMixins { - /// The physical datafusion schema of a table - fn arrow_schema(&self) -> DeltaResult; - - /// Get the table schema as an [`ArrowSchemaRef`] - fn input_schema(&self) -> DeltaResult; -} - -impl DataFusionMixins for Snapshot { - fn arrow_schema(&self) -> DeltaResult { - arrow_schema_impl(self, true) - } - - fn input_schema(&self) -> DeltaResult { - arrow_schema_impl(self, false) - } -} - -impl DataFusionMixins for EagerSnapshot { - fn arrow_schema(&self) -> DeltaResult { - 
arrow_schema_from_struct_type(self.schema(), self.metadata().partition_columns(), true) - } - - fn input_schema(&self) -> DeltaResult { - arrow_schema_from_struct_type(self.schema(), self.metadata().partition_columns(), false) - } -} - -impl DataFusionMixins for DeltaTableState { - fn arrow_schema(&self) -> DeltaResult { - Ok(Arc::new(self.schema().try_into_arrow()?)) - } - - fn input_schema(&self) -> DeltaResult { - self.arrow_schema() - } -} - -impl DataFusionMixins for LogDataHandler<'_> { - fn arrow_schema(&self) -> DeltaResult { - unimplemented!("arrow_schema for LogDataHandler"); - } - - fn input_schema(&self) -> DeltaResult { - unimplemented!("input_schema for LogDataHandler"); - } -} - -fn arrow_schema_impl(snapshot: &Snapshot, wrap_partitions: bool) -> DeltaResult { - arrow_schema_from_struct_type( - snapshot.schema(), - snapshot.metadata().partition_columns(), - wrap_partitions, - ) -} +use crate::kernel::snapshot::DeltaSnapshot; +use crate::spec::{DeltaError as DeltaTableError, DeltaResult}; /// The logical schema for a Deltatable is different from the protocol level schema since partition /// columns must appear at the end of the schema. 
This is to align with how partition are handled /// at the physical level pub fn df_logical_schema( - snapshot: &DeltaTableState, + snapshot: &DeltaSnapshot, file_column_name: &Option, commit_version_column_name: &Option, commit_timestamp_column_name: &Option, - schema: Option, + schema: Option, ) -> DeltaResult { let input_schema = match schema { Some(schema) => schema, @@ -106,7 +46,7 @@ pub fn df_logical_schema( let mut fields: Vec> = input_schema .fields() .iter() - .filter(|f| !table_partition_cols.contains(f.name())) + .filter(|field| !table_partition_cols.contains(field.name())) .cloned() .collect(); diff --git a/crates/sail-delta-lake/src/delta_log/cleanup.rs b/crates/sail-delta-lake/src/delta_log/cleanup.rs new file mode 100644 index 0000000000..1164402ff8 --- /dev/null +++ b/crates/sail-delta-lake/src/delta_log/cleanup.rs @@ -0,0 +1,455 @@ +use std::collections::BTreeSet; +use std::sync::Arc; + +use futures::{StreamExt, TryStreamExt}; +use object_store::{ObjectMeta, ObjectStore}; +use uuid::Uuid; + +use super::{ + parse_checkpoint_version_from_location, parse_checksum_version_from_location, + parse_commit_version_from_location, resolve_version_timestamp, +}; +use crate::kernel::snapshot::DeltaSnapshot; +use crate::spec::{delta_log_root_path, DeltaResult}; +use crate::storage::LogStore; + +#[derive(Debug, Clone, Copy)] +struct LogRetentionWindow { + latest_version: i64, + cutoff_timestamp: i64, +} + +impl LogRetentionWindow { + fn new(latest_version: i64, cutoff_timestamp: i64) -> Self { + Self { + latest_version, + cutoff_timestamp, + } + } + + fn includes_commit(self, version_timestamp: i64, version: i64) -> bool { + version <= self.latest_version && version_timestamp <= self.cutoff_timestamp + } + + fn includes_checkpoint(self, version: i64) -> bool { + version <= self.latest_version + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum DeltaLogFile { + Commit(i64), + Checksum(i64), + Checkpoint(i64), +} + +impl DeltaLogFile { + fn 
from_meta(meta: &ObjectMeta) -> Option { + parse_commit_version_from_location(&meta.location) + .map(Self::Commit) + .or_else(|| parse_checksum_version_from_location(&meta.location).map(Self::Checksum)) + .or_else(|| { + parse_checkpoint_version_from_location(&meta.location).map(Self::Checkpoint) + }) + } + + fn version(self) -> i64 { + match self { + Self::Commit(version) | Self::Checksum(version) | Self::Checkpoint(version) => version, + } + } + + fn expires_before(self, retention_checkpoint_version: i64) -> bool { + self.version() < retention_checkpoint_version + } +} + +#[derive(Debug, Default)] +struct RetentionCleanupBoundary { + cutoff_commit_version: Option, + checkpoint_versions: BTreeSet, +} + +impl RetentionCleanupBoundary { + fn observe_checkpoint(&mut self, version: i64, retention: LogRetentionWindow) { + if retention.includes_checkpoint(version) { + self.checkpoint_versions.insert(version); + } + } + + fn observe_commit( + &mut self, + version: i64, + version_timestamp: i64, + retention: LogRetentionWindow, + ) { + if retention.includes_commit(version_timestamp, version) { + self.cutoff_commit_version = self.cutoff_commit_version.max(Some(version)); + } + } + + fn retention_checkpoint_version(&self) -> Option { + let cutoff_commit_version = self.cutoff_commit_version?; + self.checkpoint_versions + .range(..=cutoff_commit_version) + .next_back() + .copied() + } +} + +pub(crate) async fn cleanup_expired_delta_log_files( + table_state: &DeltaSnapshot, + log_store: &dyn LogStore, + cutoff_timestamp: i64, + operation_id: Option, +) -> DeltaResult { + let latest_version = table_state.version(); + let object_store = log_store.object_store(operation_id); + let retention = LogRetentionWindow::new(latest_version, cutoff_timestamp); + + let Some(retention_checkpoint_version) = + find_retention_checkpoint_version(table_state, log_store, object_store.clone(), retention) + .await? 
+ else { + return Ok(0); + }; + + delete_logs_before_checkpoint_version(object_store, retention_checkpoint_version).await +} + +async fn find_retention_checkpoint_version( + table_state: &DeltaSnapshot, + log_store: &dyn LogStore, + object_store: Arc, + retention: LogRetentionWindow, +) -> DeltaResult> { + let mut boundary = RetentionCleanupBoundary::default(); + let mut commit_entries = Vec::new(); + let log_path = delta_log_root_path(); + let mut log_entries = object_store.list(Some(&log_path)); + while let Some(meta) = log_entries.next().await { + let Ok(meta) = meta else { + continue; + }; + let Some(file) = DeltaLogFile::from_meta(&meta) else { + continue; + }; + match file { + DeltaLogFile::Commit(version) if version <= retention.latest_version => { + commit_entries.push((version, meta)); + } + DeltaLogFile::Checkpoint(version) => { + boundary.observe_checkpoint(version, retention); + } + _ => {} + } + } + commit_entries.sort_by_key(|(version, _)| *version); + + for (version, _) in commit_entries { + let version_timestamp = resolve_version_timestamp( + log_store, + version, + table_state.version_timestamp(version), + table_state.protocol(), + table_state.metadata(), + ) + .await?; + boundary.observe_commit(version, version_timestamp, retention); + } + + Ok(boundary.retention_checkpoint_version()) +} + +async fn delete_logs_before_checkpoint_version( + object_store: Arc, + retention_checkpoint_version: i64, +) -> DeltaResult { + let log_path = delta_log_root_path(); + let locations = object_store + .list(Some(&log_path)) + .filter_map(move |meta| async move { + let meta = meta.ok()?; + expired_log_location(&meta, retention_checkpoint_version).map(Ok) + }) + .boxed(); + + Ok(object_store + .delete_stream(locations) + .try_collect::>() + .await? 
+ .len()) +} + +fn expired_log_location( + meta: &ObjectMeta, + retention_checkpoint_version: i64, +) -> Option { + DeltaLogFile::from_meta(meta) + .filter(|file| file.expires_before(retention_checkpoint_version)) + .map(|_| meta.location.clone()) +} + +#[cfg(test)] +#[expect(clippy::unwrap_used)] +mod tests { + use std::sync::Arc; + + use futures::TryStreamExt; + use object_store::memory::InMemory; + use object_store::path::Path; + use object_store::{ObjectStore, ObjectStoreExt}; + use url::Url; + + use super::*; + use crate::kernel::snapshot::DeltaSnapshot; + use crate::spec::{ + checkpoint_path, checksum_path, commit_path, Action, CommitInfo, DataType, Metadata, + Protocol, StructField, StructType, TableFeature, VersionChecksum, + }; + use crate::storage::{default_logstore, LogStoreRef, StorageConfig}; + + fn test_log_store(store: Arc) -> LogStoreRef { + default_logstore( + store.clone(), + store, + &Url::parse("memory:///").unwrap(), + &StorageConfig, + ) + } + + async fn put_log_file(store: &Arc, path: Path) { + store.put(&path, b"{}".to_vec().into()).await.unwrap(); + } + + fn test_metadata( + configuration: impl IntoIterator, + ) -> Metadata { + Metadata::try_new( + None, + None, + StructType::try_new([StructField::not_null("id", DataType::LONG)]).unwrap(), + Vec::new(), + 0, + configuration + .into_iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect(), + ) + .unwrap() + } + + async fn put_commit(store: &Arc, version: i64, actions: &[Action]) { + let mut bytes = Vec::new(); + for (index, action) in actions.iter().enumerate() { + if index > 0 { + bytes.push(b'\n'); + } + serde_json::to_writer(&mut bytes, action).unwrap(); + } + store + .put(&commit_path(version), bytes.into()) + .await + .unwrap(); + } + + async fn put_checksum( + store: &Arc, + version: i64, + protocol: &Protocol, + metadata: &Metadata, + in_commit_timestamp_opt: Option, + ) { + let checksum = VersionChecksum { + txn_id: None, + table_size_bytes: 0, + num_files: 0, 
+ num_metadata: 1, + num_protocol: 1, + in_commit_timestamp_opt, + set_transactions: None, + domain_metadata: None, + metadata: metadata.clone(), + protocol: protocol.clone(), + file_size_histogram: None, + all_files: None, + }; + store + .put( + &checksum_path(version), + serde_json::to_vec(&checksum).unwrap().into(), + ) + .await + .unwrap(); + } + + async fn load_snapshot(log_store: &LogStoreRef, version: i64) -> Arc { + Arc::new( + DeltaSnapshot::try_new(log_store.as_ref(), Default::default(), Some(version), None) + .await + .unwrap(), + ) + } + + async fn list_log_file_paths(store: &Arc) -> Vec { + let mut paths = store + .list(Some(&delta_log_root_path())) + .map_ok(|meta| meta.location.as_ref().to_string()) + .try_collect::>() + .await + .unwrap(); + paths.sort(); + paths + } + + #[tokio::test] + async fn cleanup_expired_delta_log_files_deletes_entries_before_retention_checkpoint() { + let store: Arc = Arc::new(InMemory::new()); + let protocol = Protocol::new(1, 2, None, None); + let metadata = test_metadata([]); + put_commit( + &store, + 0, + &[ + Action::CommitInfo(CommitInfo::default()), + Action::Protocol(protocol.clone()), + Action::Metadata(metadata.clone()), + ], + ) + .await; + put_commit(&store, 1, &[Action::CommitInfo(CommitInfo::default())]).await; + put_commit(&store, 2, &[Action::CommitInfo(CommitInfo::default())]).await; + put_commit(&store, 3, &[Action::CommitInfo(CommitInfo::default())]).await; + + let log_store = test_log_store(store.clone()); + let snapshot = load_snapshot(&log_store, 3).await; + + put_checksum(&store, 1, &protocol, &metadata, None).await; + put_log_file(&store, checkpoint_path(1)).await; + put_log_file(&store, checkpoint_path(2)).await; + + let deleted = + cleanup_expired_delta_log_files(snapshot.as_ref(), log_store.as_ref(), i64::MAX, None) + .await + .unwrap(); + + assert_eq!(deleted, 4); + assert_eq!( + list_log_file_paths(&store).await, + vec![ + "_delta_log/00000000000000000002.checkpoint.parquet".to_string(), + 
"_delta_log/00000000000000000002.json".to_string(), + "_delta_log/00000000000000000003.json".to_string(), + ] + ); + } + + #[tokio::test] + async fn cleanup_expired_delta_log_files_skips_when_no_checkpoint_is_eligible() { + let store: Arc = Arc::new(InMemory::new()); + let protocol = Protocol::new(1, 2, None, None); + let metadata = test_metadata([]); + put_commit( + &store, + 0, + &[ + Action::CommitInfo(CommitInfo::default()), + Action::Protocol(protocol), + Action::Metadata(metadata), + ], + ) + .await; + put_commit(&store, 1, &[Action::CommitInfo(CommitInfo::default())]).await; + + let log_store = test_log_store(store.clone()); + let snapshot = load_snapshot(&log_store, 1).await; + + put_log_file(&store, checkpoint_path(2)).await; + put_commit(&store, 2, &[Action::CommitInfo(CommitInfo::default())]).await; + + let deleted = + cleanup_expired_delta_log_files(snapshot.as_ref(), log_store.as_ref(), i64::MAX, None) + .await + .unwrap(); + + assert_eq!(deleted, 0); + assert_eq!( + list_log_file_paths(&store).await, + vec![ + "_delta_log/00000000000000000000.json".to_string(), + "_delta_log/00000000000000000001.json".to_string(), + "_delta_log/00000000000000000002.checkpoint.parquet".to_string(), + "_delta_log/00000000000000000002.json".to_string(), + ] + ); + } + + #[tokio::test] + async fn cleanup_expired_delta_log_files_uses_ict_cutoff_instead_of_object_mtime() { + let store: Arc = Arc::new(InMemory::new()); + let protocol = Protocol::new(1, 7, None, Some(vec![TableFeature::InCommitTimestamp])); + let metadata = test_metadata([("delta.enableInCommitTimestamps", "true")]); + put_commit( + &store, + 0, + &[ + Action::CommitInfo(CommitInfo { + in_commit_timestamp: Some(100), + ..Default::default() + }), + Action::Protocol(protocol.clone()), + Action::Metadata(metadata.clone()), + ], + ) + .await; + put_commit( + &store, + 1, + &[Action::CommitInfo(CommitInfo { + in_commit_timestamp: Some(200), + ..Default::default() + })], + ) + .await; + put_commit( + &store, + 2, + 
&[Action::CommitInfo(CommitInfo { + in_commit_timestamp: Some(300), + ..Default::default() + })], + ) + .await; + put_commit( + &store, + 3, + &[Action::CommitInfo(CommitInfo { + in_commit_timestamp: Some(400), + ..Default::default() + })], + ) + .await; + + let log_store = test_log_store(store.clone()); + let snapshot = load_snapshot(&log_store, 3).await; + + put_checksum(&store, 1, &protocol, &metadata, Some(200)).await; + put_log_file(&store, checkpoint_path(1)).await; + put_log_file(&store, checkpoint_path(2)).await; + + let deleted = + cleanup_expired_delta_log_files(snapshot.as_ref(), log_store.as_ref(), 350, None) + .await + .unwrap(); + + assert_eq!(deleted, 4); + assert_eq!( + list_log_file_paths(&store).await, + vec![ + "_delta_log/00000000000000000002.checkpoint.parquet".to_string(), + "_delta_log/00000000000000000002.json".to_string(), + "_delta_log/00000000000000000003.json".to_string(), + ] + ); + } +} diff --git a/crates/sail-delta-lake/src/delta_log/listing.rs b/crates/sail-delta-lake/src/delta_log/listing.rs new file mode 100644 index 0000000000..4a91693a8d --- /dev/null +++ b/crates/sail-delta-lake/src/delta_log/listing.rs @@ -0,0 +1,186 @@ +use std::sync::Arc; + +use futures::TryStreamExt; +use object_store::path::Path; +use object_store::{Error as ObjectStoreError, ObjectMeta, ObjectStore, ObjectStoreExt}; + +use crate::spec::{ + delta_log_prefix_path, delta_log_root_path, last_checkpoint_path, parse_checkpoint_version, + parse_checksum_version, parse_commit_version, DeltaResult, LastCheckpointHint, +}; + +pub(crate) fn parse_delta_log_entry_version(meta: &ObjectMeta) -> Option { + let filename = meta.location.as_ref().rsplit('/').next()?; + parse_commit_version(filename).or_else(|| parse_checkpoint_version(filename)) +} + +pub(crate) fn parse_checksum_version_from_location(location: &Path) -> Option { + location + .as_ref() + .rsplit('/') + .next() + .and_then(parse_checksum_version) +} + +pub(crate) fn 
parse_commit_version_from_location(location: &Path) -> Option { + location + .as_ref() + .rsplit('/') + .next() + .and_then(parse_commit_version) +} + +pub(crate) fn parse_checkpoint_version_from_location(location: &Path) -> Option { + location + .as_ref() + .rsplit('/') + .next() + .and_then(parse_checkpoint_version) +} + +pub(crate) async fn read_last_checkpoint_version_from_store( + store: Arc, +) -> Option { + let bytes = store + .get(&last_checkpoint_path()) + .await + .ok()? + .bytes() + .await + .ok()?; + let hint: LastCheckpointHint = serde_json::from_slice(&bytes).ok()?; + Some(hint.version) +} + +pub(crate) async fn list_delta_log_entries_from( + store: Arc, + offset_version: i64, +) -> DeltaResult> { + // `delta_log_prefix_path(version)` is a prefix, not a concrete filename, so + // files for `version` still compare greater than the offset. + let log_path = delta_log_root_path(); + let offset = delta_log_prefix_path(offset_version); + let entries = match store + .list_with_offset(Some(&log_path), &offset) + .try_collect::>() + .await + { + Ok(entries) => entries, + Err(ObjectStoreError::NotSupported { .. } | ObjectStoreError::NotImplemented { .. }) => { + // TODO: Apply the same `location > offset` filter here if needed for the specific store implementation. + store.list(Some(&log_path)).try_collect::>().await? 
+ } + Err(err) => return Err(err.into()), + }; + Ok(entries) +} + +pub(crate) async fn latest_version_from_listing( + store: Arc, +) -> DeltaResult> { + let offset_version = read_last_checkpoint_version_from_store(store.clone()) + .await + .map(|v| v.saturating_sub(1)) + .unwrap_or(0); + let entries = list_delta_log_entries_from(store, offset_version).await?; + + let mut max_version: Option = None; + for meta in entries { + if let Some(version) = parse_delta_log_entry_version(&meta) { + max_version = Some(max_version.map_or(version, |curr| curr.max(version))); + } + } + Ok(max_version) +} + +#[cfg(test)] +#[expect(clippy::unwrap_used)] +mod tests { + use object_store::memory::InMemory; + + use super::*; + + #[tokio::test] + async fn latest_version_from_listing_works_without_last_checkpoint_hint() { + let store: Arc = Arc::new(InMemory::new()); + store + .put( + &Path::from("_delta_log/00000000000000000007.json"), + b"{}".to_vec().into(), + ) + .await + .unwrap(); + + assert_eq!(latest_version_from_listing(store).await.unwrap(), Some(7)); + } + + #[tokio::test] + async fn latest_version_from_listing_uses_checkpoint_when_commits_are_pruned() { + let store: Arc = Arc::new(InMemory::new()); + let hint = serde_json::to_vec(&LastCheckpointHint { + version: 20, + ..Default::default() + }) + .unwrap(); + + store + .put(&Path::from("_delta_log/_last_checkpoint"), hint.into()) + .await + .unwrap(); + store + .put( + &Path::from("_delta_log/00000000000000000020.checkpoint.parquet"), + b"parquet".to_vec().into(), + ) + .await + .unwrap(); + + assert_eq!(latest_version_from_listing(store).await.unwrap(), Some(20)); + } + + #[tokio::test] + async fn latest_version_from_listing_finds_commits_newer_than_last_checkpoint() { + let store: Arc = Arc::new(InMemory::new()); + let hint = serde_json::to_vec(&LastCheckpointHint { + version: 20, + ..Default::default() + }) + .unwrap(); + + store + .put(&Path::from("_delta_log/_last_checkpoint"), hint.into()) + .await + .unwrap(); + store + 
.put( + &Path::from("_delta_log/00000000000000000020.checkpoint.parquet"), + b"parquet".to_vec().into(), + ) + .await + .unwrap(); + store + .put( + &Path::from("_delta_log/00000000000000000021.json"), + b"{}".to_vec().into(), + ) + .await + .unwrap(); + + assert_eq!(latest_version_from_listing(store).await.unwrap(), Some(21)); + } + + #[tokio::test] + async fn list_delta_log_entries_from_keeps_empty_results_empty() { + let store: Arc = Arc::new(InMemory::new()); + store + .put( + &Path::from("_delta_log/00000000000000000020.checkpoint.parquet"), + b"parquet".to_vec().into(), + ) + .await + .unwrap(); + + let entries = list_delta_log_entries_from(store, 21).await.unwrap(); + assert!(entries.is_empty()); + } +} diff --git a/crates/sail-delta-lake/src/delta_log/mod.rs b/crates/sail-delta-lake/src/delta_log/mod.rs new file mode 100644 index 0000000000..654c2d4b15 --- /dev/null +++ b/crates/sail-delta-lake/src/delta_log/mod.rs @@ -0,0 +1,21 @@ +pub(crate) mod cleanup; +mod listing; +mod replay; +mod segment; +mod timestamps; + +pub(crate) use listing::{ + latest_version_from_listing, list_delta_log_entries_from, + parse_checkpoint_version_from_location, parse_checksum_version_from_location, + parse_commit_version_from_location, read_last_checkpoint_version_from_store, +}; +pub(crate) use replay::{ + latest_replayable_version, load_replayed_table_header, load_replayed_table_state, +}; +pub(crate) use segment::{ + list_log_files, LogSegmentResolver, ReplayedTableHeader, ResolvedLogSegment, +}; +pub(crate) use timestamps::{ + resolve_commit_timestamp_from_actions, resolve_effective_protocol_and_metadata, + resolve_version_timestamp, +}; diff --git a/crates/sail-delta-lake/src/delta_log/replay.rs b/crates/sail-delta-lake/src/delta_log/replay.rs new file mode 100644 index 0000000000..21f6eb1c50 --- /dev/null +++ b/crates/sail-delta-lake/src/delta_log/replay.rs @@ -0,0 +1,227 @@ +use std::collections::BTreeMap; +use std::sync::Arc; + +use log::debug; +use 
object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; +use parquet::arrow::ProjectionMask; + +use super::{ + list_delta_log_entries_from, parse_checkpoint_version_from_location, + parse_commit_version_from_location, read_last_checkpoint_version_from_store, + LogSegmentResolver, ReplayedTableHeader, ResolvedLogSegment, +}; +use crate::kernel::checkpoints::{ + decode_checkpoint_rows, read_checkpoint_rows_from_parquet, replay_commit_actions, + replay_commit_header_actions, ReconciledCheckpointState, ReconciledHeaderState, + ReplayedTableState, +}; +use crate::spec::{CheckpointActionRow, DeltaError as DeltaTableError, DeltaResult}; +use crate::storage::LogStore; + +async fn read_checkpoint_header_from_parquet( + root_store: Arc, + meta: ObjectMeta, +) -> DeltaResult { + let bytes = root_store.get(&meta.location).await?.bytes().await?; + tokio::task::spawn_blocking(move || { + let builder = ParquetRecordBatchReaderBuilder::try_new(bytes) + .map_err(DeltaTableError::generic_err)?; + + let parquet_schema = builder.parquet_schema(); + let mask = ProjectionMask::columns(parquet_schema, ["metaData", "protocol", "txn"]); + + let mut batches = builder + .with_projection(mask) + .build() + .map_err(DeltaTableError::generic_err)?; + + let mut state = ReconciledHeaderState::default(); + for batch_result in &mut batches { + let batch = batch_result.map_err(DeltaTableError::generic_err)?; + let rows: Vec = decode_checkpoint_rows(&batch)?; + for row in rows { + state.apply_checkpoint_row(row); + } + } + Ok::<_, DeltaTableError>(state) + }) + .await + .map_err(DeltaTableError::generic_err)? 
+} + +pub(crate) async fn load_replayed_table_state( + version: i64, + log_store: &dyn LogStore, +) -> DeltaResult { + if version < 0 { + return Err(DeltaTableError::generic(format!( + "Cannot load table state for negative version: {version}" + ))); + } + + let segment = LogSegmentResolver::new(log_store, version, None) + .resolve_for_full_state() + .await?; + + let ResolvedLogSegment::FullReplay { + checkpoint, + commit_files, + target_version, + } = segment + else { + return Err(DeltaTableError::generic( + "resolve_for_full_state must return FullReplay", + )); + }; + + let store = log_store.object_store(None); + let mut state = ReconciledCheckpointState::default(); + let start_commit_version = if let Some(cp_meta) = checkpoint { + let rows = read_checkpoint_rows_from_parquet(store.clone(), cp_meta).await?; + for row in rows { + state.apply_checkpoint_row(row)?; + } + commit_files + .first() + .map(|(v, _)| *v) + .unwrap_or(target_version.saturating_add(1)) + } else { + 0 + }; + + let commit_timestamps = replay_commit_actions( + &mut state, + store, + &commit_files, + start_commit_version, + target_version, + ) + .await?; + + let protocol = state + .protocol + .ok_or_else(|| DeltaTableError::generic("Cannot load table state without protocol"))?; + let metadata = state + .metadata + .ok_or_else(|| DeltaTableError::generic("Cannot load table state without metadata"))?; + let txns = state.txns; + let adds = state + .adds + .into_iter() + .collect::>() + .into_values() + .collect::>(); + let removes = state + .removes + .into_iter() + .collect::>() + .into_values() + .collect::>(); + Ok(ReplayedTableState { + version: target_version, + protocol, + metadata, + txns, + adds, + removes, + commit_timestamps, + }) +} + +pub(crate) async fn load_replayed_table_header( + version: i64, + log_store: &dyn LogStore, + replay_hint: Option<&ReplayedTableHeader>, +) -> DeltaResult> { + if version < 0 { + return Err(DeltaTableError::generic(format!( + "Cannot load table header for 
negative version: {version}" + ))); + } + + let segment = LogSegmentResolver::new(log_store, version, replay_hint) + .resolve_for_header() + .await?; + + match segment { + ResolvedLogSegment::ExactChecksum { header } => Ok(Some(header)), + ResolvedLogSegment::Incremental { + base, + checkpoint, + commit_files, + target_version, + } => { + let store = log_store.object_store(None); + + let (mut state, start_commit_version, mut commit_timestamps) = match checkpoint { + Some(cp_meta) => { + let cp_state = + read_checkpoint_header_from_parquet(store.clone(), cp_meta).await?; + let next_v = commit_files + .first() + .map(|(v, _)| *v) + .unwrap_or(target_version.saturating_add(1)); + (cp_state, next_v, BTreeMap::new()) + } + None => { + let start = base.version.saturating_add(1); + let ts = Arc::unwrap_or_clone(base.commit_timestamps.clone()); + (ReconciledHeaderState::from_header(&base), start, ts) + } + }; + + if start_commit_version <= target_version { + commit_timestamps.extend( + replay_commit_header_actions( + &mut state, + store, + &commit_files, + start_commit_version, + target_version, + ) + .await?, + ); + } + + let protocol = state.protocol.ok_or_else(|| { + DeltaTableError::generic("Cannot load table header without protocol") + })?; + let metadata = state.metadata.ok_or_else(|| { + DeltaTableError::generic("Cannot load table header without metadata") + })?; + Ok(Some(ReplayedTableHeader { + version: target_version, + protocol, + metadata, + txns: Arc::new(state.txns), + commit_timestamps: Arc::new(commit_timestamps), + })) + } + ResolvedLogSegment::FullReplay { .. 
} => { + debug!( + "crc-header: no usable base state, returning None for header fast path target_version={version}" + ); + Ok(None) + } + } +} + +pub(crate) async fn latest_replayable_version(log_store: &dyn LogStore) -> DeltaResult { + let store = log_store.object_store(None); + let offset_version = read_last_checkpoint_version_from_store(store.clone()) + .await + .map(|v| v.saturating_sub(1)) + .unwrap_or(0); + let log_entries = list_delta_log_entries_from(store, offset_version).await?; + + let latest = log_entries + .iter() + .filter_map(|meta| { + parse_commit_version_from_location(&meta.location) + .or_else(|| parse_checkpoint_version_from_location(&meta.location)) + }) + .max(); + + latest.ok_or(crate::spec::DeltaError::MissingVersion) +} diff --git a/crates/sail-delta-lake/src/delta_log/segment.rs b/crates/sail-delta-lake/src/delta_log/segment.rs new file mode 100644 index 0000000000..d66ac4ebf9 --- /dev/null +++ b/crates/sail-delta-lake/src/delta_log/segment.rs @@ -0,0 +1,621 @@ +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; + +use log::debug; +use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; + +use super::timestamps::version_uses_in_commit_timestamps; +use super::{list_delta_log_entries_from, read_last_checkpoint_version_from_store}; +use crate::spec::{ + checksum_path, parse_checkpoint_version, parse_checksum_version, parse_commit_version, + DeltaError, DeltaResult, Metadata, Protocol, Transaction, VersionChecksum, +}; +use crate::storage::LogStore; + +const CHECKSUM_LOOKBACK_WINDOW: i64 = 100; + +#[derive(Debug, Clone)] +pub(crate) struct ReplayedTableHeader { + pub version: i64, + pub protocol: Protocol, + pub metadata: Metadata, + pub txns: Arc>, + pub commit_timestamps: Arc>, +} + +#[derive(Debug)] +pub(crate) enum ResolvedLogSegment { + ExactChecksum { + header: ReplayedTableHeader, + }, + Incremental { + base: ReplayedTableHeader, + checkpoint: Option, + commit_files: Vec<(i64, ObjectMeta)>, + target_version: i64, + }, 
+ FullReplay { + checkpoint: Option, + commit_files: Vec<(i64, ObjectMeta)>, + target_version: i64, + }, +} + +pub(crate) struct LogSegmentResolver<'a> { + log_store: &'a dyn LogStore, + target_version: i64, + replay_hint: Option<&'a ReplayedTableHeader>, +} + +impl<'a> LogSegmentResolver<'a> { + pub(crate) fn new( + log_store: &'a dyn LogStore, + target_version: i64, + replay_hint: Option<&'a ReplayedTableHeader>, + ) -> Self { + Self { + log_store, + target_version, + replay_hint, + } + } + + pub(crate) async fn resolve_for_header(&self) -> DeltaResult { + let version = self.target_version; + + let store = self.log_store.object_store(None); + if let Some(header) = try_read_checksum_header(store.clone(), version).await { + debug!("crc-header: exact checksum hit target_version={version}"); + return Ok(ResolvedLogSegment::ExactChecksum { header }); + } + + let lower_bound = { + let lookback = version.saturating_sub(CHECKSUM_LOOKBACK_WINDOW); + let hint_floor = self + .replay_hint + .map(|h| h.version.saturating_add(1)) + .unwrap_or(0); + lookback.max(hint_floor) + }; + + let last_cp_hint_version = read_last_checkpoint_version_from_store(store.clone()) + .await + .map(|v| v.min(version).saturating_sub(1)) + .unwrap_or(0); + let list_offset = lower_bound.min(last_cp_hint_version); + + let (checksum_candidates, checkpoint_candidate, all_commits) = + list_log_files(store.clone(), list_offset, version).await?; + + let mut older_crc_base: Option = None; + for (crc_version, meta) in &checksum_candidates { + if *crc_version < lower_bound || *crc_version >= version { + continue; + } + let bytes = match store.get(&meta.location).await { + Ok(r) => match r.bytes().await { + Ok(b) => b, + Err(err) => { + debug!( + "crc-header: failed to read older checksum at version {crc_version}: {err}" + ); + continue; + } + }, + Err(object_store::Error::NotFound { .. 
}) => continue, + Err(err) => { + debug!( + "crc-header: failed to fetch older checksum at version {crc_version}: {err}" + ); + continue; + } + }; + let checksum: VersionChecksum = match serde_json::from_slice(&bytes) { + Ok(c) => c, + Err(err) => { + debug!( + "crc-header: failed to deserialize older checksum at version {crc_version}: {err}" + ); + continue; + } + }; + if let Some(header) = validate_and_build_header(*crc_version, checksum) { + older_crc_base = Some(header); + break; + } + } + + if let Some(ref base) = older_crc_base { + debug!( + "crc-header: older checksum hint hit target_version={version}, checksum_version={}", + base.version + ); + } + + let used_older_crc = older_crc_base.is_some(); + let base = match older_crc_base.or_else(|| self.replay_hint.cloned()) { + Some(b) => { + if !used_older_crc { + debug!( + "crc-header: reused snapshot hint target_version={version}, hint_version={}", + b.version + ); + } + b + } + None => { + debug!("crc-header: no usable checksum or replay hint target_version={version}"); + let start_version = match &checkpoint_candidate { + Some(cp_meta) => cp_meta_version(cp_meta)?.saturating_add(1), + None => 0, + }; + let commit_files: Vec<(i64, ObjectMeta)> = all_commits + .into_iter() + .filter(|(v, _)| *v >= start_version && *v <= version) + .collect(); + return Ok(ResolvedLogSegment::FullReplay { + checkpoint: checkpoint_candidate, + commit_files, + target_version: version, + }); + } + }; + + let use_checkpoint = checkpoint_candidate + .as_ref() + .map(cp_meta_version) + .transpose()? 
+ .map(|v| v > base.version) + .unwrap_or(false); + + let (checkpoint, commit_files, start_version) = if use_checkpoint { + let cp_meta = checkpoint_candidate.ok_or_else(|| { + DeltaError::generic("checkpoint_candidate was None after use_checkpoint check") + })?; + let cp_version = cp_meta_version(&cp_meta)?; + let commits: Vec<(i64, ObjectMeta)> = all_commits + .into_iter() + .filter(|(v, _)| *v > cp_version && *v <= version) + .collect(); + (Some(cp_meta), commits, cp_version.saturating_add(1)) + } else { + let start = base.version.saturating_add(1); + let commits: Vec<(i64, ObjectMeta)> = all_commits + .into_iter() + .filter(|(v, _)| *v >= start && *v <= version) + .collect(); + (None, commits, start) + }; + + validate_commit_contiguity(&commit_files, start_version, version)?; + + Ok(ResolvedLogSegment::Incremental { + base, + checkpoint, + commit_files, + target_version: version, + }) + } + + pub(crate) async fn resolve_for_full_state(&self) -> DeltaResult { + let version = self.target_version; + let store = self.log_store.object_store(None); + + let last_cp_hint_version = read_last_checkpoint_version_from_store(store.clone()) + .await + .map(|v| v.min(version).saturating_sub(1)) + .unwrap_or(0); + + let (_, checkpoint, all_commits) = + list_log_files(store, last_cp_hint_version, version).await?; + + let start_version = match &checkpoint { + Some(cp_meta) => cp_meta_version(cp_meta)?.saturating_add(1), + None => 0, + }; + let commit_files: Vec<(i64, ObjectMeta)> = all_commits + .into_iter() + .filter(|(v, _)| *v >= start_version && *v <= version) + .collect(); + validate_commit_contiguity(&commit_files, start_version, version)?; + + Ok(ResolvedLogSegment::FullReplay { + checkpoint, + commit_files, + target_version: version, + }) + } +} + +fn cp_meta_version(meta: &ObjectMeta) -> DeltaResult { + meta.location + .as_ref() + .rsplit('/') + .next() + .and_then(parse_checkpoint_version) + .ok_or_else(|| { + DeltaError::generic(format!( + "checkpoint path does not 
contain a parseable version: {}", + meta.location + )) + }) +} + +async fn try_read_checksum_header( + store: Arc, + version: i64, +) -> Option { + let path = checksum_path(version); + let bytes = store.get(&path).await.ok()?.bytes().await.ok()?; + let checksum: VersionChecksum = match serde_json::from_slice(&bytes) { + Ok(c) => c, + Err(err) => { + debug!("crc-header: failed to deserialize checksum at version {version}: {err}"); + return None; + } + }; + validate_and_build_header(version, checksum) +} + +fn validate_and_build_header( + version: i64, + checksum: VersionChecksum, +) -> Option { + if checksum.num_metadata != 1 || checksum.num_protocol != 1 { + debug!( + "crc-header: invalid checksum at version {version}: \ + num_metadata={}, num_protocol={}", + checksum.num_metadata, checksum.num_protocol + ); + return None; + } + let txns = checksum + .set_transactions + .unwrap_or_default() + .into_iter() + .map(|txn| (txn.app_id.clone(), txn)) + .collect::>(); + let commit_timestamps = + if version_uses_in_commit_timestamps(version, &checksum.protocol, &checksum.metadata) { + checksum + .in_commit_timestamp_opt + .into_iter() + .map(|timestamp| (version, timestamp)) + .collect() + } else { + BTreeMap::new() + }; + Some(ReplayedTableHeader { + version, + protocol: checksum.protocol, + metadata: checksum.metadata, + txns: Arc::new(txns), + commit_timestamps: Arc::new(commit_timestamps), + }) +} + +pub(crate) async fn list_log_files( + store: Arc, + list_offset_version: i64, + max_version: i64, +) -> DeltaResult<( + Vec<(i64, ObjectMeta)>, + Option, + Vec<(i64, ObjectMeta)>, +)> { + let entries = list_delta_log_entries_from(store, list_offset_version).await?; + + let mut checkpoint_candidates: Vec<(i64, ObjectMeta)> = Vec::new(); + let mut commit_candidates: Vec<(i64, ObjectMeta)> = Vec::new(); + let mut checksum_candidates: Vec<(i64, ObjectMeta)> = Vec::new(); + + for meta in entries { + let filename = match meta.location.as_ref().rsplit('/').next() { + Some(f) => 
f, + None => continue, + }; + if let Some(v) = parse_checkpoint_version(filename) { + if v <= max_version { + checkpoint_candidates.push((v, meta)); + } + continue; + } + if let Some(v) = parse_commit_version(filename) { + if v <= max_version { + commit_candidates.push((v, meta)); + } + continue; + } + if let Some(v) = parse_checksum_version(filename) { + if v <= max_version { + checksum_candidates.push((v, meta)); + } + } + } + + let latest_checkpoint_version = checkpoint_candidates.iter().map(|(v, _)| *v).max(); + let checkpoint = latest_checkpoint_version.map(|latest_v| { + let mut files: Vec = checkpoint_candidates + .into_iter() + .filter_map(|(v, m)| (v == latest_v).then_some(m)) + .collect(); + files.sort_by(|a, b| a.location.as_ref().cmp(b.location.as_ref())); + files.remove(0) + }); + + commit_candidates.sort_by(|(av, _), (bv, _)| av.cmp(bv)); + checksum_candidates.sort_by(|(av, _), (bv, _)| bv.cmp(av)); + + Ok((checksum_candidates, checkpoint, commit_candidates)) +} + +fn validate_commit_contiguity( + commit_files: &[(i64, ObjectMeta)], + start_version: i64, + end_version: i64, +) -> DeltaResult<()> { + if start_version > end_version { + return Ok(()); + } + let mut expected = start_version; + for (v, _) in commit_files { + if *v < start_version || *v > end_version { + continue; + } + if *v != expected { + return Err(DeltaError::generic(format!( + "Missing commit file: expected version {expected}, found {v}" + ))); + } + expected = expected.saturating_add(1); + } + if expected.saturating_sub(1) != end_version { + return Err(DeltaError::generic(format!( + "Missing commit file: expected final version {end_version}, replay reached {}", + expected.saturating_sub(1) + ))); + } + Ok(()) +} + +#[cfg(test)] +#[expect(clippy::unwrap_used)] +mod tests { + use super::*; + use crate::spec::StructType; + + fn make_test_checksum( + num_metadata: i64, + num_protocol: i64, + in_commit_timestamp_opt: Option, + ) -> VersionChecksum { + use crate::spec::{Metadata, Protocol, 
StructType}; + + let protocol = Protocol::new(1, 2, None, None); + let metadata = Metadata::try_new( + None, + None, + StructType::try_new([]).unwrap(), + vec![], + 0, + Default::default(), + ) + .unwrap(); + + VersionChecksum { + txn_id: None, + table_size_bytes: 0, + num_files: 0, + num_metadata, + num_protocol, + in_commit_timestamp_opt, + set_transactions: None, + domain_metadata: None, + metadata, + protocol, + file_size_histogram: None, + all_files: None, + } + } + + #[test] + fn valid_checksum_builds_header() { + let checksum = make_test_checksum(1, 1, None); + let header = validate_and_build_header(42, checksum); + assert!(header.is_some()); + let h = header.unwrap(); + assert_eq!(h.version, 42); + assert!(h.txns.is_empty()); + assert!(h.commit_timestamps.is_empty()); + } + + #[test] + fn checksum_header_keeps_in_commit_timestamp() { + let mut checksum = make_test_checksum(1, 1, Some(123)); + checksum.protocol = Protocol::new( + 1, + 7, + None, + Some(vec![crate::spec::TableFeature::InCommitTimestamp]), + ); + checksum.metadata = Metadata::try_new( + None, + None, + StructType::try_new([]).unwrap(), + vec![], + 0, + HashMap::from([( + "delta.enableInCommitTimestamps".to_string(), + "true".to_string(), + )]), + ) + .unwrap(); + let header = validate_and_build_header(42, checksum); + assert!(header.is_some()); + let header = header.unwrap(); + assert_eq!(header.commit_timestamps.get(&42), Some(&123)); + } + + #[test] + fn checksum_header_ignores_pre_enable_in_commit_timestamp() { + let checksum = make_test_checksum(1, 1, Some(123)); + let header = validate_and_build_header(42, checksum); + assert!(header.is_some()); + let header = header.unwrap(); + assert!(header.commit_timestamps.is_empty()); + } + + #[test] + fn checksum_with_num_metadata_zero_is_rejected() { + let checksum = make_test_checksum(0, 1, None); + assert!(validate_and_build_header(1, checksum).is_none()); + } + + #[test] + fn checksum_with_num_protocol_zero_is_rejected() { + let checksum = 
make_test_checksum(1, 0, None); + assert!(validate_and_build_header(1, checksum).is_none()); + } + + #[test] + fn checksum_with_num_metadata_two_is_rejected() { + let checksum = make_test_checksum(2, 1, None); + assert!(validate_and_build_header(1, checksum).is_none()); + } + + #[test] + fn checksum_with_both_invalid_is_rejected() { + let checksum = make_test_checksum(0, 0, None); + assert!(validate_and_build_header(1, checksum).is_none()); + } + + fn dummy_meta(version: i64) -> ObjectMeta { + use object_store::path::Path; + + ObjectMeta { + location: Path::from(format!("_delta_log/{version:020}.json")), + last_modified: chrono::Utc::now(), + size: 0, + e_tag: None, + version: None, + } + } + + #[test] + fn contiguous_commits_pass() { + let files = vec![ + (3i64, dummy_meta(3)), + (4, dummy_meta(4)), + (5, dummy_meta(5)), + ]; + assert!(validate_commit_contiguity(&files, 3, 5).is_ok()); + } + + #[test] + fn empty_range_passes() { + let files: Vec<(i64, ObjectMeta)> = vec![]; + assert!(validate_commit_contiguity(&files, 6, 5).is_ok()); + } + + #[test] + fn gap_in_commits_fails() { + let files = vec![(3i64, dummy_meta(3)), (5, dummy_meta(5))]; + let err = validate_commit_contiguity(&files, 3, 5).unwrap_err(); + assert!(err.to_string().contains("Missing commit file")); + } + + #[test] + fn missing_final_commit_fails() { + let files = vec![(3i64, dummy_meta(3)), (4, dummy_meta(4))]; + let err = validate_commit_contiguity(&files, 3, 5).unwrap_err(); + assert!(err.to_string().contains("Missing commit file")); + } + + #[test] + fn single_commit_passes() { + let files = vec![(7i64, dummy_meta(7))]; + assert!(validate_commit_contiguity(&files, 7, 7).is_ok()); + } + + #[tokio::test] + async fn lower_bound_ignores_last_checkpoint_hint() { + use object_store::memory::InMemory; + use object_store::path::Path; + + let store: Arc = Arc::new(InMemory::new()); + + let checksum_55 = make_test_checksum(1, 1, None); + let crc_bytes = serde_json::to_vec(&checksum_55).unwrap(); + store 
+ .put( + &Path::from("_delta_log/00000000000000000055.crc"), + crc_bytes.into(), + ) + .await + .unwrap(); + + let last_cp = serde_json::json!({"version": 120}); + store + .put( + &Path::from("_delta_log/_last_checkpoint"), + serde_json::to_vec(&last_cp).unwrap().into(), + ) + .await + .unwrap(); + + let (checksums, _, _) = list_log_files(store, 50, 150).await.unwrap(); + let found = checksums.iter().find(|(v, _)| *v == 55).map(|(v, _)| *v); + assert_eq!( + found, + Some(55), + "should find CRC at version 55 even though _last_checkpoint points to 120" + ); + } + + #[tokio::test] + async fn lower_bound_respects_replay_hint_floor() { + use object_store::memory::InMemory; + use object_store::path::Path; + + let store: Arc = Arc::new(InMemory::new()); + + let checksum_55 = make_test_checksum(1, 1, None); + let crc_bytes = serde_json::to_vec(&checksum_55).unwrap(); + store + .put( + &Path::from("_delta_log/00000000000000000055.crc"), + crc_bytes.into(), + ) + .await + .unwrap(); + + let (checksums, _, _) = list_log_files(store, 61, 150).await.unwrap(); + let found = checksums + .iter() + .find(|(v, _)| *v >= 61 && *v < 150) + .map(|(v, _)| *v); + assert_eq!( + found, None, + "resolver should ignore CRCs below the replay_hint floor" + ); + } + + #[tokio::test] + async fn list_log_files_includes_target_version_when_offset_is_prefix() { + use object_store::memory::InMemory; + use object_store::path::Path; + + let store: Arc = Arc::new(InMemory::new()); + store + .put( + &Path::from("_delta_log/00000000000000000000.json"), + b"{}".to_vec().into(), + ) + .await + .unwrap(); + + let (_, _, commits) = list_log_files(store, 0, 0).await.unwrap(); + assert_eq!(commits.len(), 1); + assert_eq!(commits[0].0, 0); + } +} diff --git a/crates/sail-delta-lake/src/delta_log/timestamps.rs b/crates/sail-delta-lake/src/delta_log/timestamps.rs new file mode 100644 index 0000000000..581ac93172 --- /dev/null +++ b/crates/sail-delta-lake/src/delta_log/timestamps.rs @@ -0,0 +1,318 @@ +use 
object_store::{ObjectMeta, ObjectStoreExt}; + +use crate::spec::{ + checksum_path, commit_path, Action, DeltaError, DeltaResult, Metadata, Protocol, + TableProperties, VersionChecksum, +}; +use crate::storage::{get_actions, LogStore}; + +pub(crate) fn in_commit_timestamp_from_actions(actions: &[Action]) -> Option { + actions.iter().find_map(|action| match action { + Action::CommitInfo(info) => info.in_commit_timestamp, + _ => None, + }) +} + +pub(crate) fn resolve_effective_protocol_and_metadata( + current_protocol: Option<&Protocol>, + current_metadata: Option<&Metadata>, + actions: &[Action], +) -> Option<(Protocol, Metadata)> { + let protocol = actions + .iter() + .rev() + .find_map(|action| match action { + Action::Protocol(protocol) => Some(protocol.clone()), + _ => None, + }) + .or_else(|| current_protocol.cloned())?; + let metadata = actions + .iter() + .rev() + .find_map(|action| match action { + Action::Metadata(metadata) => Some(metadata.clone()), + _ => None, + }) + .or_else(|| current_metadata.cloned())?; + Some((protocol, metadata)) +} + +pub(crate) fn version_uses_in_commit_timestamps( + version: i64, + protocol: &Protocol, + metadata: &Metadata, +) -> bool { + let table_properties = TableProperties::from(metadata.configuration().iter()); + if !protocol.is_in_commit_timestamps_enabled(&table_properties) { + return false; + } + + match table_properties.in_commit_timestamp_enablement_version() { + Some(enablement_version) => version >= enablement_version, + None => true, + } +} + +pub(crate) fn resolve_commit_timestamp_from_actions( + version: i64, + meta: &ObjectMeta, + current_protocol: Option<&Protocol>, + current_metadata: Option<&Metadata>, + actions: &[Action], +) -> DeltaResult { + let uses_in_commit_timestamp = + resolve_effective_protocol_and_metadata(current_protocol, current_metadata, actions) + .map(|(protocol, metadata)| { + version_uses_in_commit_timestamps(version, &protocol, &metadata) + }) + .unwrap_or(false); + + if 
uses_in_commit_timestamp { + in_commit_timestamp_from_actions(actions).ok_or_else(|| { + DeltaError::generic(format!( + "commit {version} is missing inCommitTimestamp and object metadata fallback is disallowed" + )) + }) + } else { + Ok(meta.last_modified.timestamp_millis()) + } +} + +async fn read_in_commit_timestamp_from_checksum( + log_store: &dyn LogStore, + version: i64, +) -> DeltaResult> { + let path = checksum_path(version); + let store = log_store.object_store(None); + let bytes = match store.get(&path).await { + Ok(result) => result.bytes().await?, + Err(object_store::Error::NotFound { .. }) => return Ok(None), + Err(err) => return Err(err.into()), + }; + let checksum: VersionChecksum = serde_json::from_slice(&bytes)?; + Ok(checksum.in_commit_timestamp_opt) +} + +async fn read_in_commit_timestamp_from_commit_json( + log_store: &dyn LogStore, + version: i64, +) -> DeltaResult> { + let Some(bytes) = log_store.read_commit_entry(version).await? else { + return Ok(None); + }; + let actions = get_actions(version, &bytes)?; + Ok(in_commit_timestamp_from_actions(&actions)) +} + +pub(crate) async fn resolve_version_timestamp( + log_store: &dyn LogStore, + version: i64, + cached_timestamp: Option, + protocol: &Protocol, + metadata: &Metadata, +) -> DeltaResult { + if !version_uses_in_commit_timestamps(version, protocol, metadata) { + if let Some(timestamp) = cached_timestamp { + return Ok(timestamp); + } + let commit_uri = commit_path(version); + let meta = log_store.object_store(None).head(&commit_uri).await?; + return Ok(meta.last_modified.timestamp_millis()); + } + + if let Some(timestamp) = cached_timestamp { + return Ok(timestamp); + } + if let Some(timestamp) = read_in_commit_timestamp_from_checksum(log_store, version).await? { + return Ok(timestamp); + } + if let Some(timestamp) = read_in_commit_timestamp_from_commit_json(log_store, version).await? 
{ + return Ok(timestamp); + } + Err(DeltaError::generic(format!( + "commit {version} requires inCommitTimestamp, but neither the checksum nor the commit JSON provided one" + ))) +} + +#[cfg(test)] +#[expect(clippy::unwrap_used)] +mod tests { + use std::sync::Arc; + + use chrono::DateTime; + use object_store::memory::InMemory; + use object_store::path::Path; + use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; + use url::Url; + + use super::*; + use crate::spec::{CommitInfo, DataType, Metadata, StructField, StructType, TableFeature}; + use crate::storage::{default_logstore, LogStoreRef, StorageConfig}; + + fn test_log_store(store: Arc) -> LogStoreRef { + default_logstore( + store.clone(), + store, + &Url::parse("memory:///").unwrap(), + &StorageConfig, + ) + } + + fn test_metadata( + configuration: impl IntoIterator, + ) -> Metadata { + Metadata::try_new( + None, + None, + StructType::try_new([StructField::not_null("id", DataType::LONG)]).unwrap(), + Vec::new(), + 0, + configuration + .into_iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect(), + ) + .unwrap() + } + + fn commit_meta(version: i64, last_modified_millis: i64) -> DeltaResult { + let last_modified = DateTime::from_timestamp_millis(last_modified_millis) + .ok_or_else(|| DeltaError::generic("test timestamp must be valid"))?; + Ok(ObjectMeta { + location: Path::from(format!("_delta_log/{version:020}.json")), + last_modified, + size: 0, + e_tag: None, + version: None, + }) + } + + async fn put_commit( + store: &Arc, + version: i64, + actions: &[Action], + ) -> DeltaResult<()> { + let mut bytes = Vec::new(); + for (index, action) in actions.iter().enumerate() { + if index > 0 { + bytes.push(b'\n'); + } + serde_json::to_writer(&mut bytes, action)?; + } + store.put(&commit_path(version), bytes.into()).await?; + Ok(()) + } + + async fn put_checksum( + store: &Arc, + version: i64, + protocol: &Protocol, + metadata: &Metadata, + in_commit_timestamp_opt: Option, + ) -> 
DeltaResult<()> { + let checksum = VersionChecksum { + txn_id: None, + table_size_bytes: 0, + num_files: 0, + num_metadata: 1, + num_protocol: 1, + in_commit_timestamp_opt, + set_transactions: None, + domain_metadata: None, + metadata: metadata.clone(), + protocol: protocol.clone(), + file_size_histogram: None, + all_files: None, + }; + store + .put( + &checksum_path(version), + serde_json::to_vec(&checksum).unwrap().into(), + ) + .await?; + Ok(()) + } + + #[test] + fn resolve_commit_timestamp_from_actions_ignores_pre_enable_ict() -> DeltaResult<()> { + let protocol = Protocol::new(1, 2, None, None); + let metadata = test_metadata([]); + let commit_timestamp = resolve_commit_timestamp_from_actions( + 0, + &commit_meta(0, 4_567)?, + Some(&protocol), + Some(&metadata), + &[Action::CommitInfo(CommitInfo { + in_commit_timestamp: Some(123), + ..Default::default() + })], + )?; + + assert_eq!(commit_timestamp, 4_567); + Ok(()) + } + + #[tokio::test] + async fn resolve_version_timestamp_ignores_pre_enable_ict_in_checksum_and_commit( + ) -> DeltaResult<()> { + let store: Arc = Arc::new(InMemory::new()); + let log_store = test_log_store(store.clone()); + let pre_enable_protocol = Protocol::new(1, 2, None, None); + let pre_enable_metadata = test_metadata([]); + let enabled_protocol = + Protocol::new(1, 7, None, Some(vec![TableFeature::InCommitTimestamp])); + let enabled_metadata = test_metadata([ + ("delta.enableInCommitTimestamps", "true"), + ("delta.inCommitTimestampEnablementVersion", "2"), + ("delta.inCommitTimestampEnablementTimestamp", "300"), + ]); + + put_commit( + &store, + 0, + &[Action::CommitInfo(CommitInfo { + in_commit_timestamp: Some(10_000), + ..Default::default() + })], + ) + .await?; + put_checksum( + &store, + 0, + &pre_enable_protocol, + &pre_enable_metadata, + Some(10_000), + ) + .await?; + + let expected = store + .head(&commit_path(0)) + .await? 
+ .last_modified + .timestamp_millis(); + let resolved = resolve_version_timestamp( + log_store.as_ref(), + 0, + None, + &enabled_protocol, + &enabled_metadata, + ) + .await?; + + assert_eq!(resolved, expected); + Ok(()) + } + + #[test] + fn version_uses_in_commit_timestamps_honors_enablement_boundary() { + let protocol = Protocol::new(1, 7, None, Some(vec![TableFeature::InCommitTimestamp])); + let metadata = test_metadata([ + ("delta.enableInCommitTimestamps", "true"), + ("delta.inCommitTimestampEnablementVersion", "2"), + ("delta.inCommitTimestampEnablementTimestamp", "300"), + ]); + + assert!(!version_uses_in_commit_timestamps(1, &protocol, &metadata)); + assert!(version_uses_in_commit_timestamps(2, &protocol, &metadata)); + } +} diff --git a/crates/sail-delta-lake/src/error.rs b/crates/sail-delta-lake/src/error.rs deleted file mode 100644 index 009c31c0d2..0000000000 --- a/crates/sail-delta-lake/src/error.rs +++ /dev/null @@ -1,176 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use datafusion_common::{Column, DataFusionError, SchemaError}; -pub use delta_kernel::Error as KernelError; -use object_store::Error as ObjectStoreError; -use thiserror::Error; - -use crate::kernel::transaction::TransactionError; - -/// Result type that is used throughout the Delta Lake integration. -pub type DeltaResult = Result; - -/// Error type that bridges Delta Kernel and DataFusion failures. 
-#[derive(Debug, Error)] -pub enum DeltaError { - #[error(transparent)] - Kernel(#[from] KernelError), - - #[error(transparent)] - DataFusion(#[from] DataFusionError), - - #[error(transparent)] - Io(#[from] std::io::Error), - - #[error(transparent)] - ObjectStore(#[from] ObjectStoreError), - - #[error(transparent)] - Arrow(#[from] datafusion::arrow::error::ArrowError), - - #[error(transparent)] - Url(#[from] url::ParseError), - - #[error(transparent)] - Json(#[from] serde_json::Error), - - #[error("Invalid configuration: {0}")] - Config(String), - - #[error("Delta table operation failed: {0}")] - Generic(String), - - #[error("Delta transaction error: {0}")] - Transaction(#[from] TransactionError), -} - -impl DeltaError { - /// Convenience helper that mirrors [`KernelError::generic`]. - pub fn generic(msg: impl ToString) -> Self { - KernelError::generic(msg).into() - } - - /// Convenience helper that mirrors [`KernelError::generic_err`]. - pub fn generic_err( - source: impl Into>, - ) -> Self { - KernelError::generic_err(source).into() - } - - /// Convenience helper that mirrors [`KernelError::schema`]. - pub fn schema(msg: impl ToString) -> Self { - KernelError::Schema(msg.to_string()).with_backtrace().into() - } - - /// Convenience helper that mirrors [`KernelError::invalid_table_location`]. - pub fn invalid_table_location(location: impl ToString) -> Self { - KernelError::invalid_table_location(location).into() - } - - /// Convenience helper that mirrors [`KernelError::missing_column`]. 
- pub fn missing_column(name: impl ToString) -> Self { - KernelError::missing_column(name).into() - } -} - -impl From for DataFusionError { - fn from(err: DeltaError) -> Self { - match err { - DeltaError::DataFusion(inner) => inner, - DeltaError::Io(err) => DataFusionError::IoError(err), - DeltaError::Arrow(err) => DataFusionError::ArrowError(Box::new(err), None), - DeltaError::ObjectStore(err) => DataFusionError::ObjectStore(Box::new(err)), - DeltaError::Url(err) => { - DataFusionError::Configuration(format!("Invalid URL format: {err}")) - } - DeltaError::Json(err) => DataFusionError::External(Box::new(err)), - DeltaError::Config(msg) => DataFusionError::Configuration(msg), - DeltaError::Generic(msg) => DataFusionError::Execution(msg), - DeltaError::Transaction(err) => DataFusionError::External(Box::new(err)), - DeltaError::Kernel(err) => map_kernel_error_to_datafusion(err), - } - } -} - -impl From for DeltaError { - fn from(err: object_store::path::Error) -> Self { - KernelError::ObjectStorePath(err).into() - } -} - -fn map_kernel_error_to_datafusion(err: KernelError) -> DataFusionError { - match err { - KernelError::Arrow(err) => DataFusionError::ArrowError(Box::new(err), None), - KernelError::IOError(err) => DataFusionError::IoError(err), - KernelError::ObjectStore(err) => DataFusionError::ObjectStore(Box::new(err)), - KernelError::ObjectStorePath(source) => { - DataFusionError::ObjectStore(Box::new(ObjectStoreError::InvalidPath { source })) - } - KernelError::Parquet(err) => DataFusionError::ParquetError(Box::new(err)), - KernelError::FileNotFound(path) => { - DataFusionError::ObjectStore(Box::new(ObjectStoreError::NotFound { - path, - source: Box::new(std::io::Error::new( - std::io::ErrorKind::NotFound, - "File not found in Delta kernel", - )), - })) - } - KernelError::MissingColumn(column) => DataFusionError::SchemaError( - Box::new(SchemaError::FieldNotFound { - field: Box::new(Column::from_name(column)), - valid_fields: vec![], - }), - Box::new(None), - ), 
- KernelError::InvalidUrl(err) => { - DataFusionError::Configuration(format!("Invalid Delta URL: {err}")) - } - KernelError::InvalidTableLocation(location) => { - DataFusionError::Configuration(format!("Invalid table location: {location}")) - } - KernelError::MissingVersion => { - DataFusionError::Execution("No table version found.".to_string()) - } - KernelError::Unsupported(msg) => DataFusionError::NotImplemented(msg), - KernelError::ChangeDataFeedUnsupported(version) => DataFusionError::NotImplemented( - format!("Change data feed unsupported at version {version}"), - ), - KernelError::ChangeDataFeedIncompatibleSchema(expected, actual) => { - DataFusionError::Execution(format!( - "Change data feed schema mismatch. Expected {expected}, got {actual}" - )) - } - KernelError::GenericError { source } => DataFusionError::External(source), - KernelError::MalformedJson(err) => DataFusionError::External(Box::new(err)), - KernelError::Reqwest(err) => DataFusionError::External(Box::new(err)), - KernelError::Utf8Error(err) => DataFusionError::Execution(err.to_string()), - KernelError::ParseIntError(err) => DataFusionError::Execution(err.to_string()), - KernelError::ParseIntervalError(err) => DataFusionError::Execution(err.to_string()), - KernelError::Backtraced { source, .. 
} => map_kernel_error_to_datafusion(*source), - KernelError::InternalError(msg) => DataFusionError::Internal(msg), - KernelError::MissingMetadata => { - DataFusionError::Execution("No table metadata found in delta log.".to_string()) - } - KernelError::MissingProtocol => { - DataFusionError::Execution("No protocol found in delta log.".to_string()) - } - KernelError::MissingMetadataAndProtocol => DataFusionError::Execution( - "No table metadata or protocol found in delta log.".to_string(), - ), - KernelError::ParseError(value, ty) => { - DataFusionError::Execution(format!("Failed to parse value '{value}' as '{ty}'")) - } - _ => DataFusionError::External(Box::new(err)), - } -} diff --git a/crates/sail-delta-lake/src/kernel/arrow/engine_ext.rs b/crates/sail-delta-lake/src/kernel/arrow/engine_ext.rs deleted file mode 100644 index 9189dae156..0000000000 --- a/crates/sail-delta-lake/src/kernel/arrow/engine_ext.rs +++ /dev/null @@ -1,708 +0,0 @@ -// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt -// -// Copyright (2020) QP Hou and a number of other contributors. -// Portions Copyright (2025) LakeSail, Inc. -// Modified in 2025 by LakeSail, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// [Credit]: - -//! Utilities for interacting with Kernel APIs using Arrow data structures. -//! 
-use std::borrow::Cow; -use std::collections::HashMap; -use std::sync::Arc; - -use arrow_schema::Fields; -use datafusion::arrow::array::{Array, BooleanArray, MapArray, StringArray, StructArray}; -use datafusion::arrow::datatypes::{ - DataType as ArrowDataType, Field, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, -}; -use delta_kernel::arrow::compute::filter_record_batch; -use delta_kernel::arrow::record_batch::RecordBatch; -use delta_kernel::engine::arrow_conversion::TryIntoArrow; -use delta_kernel::engine::arrow_data::ArrowEngineData; -use delta_kernel::engine::parse_json; -use delta_kernel::expressions::{ColumnName, Scalar, StructData}; -use delta_kernel::scan::{Scan, ScanMetadata}; -use delta_kernel::schema::{ - ArrayType, DataType, MapType, PrimitiveType, Schema, SchemaRef, SchemaTransform, StructField, - StructType, -}; -use delta_kernel::snapshot::Snapshot; -use delta_kernel::table_features::ColumnMappingMode; -use delta_kernel::table_properties::{DataSkippingNumIndexedCols, TableProperties}; -use delta_kernel::{ - DeltaResult, Engine, EngineData, ExpressionEvaluator, ExpressionRef, PredicateRef, Version, -}; -use itertools::Itertools; - -use crate::conversion::ScalarConverter; -use crate::kernel::snapshot::SCAN_ROW_ARROW_SCHEMA; -use crate::kernel::{DeltaResult as DeltaResultLocal, DeltaTableError}; - -/// [`ScanMetadata`] contains (1) a [`RecordBatch`] specifying data files to be scanned -/// and (2) a vector of transforms (one transform per scan file) that must be applied to the data read -/// from those files. -pub(crate) struct ScanMetadataArrow { - /// Record batch with one row per file to scan - pub scan_files: RecordBatch, - - /// Row-level transformations to apply to data read from files. - /// - /// Each entry in this vector corresponds to a row in the `scan_files` data. 
The entry is an - /// expression that must be applied to convert the file's data into the logical schema - /// expected by the scan: - /// - /// - `Some(expr)`: Apply this expression to transform the data to match [`Scan::schema()`]. - /// - `None`: No transformation is needed; the data is already in the correct logical form. - /// - /// Note: This vector can be indexed by row number. - #[expect(dead_code)] - pub scan_file_transforms: Vec>, -} - -/// Internal extension trait to streamline working with Kernel scan objects. -/// -/// THe trait mainly handles conversion between arrow `RecordBatch` and `ArrowEngineData`. -/// The exposed methods are arrow-variants of methods already exposed on the kernel scan. -pub(crate) trait ScanExt { - /// Get the metadata for a table scan. - /// - /// This method handles translation between `EngineData` and `RecordBatch` - /// and will already apply any selection vectors to the data. - /// See [`Scan::scan_metadata`] for details. - fn scan_metadata_arrow( - &self, - engine: &dyn Engine, - ) -> DeltaResult>>; - - fn scan_metadata_from_arrow( - &self, - engine: &dyn Engine, - existing_version: Version, - existing_data: Box>, - existing_predicate: Option, - ) -> DeltaResult>>; -} - -impl ScanExt for Scan { - fn scan_metadata_arrow( - &self, - engine: &dyn Engine, - ) -> DeltaResult>> { - Ok(self - .scan_metadata(engine)? - .map_ok(kernel_to_arrow) - .flatten()) - } - - fn scan_metadata_from_arrow( - &self, - engine: &dyn Engine, - existing_version: Version, - existing_data: Box>, - existing_predicate: Option, - ) -> DeltaResult>> { - let engine_iter = - existing_data.map(|batch| Box::new(ArrowEngineData::new(batch)) as Box); - Ok(self - .scan_metadata_from(engine, existing_version, engine_iter, existing_predicate)? - .map_ok(kernel_to_arrow) - .flatten()) - } -} - -/// Internal extension traits to the Kernel Snapshot. -/// -/// These traits provide additional convenience functionality for working with Kernel snapshots. 
-/// Some of this may eventually be upstreamed as the kernel implementation matures. -pub(crate) trait SnapshotExt { - /// Returns the expected file statistics schema for the snapshot. - fn stats_schema(&self) -> DeltaResult; - - /// The expected schema for partition values - fn partitions_schema(&self) -> DeltaResultLocal>; - - /// The scheme expected for the data returned from a scan. - /// - /// This is an extended version of the raw schema that includes additional - /// computations by delta-rs. Specifically the `stats_parsed` and - /// `partitionValues_parsed` fields are added. - fn scan_row_parsed_schema_arrow(&self) -> DeltaResultLocal; - - /// Parse stats column into a struct array. - fn parse_stats_column(&self, batch: &RecordBatch) -> DeltaResultLocal; -} - -impl SnapshotExt for Snapshot { - fn stats_schema(&self) -> DeltaResult { - let partition_columns = self.table_configuration().metadata().partition_columns(); - let column_mapping_mode = self.table_configuration().column_mapping_mode(); - let physical_schema = StructType::try_new( - self.schema() - .fields() - .filter(|field| !partition_columns.contains(field.name())) - .map(|field| field.make_physical(column_mapping_mode)), - )?; - Ok(Arc::new(stats_schema( - &physical_schema, - self.table_properties(), - )?)) - } - - fn partitions_schema(&self) -> DeltaResultLocal> { - Ok(partitions_schema( - self.schema().as_ref(), - self.table_configuration().metadata().partition_columns(), - )? - .map(Arc::new)) - } - - /// Arrow schema for a parsed (including stats_parsed and partitionValues_parsed) - /// scan row (file data). 
- fn scan_row_parsed_schema_arrow(&self) -> DeltaResultLocal { - let mut fields = SCAN_ROW_ARROW_SCHEMA.fields().to_vec(); - - let stats_schema = self.stats_schema()?; - let stats_schema: ArrowSchema = stats_schema.as_ref().try_into_arrow()?; - fields.push(Arc::new(Field::new( - "stats_parsed", - ArrowDataType::Struct(stats_schema.fields().to_owned()), - true, - ))); - - if let Some(partition_schema) = self.partitions_schema()? { - let partition_schema: ArrowSchema = partition_schema.as_ref().try_into_arrow()?; - fields.push(Arc::new(Field::new( - "partitionValues_parsed", - ArrowDataType::Struct(partition_schema.fields().to_owned()), - false, - ))); - } - - let schema = Arc::new(ArrowSchema::new(fields)); - Ok(schema) - } - - fn parse_stats_column(&self, batch: &RecordBatch) -> DeltaResultLocal { - let Some((stats_idx, _)) = batch.schema_ref().column_with_name("stats") else { - return Err(DeltaTableError::schema( - "stats column not found".to_string(), - )); - }; - - let mut columns = batch.columns().to_vec(); - let mut fields = batch.schema().fields().to_vec(); - let column_mapping_mode = self.table_configuration().column_mapping_mode(); - - let stats_schema = self.stats_schema()?; - let stats_batch = batch.project(&[stats_idx])?; - let stats_data = Box::new(ArrowEngineData::new(stats_batch)); - - let parsed = parse_json(stats_data, stats_schema)?; - let parsed: RecordBatch = ArrowEngineData::try_from_engine_data(parsed)?.into(); - - let stats_array: Arc = Arc::new(parsed.into()); - fields.push(Arc::new(Field::new( - "stats_parsed", - stats_array.data_type().to_owned(), - true, - ))); - columns.push(stats_array.clone()); - - if let Some(partition_schema) = self.partitions_schema()? 
{ - let partition_array = parse_partition_values_array( - batch, - partition_schema.as_ref(), - "fileConstantValues.partitionValues", - column_mapping_mode, - )?; - fields.push(Arc::new(Field::new( - "partitionValues_parsed", - partition_array.data_type().to_owned(), - false, - ))); - columns.push(Arc::new(partition_array)); - } - - Ok(RecordBatch::try_new( - Arc::new(ArrowSchema::new(fields)), - columns, - )?) - } -} - -fn parse_partition_values_array( - batch: &RecordBatch, - partition_schema: &StructType, - path: &str, - column_mapping_mode: ColumnMappingMode, -) -> DeltaResultLocal { - let partitions = map_array_from_path(batch, path)?; - let num_rows = partitions.len(); - - let mut collected: HashMap> = partition_schema - .fields() - .map(|f| { - ( - f.physical_name(column_mapping_mode).to_string(), - Vec::with_capacity(num_rows), - ) - }) - .collect(); - - for row in 0..num_rows { - if partitions.is_null(row) { - return Err(DeltaTableError::generic( - "Expected partition values map, found null entry.", - )); - } - let raw_values = collect_partition_row(&partitions.value(row))?; - - for field in partition_schema.fields() { - let physical_name = field.physical_name(column_mapping_mode); - let value = raw_values - .get(physical_name) - .or_else(|| raw_values.get(field.name())); - let scalar = match field.data_type() { - DataType::Primitive(primitive) => match value { - Some(Some(raw)) => primitive.parse_scalar(raw)?, - _ => Scalar::Null(field.data_type().clone()), - }, - _ => { - return Err(DeltaTableError::generic( - "nested partitioning values are not supported", - )) - } - }; - collected - .get_mut(physical_name) - .ok_or_else(|| DeltaTableError::schema("partition field missing".to_string()))? 
- .push(scalar); - } - } - - let columns = partition_schema - .fields() - .map(|field| { - let physical_name = field.physical_name(column_mapping_mode); - ScalarConverter::scalars_to_arrow_array( - field, - collected.get(physical_name).ok_or_else(|| { - DeltaTableError::schema("partition field missing".to_string()) - })?, - ) - }) - .collect::>>()?; - - let arrow_fields: Fields = Fields::from( - partition_schema - .fields() - .map(|f| f.try_into_arrow()) - .collect::, _>>()?, - ); - - Ok(StructArray::try_new(arrow_fields, columns, None)?) -} - -fn map_array_from_path<'a>(batch: &'a RecordBatch, path: &str) -> DeltaResultLocal<&'a MapArray> { - let mut segments = path.split('.'); - let first = segments - .next() - .ok_or_else(|| DeltaTableError::generic("partition column path must not be empty"))?; - - let mut current: &dyn Array = batch - .column_by_name(first) - .map(|col| col.as_ref()) - .ok_or_else(|| { - DeltaTableError::schema(format!("{first} column not found when parsing partitions")) - })?; - - for segment in segments { - let struct_array = current - .as_any() - .downcast_ref::() - .ok_or_else(|| { - DeltaTableError::schema(format!("Expected struct column while traversing {path}")) - })?; - current = struct_array - .column_by_name(segment) - .map(|col| col.as_ref()) - .ok_or_else(|| { - DeltaTableError::schema(format!( - "{segment} column not found while traversing {path}" - )) - })?; - } - - current - .as_any() - .downcast_ref::() - .ok_or_else(|| DeltaTableError::schema(format!("Column {path} is not a map"))) -} - -fn collect_partition_row(value: &StructArray) -> DeltaResultLocal>> { - let keys = value - .column(0) - .as_any() - .downcast_ref::() - .ok_or_else(|| DeltaTableError::schema("map key column is not Utf8".to_string()))?; - let vals = value - .column(1) - .as_any() - .downcast_ref::() - .ok_or_else(|| DeltaTableError::schema("map value column is not Utf8".to_string()))?; - - let mut result = HashMap::with_capacity(keys.len()); - for (key, value) 
in keys.iter().zip(vals.iter()) { - if let Some(k) = key { - result.insert(k.to_string(), value.map(|v| v.to_string())); - } - } - Ok(result) -} - -fn partitions_schema( - schema: &StructType, - partition_columns: &[String], -) -> DeltaResultLocal> { - if partition_columns.is_empty() { - return Ok(None); - } - Ok(Some(StructType::try_new( - partition_columns - .iter() - .map(|col| { - schema.field(col).cloned().ok_or_else(|| { - DeltaTableError::generic(format!("Partition column {col} not found in schema")) - }) - }) - .collect::, _>>()?, - )?)) -} - -/// Generates the expected schema for file statistics. -/// -/// The base stats schema is dependent on the current table configuration and derived via: -/// - only fields present in data files are included (use physical names, no partition columns) -/// - if `dataSkippingStatsColumns` is set, include only those columns. -/// Column names may refer to struct fields in which case all child fields are included. -/// - otherwise the first `dataSkippingNumIndexedCols` (default 32) leaf fields are included. -/// - all fields are made nullable. -/// -/// For the `nullCount` schema, we consider the whole base schema and convert all leaf fields -/// to data type LONG. Maps, arrays, and variant are considered leaf fields in this case. -/// -/// For the min / max schemas, we non-eligible leaf fields from the base schema. -/// Field eligibility is determined by the fields data type via [`is_skipping_eligeble_datatype`]. 
-/// -/// The overall schema is then: -/// ```ignored -/// { -/// numRecords: long, -/// nullCount: , -/// minValues: , -/// maxValues: , -/// } -/// ``` -pub(crate) fn stats_schema( - physical_file_schema: &Schema, - table_properties: &TableProperties, -) -> DeltaResult { - let mut fields = Vec::with_capacity(4); - fields.push(StructField::nullable("numRecords", DataType::LONG)); - - // generate the base stats schema: - // - make all fields nullable - // - include fields according to table properties (num_indexed_cols, stats_coliumns, ...) - let mut base_transform = BaseStatsTransform::new(table_properties); - if let Some(base_schema) = base_transform.transform_struct(physical_file_schema) { - let base_schema = base_schema.into_owned(); - - // convert all leaf fields to data type LONG for null count - let mut null_count_transform = NullCountStatsTransform; - if let Some(null_count_schema) = null_count_transform.transform_struct(&base_schema) { - fields.push(StructField::nullable( - "nullCount", - null_count_schema.into_owned(), - )); - }; - - // include only min/max skipping eligible fields (data types) - let mut min_max_transform = MinMaxStatsTransform; - if let Some(min_max_schema) = min_max_transform.transform_struct(&base_schema) { - let min_max_schema = min_max_schema.into_owned(); - fields.push(StructField::nullable("minValues", min_max_schema.clone())); - fields.push(StructField::nullable("maxValues", min_max_schema)); - } - } - StructType::try_new(fields) -} - -// Convert a min/max stats schema into a nullcount schema (all leaf fields are LONG) -pub(crate) struct NullCountStatsTransform; -impl<'a> SchemaTransform<'a> for NullCountStatsTransform { - fn transform_primitive(&mut self, _ptype: &'a PrimitiveType) -> Option> { - Some(Cow::Owned(PrimitiveType::Long)) - } - fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { - if matches!( - &field.data_type, - DataType::Array(_) | DataType::Map(_) | DataType::Variant(_) - ) { - return 
Some(Cow::Owned(StructField { - name: field.name.clone(), - data_type: DataType::LONG, - nullable: true, - metadata: Default::default(), - })); - } - - match self.transform(&field.data_type)? { - Cow::Borrowed(_) => Some(Cow::Borrowed(field)), - dt => Some(Cow::Owned(StructField { - name: field.name.clone(), - data_type: dt.into_owned(), - nullable: true, - metadata: Default::default(), - })), - } - } -} - -/// Transforms a table schema into a base stats schema. -/// -/// Base stats schema in this case refers the subsets of fields in the table schema -/// that may be considered for stats collection. Depending on the type of stats - min/max/nullcount/... - -/// additional transformations may be applied. -/// -/// The concrete shape of the schema depends on the table configuration. -/// * `dataSkippingStatsColumns` - used to explicitly specify the columns -/// to be used for data skipping statistics. (takes precedence) -/// * `dataSkippingNumIndexedCols` - used to specify the number of columns -/// to be used for data skipping statistics. Defaults to 32. -/// -/// All fields are nullable. 
-struct BaseStatsTransform { - n_columns: Option, - added_columns: u64, - column_names: Option>, - path: Vec, -} - -impl BaseStatsTransform { - fn new(props: &TableProperties) -> Self { - // if data_skipping_stats_columns is specified, it takes precedence - // over data_skipping_num_indexed_cols, even if that is also specified - if let Some(columns_names) = &props.data_skipping_stats_columns { - Self { - n_columns: None, - added_columns: 0, - column_names: Some(columns_names.clone()), - path: Vec::new(), - } - } else { - Self { - n_columns: Some( - props - .data_skipping_num_indexed_cols - .unwrap_or(DataSkippingNumIndexedCols::NumColumns(32)), - ), - added_columns: 0, - column_names: None, - path: Vec::new(), - } - } - } -} - -impl<'a> SchemaTransform<'a> for BaseStatsTransform { - fn transform_struct_field(&mut self, field: &'a StructField) -> Option> { - // Check if the number of columns is set and if the added columns exceed the limit - // In the constructor we assert this will always be None if column_names are specified - if let Some(DataSkippingNumIndexedCols::NumColumns(n_cols)) = self.n_columns { - if self.added_columns >= n_cols { - return None; - } - } - - self.path.push(field.name.clone()); - let data_type = field.data_type(); - - // keep the field if it: - // - is a struct field and we need to traverse its children - // - OR it is referenced by the column names - // - OR it is a primitive type / leaf field - let should_include = matches!(data_type, DataType::Struct(_)) - || self - .column_names - .as_ref() - .map(|ns| should_include_column(&ColumnName::new(&self.path), ns)) - .unwrap_or(true); - - if !should_include { - self.path.pop(); - return None; - } - - // increment count only for leaf columns. - if !matches!(data_type, DataType::Struct(_)) { - self.added_columns += 1; - } - - let field = match self.transform(&field.data_type)? 
{ - Cow::Borrowed(_) if field.is_nullable() => Cow::Borrowed(field), - data_type => Cow::Owned(StructField { - name: field.name.clone(), - data_type: data_type.into_owned(), - nullable: true, - metadata: Default::default(), - }), - }; - - self.path.pop(); - - // exclude struct fields with no children - if matches!( - field.data_type(), - DataType::Struct(dt) if dt.fields().count() == 0 - ) { - None - } else { - Some(field) - } - } -} - -// removes all fields with non eligible data types -// -// should only be applied to schema oricessed via `BaseStatsTransform`. -struct MinMaxStatsTransform; - -impl<'a> SchemaTransform<'a> for MinMaxStatsTransform { - // array and map fields are not eligible for data skipping, so filter them out. - fn transform_array(&mut self, _: &'a ArrayType) -> Option> { - None - } - fn transform_map(&mut self, _: &'a MapType) -> Option> { - None - } - fn transform_variant(&mut self, _: &'a StructType) -> Option> { - None - } - - fn transform_primitive(&mut self, ptype: &'a PrimitiveType) -> Option> { - if is_skipping_eligeble_datatype(ptype) { - Some(Cow::Borrowed(ptype)) - } else { - None - } - } -} - -// Checks if a column should be included or traversed into. -// -// Returns true if the column name is included in the list of column names -// or if the column name is a prefix of any column name in the list -// or if the column name is a child of any column name in the list -fn should_include_column(column_name: &ColumnName, column_names: &[ColumnName]) -> bool { - column_names.iter().any(|name| { - name.as_ref().starts_with(column_name) || column_name.as_ref().starts_with(name) - }) -} - -/// Checks if a data type is eligible for min/max file skipping. 
-/// https://github.com/delta-io/delta/blob/143ab3337121248d2ca6a7d5bc31deae7c8fe4be/kernel/kernel-api/src/main/java/io/delta/kernel/internal/skipping/StatsSchemaHelper.java#L61 -fn is_skipping_eligeble_datatype(data_type: &PrimitiveType) -> bool { - matches!( - data_type, - &PrimitiveType::Byte - | &PrimitiveType::Short - | &PrimitiveType::Integer - | &PrimitiveType::Long - | &PrimitiveType::Float - | &PrimitiveType::Double - | &PrimitiveType::Date - | &PrimitiveType::Timestamp - | &PrimitiveType::TimestampNtz - | &PrimitiveType::String - // | &PrimitiveType::Boolean - | PrimitiveType::Decimal(_) - ) -} - -fn kernel_to_arrow(metadata: ScanMetadata) -> DeltaResult { - let scan_file_transforms = metadata - .scan_file_transforms - .into_iter() - .enumerate() - .filter_map(|(i, v)| metadata.scan_files.selection_vector()[i].then_some(v)) - .collect(); - let (data, selection) = metadata.scan_files.into_parts(); - let batch = ArrowEngineData::try_from_engine_data(data)?.into(); - let scan_files = filter_record_batch(&batch, &BooleanArray::from(selection))?; - Ok(ScanMetadataArrow { - scan_files, - scan_file_transforms, - }) -} - -/// Internal extension trait for expression evaluators. -/// -/// This just abstracts the conversion between Arrow [`RecoedBatch`]es and -/// Kernel's [`ArrowEngineData`]. -pub(crate) trait ExpressionEvaluatorExt { - fn evaluate_arrow(&self, batch: RecordBatch) -> DeltaResult; -} - -impl ExpressionEvaluatorExt for T { - fn evaluate_arrow(&self, batch: RecordBatch) -> DeltaResult { - let engine_data = ArrowEngineData::new(batch); - Ok(ArrowEngineData::try_from_engine_data(T::evaluate(self, &engine_data)?)?.into()) - } -} - -/// Extension trait for Kernel's [`StructData`]. -/// -/// StructData is the data structure contained in a Struct scalar. -/// The exposed API on kernels struct data is very minimal and does not allow -/// for conveniently probing the fields / values contained within [`StructData`]. 
-/// -/// This trait therefore adds convenience methods for accessing fields and values. -#[expect(dead_code)] -pub trait StructDataExt { - /// Returns a reference to the field with the given name, if it exists. - fn field(&self, name: &str) -> Option<&StructField>; - - /// Returns a reference to the value with the given index, if it exists. - fn value(&self, index: usize) -> Option<&Scalar>; - - /// Returns the index of the field with the given name, if it exists. - fn index_of(&self, name: &str) -> Option; -} - -impl StructDataExt for StructData { - fn field(&self, name: &str) -> Option<&StructField> { - self.fields().iter().find(|f| f.name() == name) - } - - fn index_of(&self, name: &str) -> Option { - self.fields().iter().position(|f| f.name() == name) - } - - fn value(&self, index: usize) -> Option<&Scalar> { - self.values().get(index) - } -} diff --git a/crates/sail-delta-lake/src/kernel/arrow/mod.rs b/crates/sail-delta-lake/src/kernel/arrow/mod.rs deleted file mode 100644 index 070ba99769..0000000000 --- a/crates/sail-delta-lake/src/kernel/arrow/mod.rs +++ /dev/null @@ -1,13 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -pub(crate) mod engine_ext; diff --git a/crates/sail-delta-lake/src/kernel/checkpoints.rs b/crates/sail-delta-lake/src/kernel/checkpoints.rs index 1ec6d87fe5..24ce501ae9 100644 --- a/crates/sail-delta-lake/src/kernel/checkpoints.rs +++ b/crates/sail-delta-lake/src/kernel/checkpoints.rs @@ -18,72 +18,391 @@ // [Credit]: -use std::sync::LazyLock; +use std::collections::{BTreeMap, HashMap, VecDeque}; +use std::sync::Arc; -use chrono::{TimeZone, Utc}; -use datafusion::arrow::array::BooleanArray; -use datafusion::arrow::compute::filter_record_batch; -use datafusion::arrow::datatypes::SchemaRef; +use chrono::Utc; +use datafusion::arrow::datatypes::{DataType as ArrowDataType, FieldRef}; use datafusion::arrow::record_batch::RecordBatch; -use delta_kernel::engine::arrow_data::ArrowEngineData; -use delta_kernel::engine_data::FilteredEngineData; -use delta_kernel::snapshot::Snapshot as KernelSnapshot; -use delta_kernel::FileMeta; -use futures::{StreamExt, TryStreamExt}; -use log::{debug, error}; -use object_store::path::Path; -use object_store::ObjectStore; +use log::debug; +use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; +use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use parquet::arrow::async_writer::ParquetObjectWriter; use parquet::arrow::AsyncArrowWriter; -use regex::Regex; -use sail_common_datafusion::array::record_batch::cast_record_batch_relaxed_tz; -use tokio::sync::oneshot; -use tokio::task::spawn_blocking; use uuid::Uuid; -use crate::kernel::snapshot::stream::RecordBatchReceiverStreamBuilder; -use crate::kernel::{DeltaResult, DeltaTableError}; -use crate::storage::LogStore; - -const DELTA_LOG_FOLDER: &str = "_delta_log"; -static DELTA_LOG_REGEX: LazyLock> = - LazyLock::new(|| Regex::new(r"(\d{20})\.json$")); -static CHECKPOINT_REGEX: LazyLock> = - LazyLock::new(|| Regex::new(r"(\d{20})\.checkpoint.*\.parquet$")); - -fn regex_from_lazy( - lazy: &'static LazyLock>, - name: &str, -) -> DeltaResult<&'static Regex> { - match 
LazyLock::force(lazy) { - Ok(regex) => Ok(regex), - Err(err) => Err(DeltaTableError::generic(format!( - "Failed to compile {name} regex: {err}" - ))), +pub(crate) use crate::delta_log::{ + latest_replayable_version, load_replayed_table_header, load_replayed_table_state, +}; +use crate::delta_log::{ + list_delta_log_entries_from, parse_checkpoint_version_from_location, + parse_commit_version_from_location, read_last_checkpoint_version_from_store, + resolve_commit_timestamp_from_actions, +}; +use crate::kernel::log_segment::ReplayedTableHeader; +use crate::spec::{ + checkpoint_path, last_checkpoint_path, Action, Add, CheckpointActionRow, + DeltaError as DeltaTableError, DeltaResult, LastCheckpointHint, Metadata, Protocol, Remove, + TableProperties, Transaction, +}; +use crate::storage::{get_actions, LogStore}; + +#[derive(Debug, Clone, Copy)] +struct CheckpointRetentionTimestamps { + deleted_file_retention_timestamp: i64, + transaction_expiration_timestamp: i64, +} + +impl CheckpointRetentionTimestamps { + fn try_new(metadata: &Metadata, reference_timestamp: i64) -> DeltaResult { + let table_properties = TableProperties::from(metadata.configuration().iter()); + Ok(Self { + deleted_file_retention_timestamp: retention_cutoff_timestamp( + reference_timestamp, + table_properties.deleted_file_retention_duration(), + "delta.deletedFileRetentionDuration", + )?, + transaction_expiration_timestamp: retention_cutoff_timestamp( + reference_timestamp, + table_properties.log_retention_duration(), + "delta.logRetentionDuration", + )?, + }) } } -fn delta_log_regex() -> DeltaResult<&'static Regex> { - regex_from_lazy(&DELTA_LOG_REGEX, "delta log") +fn retention_cutoff_timestamp( + reference_timestamp: i64, + retention_duration: std::time::Duration, + property_name: &str, +) -> DeltaResult { + let retention_millis = i64::try_from(retention_duration.as_millis()).map_err(|_| { + DeltaTableError::generic(format!( + "{property_name} exceeds the supported millisecond range" + )) + })?; + 
reference_timestamp + .checked_sub(retention_millis) + .ok_or_else(|| { + DeltaTableError::generic(format!( + "Failed to compute retention cutoff for {property_name}" + )) + }) } -fn checkpoint_regex() -> DeltaResult<&'static Regex> { - regex_from_lazy(&CHECKPOINT_REGEX, "checkpoint") +#[derive(Debug, Default)] +pub(crate) struct ReconciledCheckpointState { + pub(crate) protocol: Option, + pub(crate) metadata: Option, + pub(crate) txns: HashMap, + // TODO: Use `(path, dvId)` once replay is deletion-vector aware. + pub(crate) adds: HashMap, + pub(crate) removes: HashMap, } -fn parse_version(regex: &Regex, location: &Path) -> Option { - regex - .captures(location.as_ref()) - .and_then(|caps| caps.get(1)) - .and_then(|m| m.as_str().parse::().ok()) +impl ReconciledCheckpointState { + fn apply_action(&mut self, action: Action) { + match action { + Action::Protocol(protocol) => { + self.protocol = Some(protocol); + } + Action::Metadata(metadata) => { + self.metadata = Some(metadata); + } + Action::Txn(txn) => { + self.txns.insert(txn.app_id.clone(), txn); + } + Action::Add(add) => { + self.removes.remove(&add.path); + self.adds.insert(add.path.clone(), add); + } + Action::Remove(remove) => { + self.adds.remove(&remove.path); + self.removes.insert(remove.path.clone(), remove); + } + // TODO: Preserve DomainMetadata so VersionChecksum can emit it. 
+ Action::CommitInfo(_) + | Action::Cdc(_) + | Action::DomainMetadata(_) + | Action::CheckpointMetadata(_) + | Action::Sidecar(_) => {} + } + } + + pub(crate) fn apply_checkpoint_row(&mut self, row: CheckpointActionRow) -> DeltaResult<()> { + if let Some(protocol) = row.protocol { + self.protocol = Some(protocol); + } + if let Some(metadata) = row.metadata { + self.metadata = Some(metadata); + } + if let Some(txn) = row.txn { + self.txns.insert(txn.app_id.clone(), txn); + } + if let Some(add) = row.add { + self.removes.remove(&add.path); + self.adds.insert(add.path.clone(), add); + } + if let Some(remove) = row.remove { + self.adds.remove(&remove.path); + self.removes.insert(remove.path.clone(), remove); + } + Ok(()) + } + + fn prune_expired_checkpoint_actions(&mut self, reference_timestamp: i64) -> DeltaResult<()> { + let metadata = self.metadata.as_ref().ok_or_else(|| { + DeltaTableError::generic("Cannot prune checkpoint actions without metadata action") + })?; + let retention = CheckpointRetentionTimestamps::try_new(metadata, reference_timestamp)?; + + let txns_before = self.txns.len(); + self.txns.retain(|_, txn| { + txn.last_updated + .map(|last_updated| last_updated > retention.transaction_expiration_timestamp) + .unwrap_or(true) + }); + + let removes_before = self.removes.len(); + self.removes.retain(|_, remove| { + remove + .deletion_timestamp + .map(|deletion_timestamp| { + deletion_timestamp > retention.deleted_file_retention_timestamp + }) + .unwrap_or(true) + }); + + debug!( + "Pruned {} expired txn actions and {} expired remove actions before checkpoint write", + txns_before.saturating_sub(self.txns.len()), + removes_before.saturating_sub(self.removes.len()), + ); + + Ok(()) + } + + // TODO: Make checkpoint creation fully streaming. This iterator removes the + // single-batch peak, but the reconciled state is still fully materialized. 
+ fn into_checkpoint_batch_iter( + self, + batch_size: usize, + ) -> DeltaResult<(CheckpointBatchIter, i64)> { + let protocol = self.protocol.ok_or_else(|| { + DeltaTableError::generic("Cannot create checkpoint without protocol action") + })?; + let metadata = self.metadata.ok_or_else(|| { + DeltaTableError::generic("Cannot create checkpoint without metadata action") + })?; + if batch_size == 0 { + return Err(DeltaTableError::generic( + "checkpoint batch size must be positive", + )); + } + + let add_count = i64::try_from(self.adds.len()) + .map_err(|_| DeltaTableError::generic("add action count overflow"))?; + + Ok(( + CheckpointBatchIter { + batch_size, + leading_rows: VecDeque::from([ + CheckpointActionRow { + protocol: Some(protocol), + ..Default::default() + }, + CheckpointActionRow { + metadata: Some(metadata), + ..Default::default() + }, + ]), + txns: self + .txns + .into_iter() + .collect::>() + .into_iter(), + removes: self + .removes + .into_iter() + .collect::>() + .into_iter(), + adds: self + .adds + .into_iter() + .collect::>() + .into_iter(), + }, + add_count, + )) + } +} + +#[derive(Debug, Default)] +pub(crate) struct ReconciledHeaderState { + pub(crate) protocol: Option, + pub(crate) metadata: Option, + pub(crate) txns: HashMap, } -fn to_rb(data: FilteredEngineData) -> DeltaResult { - let (underlying_data, selection_vector) = data.into_parts(); - let engine_data = ArrowEngineData::try_from_engine_data(underlying_data)?; - let predicate = BooleanArray::from(selection_vector); - let batch = filter_record_batch(engine_data.record_batch(), &predicate)?; - Ok(batch) +impl ReconciledHeaderState { + fn apply_action(&mut self, action: Action) { + match action { + Action::Protocol(protocol) => { + self.protocol = Some(protocol); + } + Action::Metadata(metadata) => { + self.metadata = Some(metadata); + } + Action::Txn(txn) => { + self.txns.insert(txn.app_id.clone(), txn); + } + Action::Add(_) + | Action::Remove(_) + | Action::CommitInfo(_) + | Action::Cdc(_) + 
| Action::DomainMetadata(_) + | Action::CheckpointMetadata(_) + | Action::Sidecar(_) => {} + } + } + + pub(crate) fn apply_checkpoint_row(&mut self, row: CheckpointActionRow) { + if let Some(protocol) = row.protocol { + self.protocol = Some(protocol); + } + if let Some(metadata) = row.metadata { + self.metadata = Some(metadata); + } + if let Some(txn) = row.txn { + self.txns.insert(txn.app_id.clone(), txn); + } + } + + pub(crate) fn from_header(header: &ReplayedTableHeader) -> Self { + Self { + protocol: Some(header.protocol.clone()), + metadata: Some(header.metadata.clone()), + txns: header.txns.as_ref().clone(), + } + } +} + +struct CheckpointBatchIter { + batch_size: usize, + leading_rows: VecDeque, + txns: std::collections::btree_map::IntoIter, + removes: std::collections::btree_map::IntoIter, + adds: std::collections::btree_map::IntoIter, +} + +impl CheckpointBatchIter { + fn next_batch(&mut self) -> DeltaResult> { + let mut rows = Vec::with_capacity(self.batch_size); + + while rows.len() < self.batch_size { + if let Some(row) = self.leading_rows.pop_front() { + rows.push(row); + continue; + } + if let Some((_, txn)) = self.txns.next() { + rows.push(CheckpointActionRow { + txn: Some(txn), + ..Default::default() + }); + continue; + } + if let Some((_, remove)) = self.removes.next() { + rows.push(CheckpointActionRow { + remove: Some(remove), + ..Default::default() + }); + continue; + } + if let Some((_, add)) = self.adds.next() { + rows.push(CheckpointActionRow { + add: Some(add), + ..Default::default() + }); + continue; + } + break; + } + + if rows.is_empty() { + Ok(None) + } else { + Ok(Some(encode_checkpoint_rows(&rows)?)) + } + } +} + +#[derive(Debug, Clone)] +pub(crate) struct ReplayedTableState { + pub version: i64, + pub protocol: Protocol, + pub metadata: Metadata, + pub txns: HashMap, + pub adds: Vec, + pub removes: Vec, + pub commit_timestamps: BTreeMap, +} + +fn encode_checkpoint_rows(rows: &Vec) -> DeltaResult { + let fields = checkpoint_fields()?; + 
serde_arrow::to_record_batch(&fields, rows).map_err(DeltaTableError::generic_err) +} + +pub(crate) fn decode_checkpoint_rows(batch: &RecordBatch) -> DeltaResult> { + serde_arrow::from_record_batch(batch).map_err(DeltaTableError::generic_err) +} + +fn checkpoint_fields() -> DeltaResult> { + let schema = CheckpointActionRow::struct_type(); + schema + .fields() + .map(|field| { + datafusion::arrow::datatypes::Field::try_from(field) + .map(|f| Arc::new(f) as FieldRef) + .map_err(|e| { + DeltaTableError::generic(format!( + "checkpoint schema should convert to Arrow: {e}" + )) + }) + }) + .collect() +} + +fn find_union_path_in_type(dtype: &ArrowDataType, path: &str) -> Option { + match dtype { + ArrowDataType::Union(_, _) => Some(path.to_string()), + ArrowDataType::Struct(fields) => fields.iter().find_map(|f| { + let child_path = format!("{path}.{}", f.name()); + find_union_path_in_type(f.data_type(), &child_path) + }), + ArrowDataType::List(field) + | ArrowDataType::LargeList(field) + | ArrowDataType::FixedSizeList(field, _) => { + let child_path = format!("{path}.{}", field.name()); + find_union_path_in_type(field.data_type(), &child_path) + } + ArrowDataType::Map(field, _) => { + let child_path = format!("{path}.{}", field.name()); + find_union_path_in_type(field.data_type(), &child_path) + } + _ => None, + } +} + +fn ensure_schema_supported_for_parquet(batch: &RecordBatch) -> DeltaResult<()> { + for field in batch.schema().fields() { + let path = field.name().to_string(); + if let Some(union_path) = find_union_path_in_type(field.data_type(), &path) { + return Err(DeltaTableError::generic(format!( + "Unsupported checkpoint schema contains Union type at '{union_path}'" + ))); + } + } + Ok(()) } struct CheckpointManager<'a> { @@ -106,107 +425,219 @@ impl<'a> CheckpointManager<'a> { ))); } - let mut table_root = self.log_store.config().location.clone(); - if !table_root.path().ends_with('/') { - table_root.set_path(&format!("{}/", table_root.path())); + let store = 
self.log_store.object_store(Some(self.operation_id)); + let offset_version = read_last_checkpoint_version_from_store(store.clone()).await; + let offset_version = offset_version + .map(|v| v.min(version).saturating_sub(1)) + .unwrap_or(0); + let log_entries = list_delta_log_entries_from(store.clone(), offset_version).await?; + let mut commit_entries: Vec<(i64, ObjectMeta)> = Vec::new(); + let mut checkpoint_entries: Vec<(i64, ObjectMeta)> = Vec::new(); + for meta in log_entries { + if let Some(v) = parse_commit_version_from_location(&meta.location) { + if v <= version { + commit_entries.push((v, meta)); + } + continue; + } + if let Some(v) = parse_checkpoint_version_from_location(&meta.location) { + if v <= version { + checkpoint_entries.push((v, meta)); + } + } } + commit_entries.sort_by(|(av, _), (bv, _)| av.cmp(bv)); + checkpoint_entries.sort_by(|(av, _), (bv, _)| av.cmp(bv)); - let engine = self.log_store.engine(Some(self.operation_id)); - let version_u64 = version as u64; - - let snapshot = spawn_blocking(move || { - KernelSnapshot::builder_for(table_root) - .at_version(version_u64) - .build(engine.as_ref()) - }) - .await - .map_err(|e| DeltaTableError::generic(e.to_string()))??; - - let cp_writer = snapshot.checkpoint()?; - let cp_url = cp_writer.checkpoint_path()?; - let cp_path = Path::from_url_path(cp_url.path())?; - - // Prepare checkpoint data iterator (sync) in the kernel engine. - let engine = self.log_store.engine(Some(self.operation_id)); - let mut cp_data = cp_writer.checkpoint_data(engine.as_ref())?; - - // Pull the first batch (for schema), but keep the iterator for the producer thread. 
- let (first_batch, mut cp_data_after_first) = spawn_blocking(move || { - let Some(first) = cp_data.next() else { - return Err(DeltaTableError::generic("No checkpoint data".to_string())); - }; - Ok::<_, DeltaTableError>((to_rb(first?)?, cp_data)) - }) - .await - .map_err(|e| DeltaTableError::generic(e.to_string()))??; + let mut state = ReconciledCheckpointState::default(); + let start_commit_version = if let Some((cp_ver, cp_meta)) = checkpoint_entries.pop() { + let rows = read_checkpoint_rows_from_parquet(store.clone(), cp_meta).await?; + for row in rows { + state.apply_checkpoint_row(row)?; + } + cp_ver.saturating_add(1) + } else { + 0 + }; + replay_commit_actions( + &mut state, + store.clone(), + &commit_entries, + start_commit_version, + version, + ) + .await?; + state.prune_expired_checkpoint_actions(Utc::now().timestamp_millis())?; - let checkpoint_schema: SchemaRef = first_batch.schema(); + // Batching avoids one giant RecordBatch, but full-state materialization + // is still the main memory cost here. + const CHECKPOINT_WRITE_BATCH_SIZE: usize = 16_384; + let (mut checkpoint_batches, checkpoint_add_count) = + state.into_checkpoint_batch_iter(CHECKPOINT_WRITE_BATCH_SIZE)?; - // Start writer (consumer) immediately. - let root_store = self.log_store.root_object_store(Some(self.operation_id)); - let object_store_writer = ParquetObjectWriter::new(root_store.clone(), cp_path.clone()); - let mut writer = - AsyncArrowWriter::try_new(object_store_writer, checkpoint_schema.clone(), None) - .map_err(DeltaTableError::generic_err)?; + let Some(first_batch) = checkpoint_batches.next_batch()? 
else { + return Err(DeltaTableError::generic("No checkpoint rows to write")); + }; + ensure_schema_supported_for_parquet(&first_batch)?; + let mut checkpoint_row_count = i64::try_from(first_batch.num_rows()) + .map_err(|_| DeltaTableError::generic("checkpoint action count overflow"))?; + let cp_path = checkpoint_path(version); + let object_store_writer = ParquetObjectWriter::new(store.clone(), cp_path.clone()); + let mut writer = AsyncArrowWriter::try_new(object_store_writer, first_batch.schema(), None) + .map_err(DeltaTableError::generic_err)?; writer .write(&first_batch) .await .map_err(DeltaTableError::generic_err)?; - - // Stream remaining batches from a blocking producer thread to the async writer. - let mut rb_builder = RecordBatchReceiverStreamBuilder::new(4); - let tx = rb_builder.tx(); - let (cp_data_done_tx, cp_data_done_rx) = oneshot::channel(); - - rb_builder.spawn_blocking(move || { - for next in cp_data_after_first.by_ref() { - let batch: DeltaResult = - next.map_err(DeltaTableError::from).and_then(to_rb); - if tx.blocking_send(batch).is_err() { - break; // consumer dropped - } - } - - // Return the exhausted iterator (it contains kernel-side stats used by finalize). - let _ = cp_data_done_tx.send(cp_data_after_first); - Ok(()) - }); - - let mut batch_stream = rb_builder.build(); - while let Some(batch) = batch_stream.next().await { - let batch = batch?; - let batch = if batch.schema() != checkpoint_schema { - cast_record_batch_relaxed_tz(&batch, &checkpoint_schema)? - } else { - batch - }; + while let Some(batch) = checkpoint_batches.next_batch()? { + checkpoint_row_count = + checkpoint_row_count + .checked_add(i64::try_from(batch.num_rows()).map_err(|_| { + DeltaTableError::generic("checkpoint action count overflow") + })?) 
+ .ok_or_else(|| DeltaTableError::generic("checkpoint action count overflow"))?; writer .write(&batch) .await .map_err(DeltaTableError::generic_err)?; } + let _ = writer.close().await.map_err(DeltaTableError::generic_err)?; + let file_meta = store.head(&cp_path).await?; + let last_checkpoint_path = last_checkpoint_path(); + let hint = LastCheckpointHint { + version, + size: Some(checkpoint_row_count), + parts: None, + size_in_bytes: Some(file_meta.size as i64), + num_of_add_files: Some(checkpoint_add_count), + checkpoint_schema: None, + checksum: None, + tags: None, + }; + let hint_bytes = serde_json::to_vec(&hint).map_err(DeltaTableError::generic_err)?; + store.put(&last_checkpoint_path, hint_bytes.into()).await?; - let _pq_meta = writer.close().await.map_err(DeltaTableError::generic_err)?; + Ok(()) + } +} - let file_meta = root_store.head(&cp_path).await?; - let file_meta = FileMeta { - location: cp_url, - size: file_meta.size, - last_modified: file_meta.last_modified.timestamp_millis(), - }; +pub(crate) async fn replay_commit_actions( + state: &mut ReconciledCheckpointState, + root_store: std::sync::Arc, + commit_entries: &[(i64, ObjectMeta)], + start_version: i64, + end_version: i64, +) -> DeltaResult> { + if start_version > end_version { + return Ok(BTreeMap::new()); + } - let cp_data_final = cp_data_done_rx - .await - .map_err(|_| DeltaTableError::generic("checkpoint producer dropped unexpectedly"))?; + let mut expected_version = start_version; + let mut commit_timestamps = BTreeMap::new(); + for (version, meta) in commit_entries { + if *version < start_version || *version > end_version { + continue; + } + if *version != expected_version { + return Err(DeltaTableError::generic(format!( + "Missing commit file while building checkpoint: expected version {expected_version}, found {version}" + ))); + } + let bytes = root_store.get(&meta.location).await?.bytes().await?; + let actions = get_actions(*version, &bytes)?; + let commit_timestamp = 
resolve_commit_timestamp_from_actions( + *version, + meta, + state.protocol.as_ref(), + state.metadata.as_ref(), + &actions, + )?; + for action in actions { + state.apply_action(action); + } + commit_timestamps.insert(*version, commit_timestamp); + expected_version = expected_version.saturating_add(1); + } - let engine = self.log_store.engine(Some(self.operation_id)); - spawn_blocking(move || cp_writer.finalize(engine.as_ref(), &file_meta, cp_data_final)) - .await - .map_err(|e| DeltaTableError::generic(e.to_string()))??; + if expected_version.saturating_sub(1) != end_version { + return Err(DeltaTableError::generic(format!( + "Missing commit file while building checkpoint: expected final version {end_version}, replay reached {}", + expected_version.saturating_sub(1) + ))); + } + Ok(commit_timestamps) +} - Ok(()) +pub(crate) async fn read_checkpoint_rows_from_parquet( + root_store: std::sync::Arc, + meta: ObjectMeta, +) -> DeltaResult> { + let bytes = root_store.get(&meta.location).await?.bytes().await?; + tokio::task::spawn_blocking(move || { + // TODO: V2 checkpoints move add/remove rows into sidecars; full replay + // needs to read those parquet files too. + let mut batches = ParquetRecordBatchReaderBuilder::try_new(bytes) + .map_err(DeltaTableError::generic_err)? + .build() + .map_err(DeltaTableError::generic_err)?; + let mut rows = Vec::new(); + for batch in &mut batches { + let batch = batch.map_err(DeltaTableError::generic_err)?; + let mut decoded = decode_checkpoint_rows(&batch)?; + rows.append(&mut decoded); + } + Ok::<_, DeltaTableError>(rows) + }) + .await + .map_err(DeltaTableError::generic_err)? 
+} + +pub(crate) async fn replay_commit_header_actions( + state: &mut ReconciledHeaderState, + root_store: std::sync::Arc, + commit_entries: &[(i64, ObjectMeta)], + start_version: i64, + end_version: i64, +) -> DeltaResult> { + if start_version > end_version { + return Ok(BTreeMap::new()); } + + let mut expected_version = start_version; + let mut commit_timestamps = BTreeMap::new(); + for (version, meta) in commit_entries { + if *version < start_version || *version > end_version { + continue; + } + if *version != expected_version { + return Err(DeltaTableError::generic(format!( + "Missing commit file while replaying table header: expected version {expected_version}, found {version}" + ))); + } + let bytes = root_store.get(&meta.location).await?.bytes().await?; + let actions = get_actions(*version, &bytes)?; + let commit_timestamp = resolve_commit_timestamp_from_actions( + *version, + meta, + state.protocol.as_ref(), + state.metadata.as_ref(), + &actions, + )?; + for action in actions { + state.apply_action(action); + } + commit_timestamps.insert(*version, commit_timestamp); + expected_version = expected_version.saturating_add(1); + } + + if expected_version.saturating_sub(1) != end_version { + return Err(DeltaTableError::generic(format!( + "Missing commit file while replaying table header: expected final version {end_version}, replay reached {}", + expected_version.saturating_sub(1) + ))); + } + Ok(commit_timestamps) } /// Creates a checkpoint for the given table version. @@ -219,84 +650,553 @@ pub(crate) async fn create_checkpoint_for( .create_checkpoint(version) .await } +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; -/// Delete expired Delta log files up to a safe checkpoint boundary. 
-pub async fn cleanup_expired_logs_for( - mut keep_version: i64, - log_store: &dyn LogStore, - cutoff_timestamp: i64, - operation_id: Option, -) -> DeltaResult { - debug!("called cleanup_expired_logs_for"); - let delta_log_pattern = delta_log_regex()?; - let checkpoint_pattern = checkpoint_regex()?; - let object_store = log_store.object_store(operation_id); - let log_path = Path::from(DELTA_LOG_FOLDER); - - let log_entries = object_store.list(Some(&log_path)).collect::>().await; - - debug!("starting keep_version: {keep_version}"); - debug!( - "starting cutoff_timestamp: {:?}", - Utc.timestamp_millis_opt(cutoff_timestamp).unwrap() - ); - - let min_retention_version = log_entries - .iter() - .filter_map(|entry| entry.as_ref().ok()) - .filter_map(|meta| { - parse_version(delta_log_pattern, &meta.location) - .map(|ver| (ver, meta.last_modified.timestamp_millis())) + use chrono::DateTime; + use datafusion::arrow::datatypes::DataType as ArrowDataType; + use object_store::memory::InMemory; + use object_store::path::Path; + use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; + + use super::{ + checkpoint_fields, decode_checkpoint_rows, encode_checkpoint_rows, + replay_commit_header_actions, ReconciledCheckpointState, ReconciledHeaderState, + }; + use crate::spec::{ + Action, Add, CheckpointActionRow, CommitInfo, DataType, DeletionVectorDescriptor, + DeltaError, DeltaResult, Metadata, Protocol, Remove, StorageType, StructField, StructType, + TableFeature, Transaction, + }; + + fn test_metadata( + configuration: impl IntoIterator, + ) -> DeltaResult { + Metadata::try_new( + None, + None, + StructType::try_new([StructField::not_null("id", DataType::LONG)])?, + Vec::new(), + 0, + configuration + .into_iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect(), + ) + } + + fn commit_meta(version: i64, last_modified_millis: i64) -> DeltaResult { + let last_modified = DateTime::from_timestamp_millis(last_modified_millis) + .ok_or_else(|| 
DeltaError::generic("test timestamp must be valid"))?; + Ok(ObjectMeta { + location: Path::from(format!("_delta_log/{version:020}.json")), + last_modified, + size: 0, + e_tag: None, + version: None, }) - .filter(|(_, ts)| *ts >= cutoff_timestamp) - .map(|(ver, _)| ver) - .min() - .unwrap_or(keep_version); + } - keep_version = keep_version.min(min_retention_version); + async fn put_commit( + store: &Arc, + version: i64, + actions: &[Action], + ) -> DeltaResult<()> { + let mut bytes = Vec::new(); + for (index, action) in actions.iter().enumerate() { + if index > 0 { + bytes.push(b'\n'); + } + serde_json::to_writer(&mut bytes, action)?; + } + store + .put( + &Path::from(format!("_delta_log/{version:020}.json")), + bytes.into(), + ) + .await?; + Ok(()) + } - let safe_checkpoint_version = log_entries - .iter() - .filter_map(|entry| entry.as_ref().ok()) - .filter_map(|meta| parse_version(checkpoint_pattern, &meta.location)) - .filter(|ver| *ver <= keep_version) - .max(); + #[test] + fn checkpoint_row_roundtrip_preserves_add_path() -> DeltaResult<()> { + let rows = vec![CheckpointActionRow { + add: Some(Add { + path: "part-000.parquet".to_string(), + partition_values: HashMap::new(), + size: 10, + modification_time: 20, + data_change: true, + stats: None, + tags: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + clustering_provider: None, + commit_version: None, + commit_timestamp: None, + }), + ..Default::default() + }]; + let batch = encode_checkpoint_rows(&rows)?; + let decoded = decode_checkpoint_rows(&batch)?; + assert_eq!(decoded.len(), 1); + assert_eq!( + decoded + .first() + .and_then(|row| row.add.as_ref()) + .map(|add| add.path.as_str()), + Some("part-000.parquet") + ); + Ok(()) + } - let Some(safe_checkpoint_version) = safe_checkpoint_version else { - debug!( - "Not cleaning metadata files, could not find a checkpoint with version <= keep_version ({keep_version})" + #[test] + fn 
checkpoint_row_roundtrip_preserves_shared_protocol_and_dv_models() -> DeltaResult<()> { + let protocol = Protocol::new( + 3, + 7, + Some(vec![TableFeature::TimestampWithoutTimezone]), + Some(vec![TableFeature::AppendOnly, TableFeature::ColumnMapping]), ); - return Ok(0); - }; + let deletion_vector = DeletionVectorDescriptor { + storage_type: StorageType::Inline, + path_or_inline_dv: "encoded-dv".to_string(), + offset: Some(12), + size_in_bytes: 34, + cardinality: 56, + }; + let rows = vec![ + CheckpointActionRow { + protocol: Some(protocol.clone()), + ..Default::default() + }, + CheckpointActionRow { + add: Some(Add { + path: "part-001.parquet".to_string(), + partition_values: HashMap::new(), + size: 10, + modification_time: 20, + data_change: true, + stats: None, + tags: None, + deletion_vector: Some(deletion_vector.clone()), + base_row_id: Some(1), + default_row_commit_version: Some(2), + clustering_provider: Some("liquid".to_string()), + commit_version: None, + commit_timestamp: None, + }), + ..Default::default() + }, + CheckpointActionRow { + remove: Some(Remove { + path: "part-001.parquet".to_string(), + data_change: true, + deletion_timestamp: Some(30), + extended_file_metadata: Some(true), + partition_values: Some(HashMap::new()), + size: Some(10), + stats: Some("{\"numRecords\":1}".to_string()), + tags: None, + deletion_vector: Some(deletion_vector), + base_row_id: Some(1), + default_row_commit_version: Some(2), + }), + ..Default::default() + }, + ]; + + let batch = encode_checkpoint_rows(&rows)?; + let decoded = decode_checkpoint_rows(&batch)?; - debug!("safe_checkpoint_version: {safe_checkpoint_version}"); + assert_eq!(decoded.len(), 3); + assert_eq!(decoded[0].protocol.as_ref(), Some(&protocol)); + assert_eq!( + decoded[1] + .add + .as_ref() + .and_then(|add| add.deletion_vector.as_ref()) + .map(|dv| (&dv.storage_type, dv.path_or_inline_dv.as_str(), dv.offset)), + Some((&StorageType::Inline, "encoded-dv", Some(12))) + ); + assert_eq!( + decoded[2] + 
.remove + .as_ref() + .and_then(|remove| remove.stats.as_deref()), + Some("{\"numRecords\":1}") + ); - let locations = futures::stream::iter(log_entries.into_iter()) - .filter_map(|meta| async move { - let meta = match meta { - Ok(m) => m, - Err(err) => { - error!("Error received while cleaning up expired logs: {err:?}"); - return None; + Ok(()) + } + + #[test] + fn checkpoint_row_roundtrip_preserves_remove_stats() -> DeltaResult<()> { + let rows = vec![CheckpointActionRow { + remove: Some(Remove { + path: "part-000.parquet".to_string(), + data_change: true, + deletion_timestamp: Some(20), + extended_file_metadata: Some(true), + partition_values: Some(HashMap::new()), + size: Some(10), + stats: Some("{\"numRecords\":1}".to_string()), + tags: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + }), + ..Default::default() + }]; + + let batch = encode_checkpoint_rows(&rows)?; + let decoded = decode_checkpoint_rows(&batch)?; + + assert_eq!(decoded.len(), 1); + assert_eq!( + decoded + .first() + .and_then(|row| row.remove.as_ref()) + .and_then(|remove| remove.stats.as_deref()), + Some("{\"numRecords\":1}") + ); + Ok(()) + } + + #[test] + fn reconciled_state_remove_masks_old_add() { + let mut state = ReconciledCheckpointState::default(); + state.apply_action(Action::Add(Add { + path: "a.parquet".to_string(), + partition_values: HashMap::new(), + size: 1, + modification_time: 1, + data_change: true, + stats: None, + tags: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + clustering_provider: None, + commit_version: None, + commit_timestamp: None, + })); + state.apply_action(Action::Remove(Remove { + path: "a.parquet".to_string(), + data_change: true, + deletion_timestamp: Some(2), + extended_file_metadata: None, + partition_values: None, + size: None, + stats: None, + tags: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + })); + 
assert!(!state.adds.contains_key("a.parquet")); + assert!(state.removes.contains_key("a.parquet")); + } + + #[test] + fn checkpoint_pruning_drops_expired_remove_and_txn_actions() -> DeltaResult<()> { + const DAY_MILLIS: i64 = 24 * 60 * 60 * 1000; + let now = 10 * DAY_MILLIS; + + let mut state = ReconciledCheckpointState::default(); + state.apply_action(Action::Metadata(test_metadata([ + ("delta.deletedFileRetentionDuration", "interval 7 days"), + ("delta.logRetentionDuration", "interval 30 days"), + ])?)); + state.apply_action(Action::Remove(Remove { + path: "expired.parquet".to_string(), + data_change: true, + deletion_timestamp: Some(now - 8 * DAY_MILLIS), + extended_file_metadata: None, + partition_values: None, + size: None, + stats: None, + tags: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + })); + state.apply_action(Action::Remove(Remove { + path: "fresh.parquet".to_string(), + data_change: true, + deletion_timestamp: Some(now - 6 * DAY_MILLIS), + extended_file_metadata: None, + partition_values: None, + size: None, + stats: None, + tags: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + })); + state.apply_action(Action::Remove(Remove { + path: "unknown-ts.parquet".to_string(), + data_change: true, + deletion_timestamp: None, + extended_file_metadata: None, + partition_values: None, + size: None, + stats: None, + tags: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + })); + state.apply_action(Action::Txn(Transaction { + app_id: "expired-app".to_string(), + version: 1, + last_updated: Some(now - 31 * DAY_MILLIS), + })); + state.apply_action(Action::Txn(Transaction { + app_id: "fresh-app".to_string(), + version: 2, + last_updated: Some(now - 29 * DAY_MILLIS), + })); + state.apply_action(Action::Txn(Transaction { + app_id: "legacy-app".to_string(), + version: 3, + last_updated: None, + })); + + state.prune_expired_checkpoint_actions(now)?; 
+ + assert!(!state.removes.contains_key("expired.parquet")); + assert!(state.removes.contains_key("fresh.parquet")); + assert!(state.removes.contains_key("unknown-ts.parquet")); + assert!(!state.txns.contains_key("expired-app")); + assert!(state.txns.contains_key("fresh-app")); + assert!(state.txns.contains_key("legacy-app")); + Ok(()) + } + + #[test] + fn checkpoint_pruning_uses_latest_metadata_configuration() -> DeltaResult<()> { + const DAY_MILLIS: i64 = 24 * 60 * 60 * 1000; + let now = 3 * DAY_MILLIS; + + let mut state = ReconciledCheckpointState::default(); + state.apply_action(Action::Metadata(test_metadata([ + ("delta.deletedFileRetentionDuration", "interval 1 day"), + ("delta.logRetentionDuration", "interval 1 day"), + ])?)); + state.apply_action(Action::Remove(Remove { + path: "older-remove.parquet".to_string(), + data_change: true, + deletion_timestamp: Some(now - 2 * DAY_MILLIS), + extended_file_metadata: None, + partition_values: None, + size: None, + stats: None, + tags: None, + deletion_vector: None, + base_row_id: None, + default_row_commit_version: None, + })); + state.apply_action(Action::Metadata(test_metadata([ + ("delta.deletedFileRetentionDuration", "interval 30 days"), + ("delta.logRetentionDuration", "interval 30 days"), + ])?)); + + state.prune_expired_checkpoint_actions(now)?; + + assert!(state.removes.contains_key("older-remove.parquet")); + Ok(()) + } + + #[test] + fn checkpoint_schema_keeps_protocol_and_metadata_fields() { + #[expect(clippy::expect_used)] + let fields = checkpoint_fields().expect("checkpoint fields should build"); + let metadata_has_configuration = fields + .iter() + .find(|field| field.name() == "metaData") + .and_then(|field| match field.data_type() { + ArrowDataType::Struct(fields) => { + Some(fields.iter().any(|field| field.name() == "configuration")) + } + _ => None, + }); + let protocol_has_reader_features = fields + .iter() + .find(|field| field.name() == "protocol") + .and_then(|field| match field.data_type() { + 
ArrowDataType::Struct(fields) => { + Some(fields.iter().any(|field| field.name() == "readerFeatures")) } - }; + _ => None, + }); - let ts = meta.last_modified.timestamp_millis(); - let log_ver = parse_version(delta_log_pattern, &meta.location)?; + assert_eq!(metadata_has_configuration, Some(true)); + assert_eq!(protocol_has_reader_features, Some(true)); + } - if log_ver < safe_checkpoint_version && ts <= cutoff_timestamp { - Some(Ok(meta.location)) - } else { - None - } - }) - .boxed(); + #[test] + fn checkpoint_schema_keeps_remove_stats_field() { + #[expect(clippy::expect_used)] + let fields = checkpoint_fields().expect("checkpoint fields should build"); + let remove_has_stats = fields + .iter() + .find(|field| field.name() == "remove") + .and_then(|field| match field.data_type() { + ArrowDataType::Struct(fields) => { + Some(fields.iter().any(|field| field.name() == "stats")) + } + _ => None, + }); + + assert_eq!(remove_has_stats, Some(true)); + } + + #[test] + fn checkpoint_schema_reuses_shared_payload_types() { + #[expect(clippy::expect_used)] + let fields = checkpoint_fields().expect("checkpoint fields should build"); + #[expect(clippy::expect_used)] + let expected_add = + ArrowDataType::try_from(&crate::spec::DataType::from(crate::spec::add_struct_type())) + .expect("shared add schema should convert to Arrow"); + #[expect(clippy::expect_used)] + let expected_metadata = ArrowDataType::try_from(&crate::spec::DataType::from( + crate::spec::metadata_struct_type(), + )) + .expect("shared metadata schema should convert to Arrow"); + + let add_type = fields + .iter() + .find(|field| field.name() == "add") + .map(|field| field.data_type().clone()); + let metadata_type = fields + .iter() + .find(|field| field.name() == "metaData") + .map(|field| field.data_type().clone()); + + assert_eq!(add_type, Some(expected_add)); + assert_eq!(metadata_type, Some(expected_metadata)); + } + + #[tokio::test] + async fn replay_commit_header_actions_prefers_in_commit_timestamp() -> 
DeltaResult<()> { + let store: Arc = Arc::new(InMemory::new()); + let protocol = Protocol::new(1, 7, None, Some(vec![TableFeature::InCommitTimestamp])); + let metadata = test_metadata([("delta.enableInCommitTimestamps", "true")])?; + put_commit( + &store, + 0, + &[ + Action::CommitInfo(CommitInfo { + in_commit_timestamp: Some(123), + ..Default::default() + }), + Action::Protocol(protocol.clone()), + Action::Metadata(metadata.clone()), + ], + ) + .await?; + + let commit_meta = commit_meta(0, 9_999)?; + let timestamps = replay_commit_header_actions( + &mut ReconciledHeaderState::default(), + store, + &[(0, commit_meta)], + 0, + 0, + ) + .await?; + + assert_eq!(timestamps.get(&0), Some(&123)); + Ok(()) + } + + #[tokio::test] + async fn replay_commit_header_actions_falls_back_to_mtime_before_enablement() -> DeltaResult<()> + { + let store: Arc = Arc::new(InMemory::new()); + let protocol = Protocol::new(1, 2, None, None); + let metadata = test_metadata([])?; + put_commit( + &store, + 0, + &[ + Action::CommitInfo(CommitInfo::default()), + Action::Protocol(protocol.clone()), + Action::Metadata(metadata.clone()), + ], + ) + .await?; - let deleted = object_store - .delete_stream(locations) - .try_collect::>() + let commit_meta = commit_meta(0, 4_567)?; + let timestamps = replay_commit_header_actions( + &mut ReconciledHeaderState::default(), + store, + &[(0, commit_meta)], + 0, + 0, + ) .await?; - debug!("Deleted {} expired logs", deleted.len()); - Ok(deleted.len()) + assert_eq!(timestamps.get(&0), Some(&4_567)); + Ok(()) + } + + #[tokio::test] + async fn replay_commit_header_actions_ignores_pre_enable_ict_before_upgrade() -> DeltaResult<()> + { + let store: Arc = Arc::new(InMemory::new()); + let pre_enable_protocol = Protocol::new(1, 2, None, None); + let pre_enable_metadata = test_metadata([])?; + let enabled_protocol = + Protocol::new(1, 7, None, Some(vec![TableFeature::InCommitTimestamp])); + let enabled_metadata = test_metadata([ + ("delta.enableInCommitTimestamps", 
"true"), + ("delta.inCommitTimestampEnablementVersion", "1"), + ("delta.inCommitTimestampEnablementTimestamp", "300"), + ])?; + put_commit( + &store, + 0, + &[ + Action::CommitInfo(CommitInfo { + in_commit_timestamp: Some(10_000), + ..Default::default() + }), + Action::Protocol(pre_enable_protocol), + Action::Metadata(pre_enable_metadata), + ], + ) + .await?; + put_commit( + &store, + 1, + &[ + Action::CommitInfo(CommitInfo { + in_commit_timestamp: Some(300), + ..Default::default() + }), + Action::Protocol(enabled_protocol), + Action::Metadata(enabled_metadata), + ], + ) + .await?; + + let timestamps = replay_commit_header_actions( + &mut ReconciledHeaderState::default(), + store, + &[(0, commit_meta(0, 4_567)?), (1, commit_meta(1, 9_999)?)], + 0, + 1, + ) + .await?; + + assert_eq!(timestamps.get(&0), Some(&4_567)); + assert_eq!(timestamps.get(&1), Some(&300)); + Ok(()) + } } diff --git a/crates/sail-delta-lake/src/kernel/error.rs b/crates/sail-delta-lake/src/kernel/error.rs deleted file mode 100644 index 6377d530f8..0000000000 --- a/crates/sail-delta-lake/src/kernel/error.rs +++ /dev/null @@ -1,19 +0,0 @@ -// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt -// -// Copyright (2020) QP Hou and a number of other contributors. -// Portions Copyright (2025) LakeSail, Inc. -// Modified in 2025 by LakeSail, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -pub use crate::error::{DeltaError as DeltaTableError, DeltaResult}; diff --git a/crates/sail-delta-lake/src/kernel/log_segment.rs b/crates/sail-delta-lake/src/kernel/log_segment.rs new file mode 100644 index 0000000000..6bdd7b362d --- /dev/null +++ b/crates/sail-delta-lake/src/kernel/log_segment.rs @@ -0,0 +1,84 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +pub(crate) use crate::delta_log::ReplayedTableHeader; +use crate::delta_log::{list_log_files, read_last_checkpoint_version_from_store}; +use crate::spec::DeltaResult; + +/// The minimal set of Delta log files needed to reconstruct table state up to a given version. +#[derive(Debug, Clone, Default)] +pub struct LogSegmentFiles { + /// Parquet checkpoint files for the latest checkpoint at or before `max_version`. + pub checkpoint_files: Vec, + /// Commit JSON files sorted by version, strictly newer than the latest checkpoint. + pub commit_files: Vec, +} + +/// Options controlling which commit files are included in the resolved segment. +#[derive(Debug, Clone, Copy, Default)] +pub struct LogSegmentResolveOptions { + /// If set, only include commit JSON files whose version falls within `[start, end]`. + pub commit_version_range: Option<(i64, i64)>, +} + +/// List all Delta log files up to `max_version` from the given log store. 
+/// +/// Returns a [`LogSegmentFiles`] containing: +/// - all parquet files belonging to the **latest** checkpoint at or before `max_version` +/// - all commit JSON files at or before `max_version` +/// +/// Commit files are **not** filtered against the checkpoint here. +pub async fn list_log_segment_files( + log_store: &crate::storage::LogStoreRef, + max_version: i64, +) -> DeltaResult { + let store = log_store.object_store(None); + let offset_version = read_last_checkpoint_version_from_store(store.clone()) + .await + .map(|v| v.min(max_version).saturating_sub(1)) + .unwrap_or(0); + + let (_, checkpoint_meta, commit_metas) = + list_log_files(store, offset_version, max_version).await?; + + let mut checkpoint_files: Vec = match checkpoint_meta { + Some(meta) => { + let filename = meta + .location + .as_ref() + .rsplit('/') + .next() + .unwrap_or_default() + .to_string(); + vec![filename] + } + None => Vec::new(), + }; + checkpoint_files.sort(); + + let mut commit_files: Vec = commit_metas + .into_iter() + .filter_map(|(_, meta)| { + meta.location + .as_ref() + .rsplit('/') + .next() + .map(|s| s.to_string()) + }) + .collect(); + commit_files.sort(); + + Ok(LogSegmentFiles { + checkpoint_files, + commit_files, + }) +} diff --git a/crates/sail-delta-lake/src/kernel/mod.rs b/crates/sail-delta-lake/src/kernel/mod.rs index 02c24b66ee..b9ae599e5f 100644 --- a/crates/sail-delta-lake/src/kernel/mod.rs +++ b/crates/sail-delta-lake/src/kernel/mod.rs @@ -10,25 +10,14 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-pub mod arrow; -pub mod models; +pub mod log_segment; pub mod snapshot; -pub mod statistics; pub mod transaction; pub(crate) mod checkpoints; mod config; -mod error; -mod operation; -mod table_properties; - -use std::sync::LazyLock; pub use config::DeltaTableConfig; -use delta_kernel::engine::arrow_expression::ArrowEvaluationHandler; -pub use error::{DeltaResult, DeltaTableError}; -pub use operation::{DeltaOperation, MergePredicate, SaveMode}; -pub use table_properties::TablePropertiesExt; -pub(crate) static ARROW_HANDLER: LazyLock = - LazyLock::new(|| ArrowEvaluationHandler {}); +pub use crate::spec::operation::{DeltaOperation, MergePredicate, SaveMode}; +pub use crate::spec::SchemaRef; diff --git a/crates/sail-delta-lake/src/kernel/models/metadata.rs b/crates/sail-delta-lake/src/kernel/models/metadata.rs deleted file mode 100644 index 27006ecdb0..0000000000 --- a/crates/sail-delta-lake/src/kernel/models/metadata.rs +++ /dev/null @@ -1,137 +0,0 @@ -// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt -// -// Copyright (2020) QP Hou and a number of other contributors. -// Portions Copyright (2025) LakeSail, Inc. -// Modified in 2025 by LakeSail, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// [Credit]: -use std::collections::HashMap; - -use delta_kernel::actions::Metadata; -use serde_json::json; - -use super::StructType; -use crate::kernel::DeltaResult; - -/// Extension trait for working with `Metadata` until kernel exposes mutation APIs. -#[expect(dead_code)] -pub trait MetadataExt { - fn with_table_id(self, table_id: String) -> DeltaResult; - - fn with_name(self, name: String) -> DeltaResult; - - fn with_description(self, description: String) -> DeltaResult; - - fn with_schema(self, schema: &StructType) -> DeltaResult; - - fn add_config_key(self, key: String, value: String) -> DeltaResult; - - fn remove_config_key(self, key: &str) -> DeltaResult; -} - -impl MetadataExt for Metadata { - fn with_table_id(self, table_id: String) -> DeltaResult { - let schema_string = serde_json::to_string(&self.parse_schema()?)?; - let value = json!({ - "id": table_id, - "name": self.name(), - "description": self.description(), - "format": { "provider": "parquet", "options": {} }, - "schemaString": schema_string, - "partitionColumns": self.partition_columns().clone(), - "configuration": self.configuration().clone(), - "createdTime": self.created_time(), - }); - Ok(serde_json::from_value(value)?) - } - - fn with_name(self, name: String) -> DeltaResult { - let schema_string = serde_json::to_string(&self.parse_schema()?)?; - let value = json!({ - "id": self.id(), - "name": name, - "description": self.description(), - "format": { "provider": "parquet", "options": {} }, - "schemaString": schema_string, - "partitionColumns": self.partition_columns().clone(), - "configuration": self.configuration().clone(), - "createdTime": self.created_time(), - }); - Ok(serde_json::from_value(value)?) 
- } - - fn with_description(self, description: String) -> DeltaResult { - let schema_string = serde_json::to_string(&self.parse_schema()?)?; - let value = json!({ - "id": self.id(), - "name": self.name(), - "description": description, - "format": { "provider": "parquet", "options": {} }, - "schemaString": schema_string, - "partitionColumns": self.partition_columns().clone(), - "configuration": self.configuration().clone(), - "createdTime": self.created_time(), - }); - Ok(serde_json::from_value(value)?) - } - - fn with_schema(self, schema: &StructType) -> DeltaResult { - let value = json!({ - "id": self.id(), - "name": self.name(), - "description": self.description(), - "format": { "provider": "parquet", "options": {} }, - "schemaString": serde_json::to_string(schema)?, - "partitionColumns": self.partition_columns().clone(), - "configuration": self.configuration().clone(), - "createdTime": self.created_time(), - }); - Ok(serde_json::from_value(value)?) - } - - fn add_config_key(self, key: String, value: String) -> DeltaResult { - let mut configuration: HashMap = self.configuration().clone(); - configuration.insert(key, value); - let schema_string = serde_json::to_string(&self.parse_schema()?)?; - let value = json!({ - "id": self.id(), - "name": self.name(), - "description": self.description(), - "format": { "provider": "parquet", "options": {} }, - "schemaString": schema_string, - "partitionColumns": self.partition_columns().clone(), - "configuration": configuration, - "createdTime": self.created_time(), - }); - Ok(serde_json::from_value(value)?) 
- } - - fn remove_config_key(self, key: &str) -> DeltaResult { - let mut configuration: HashMap = self.configuration().clone(); - configuration.remove(key); - let schema_string = serde_json::to_string(&self.parse_schema()?)?; - let value = json!({ - "id": self.id(), - "name": self.name(), - "description": self.description(), - "format": { "provider": "parquet", "options": {} }, - "schemaString": schema_string, - "partitionColumns": self.partition_columns().clone(), - "configuration": configuration, - "createdTime": self.created_time(), - }); - Ok(serde_json::from_value(value)?) - } -} diff --git a/crates/sail-delta-lake/src/kernel/models/mod.rs b/crates/sail-delta-lake/src/kernel/models/mod.rs deleted file mode 100644 index ab5c9765e9..0000000000 --- a/crates/sail-delta-lake/src/kernel/models/mod.rs +++ /dev/null @@ -1,42 +0,0 @@ -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -mod actions; -mod metadata; - -pub use actions::{ - Action, Add, CommitInfo, DeletionVectorDescriptor, Remove, RemoveOptions, StorageType, - Transaction, -}; -pub use delta_kernel::actions::{Metadata, Protocol}; -pub use delta_kernel::schema::{DataType, Schema, StructField, StructType}; -pub use metadata::MetadataExt; - -pub use super::statistics::{ColumnCountStat, ColumnValueStat, Stats}; -pub use crate::conversion::ScalarExt; - -// [Credit]: -/// Checks if any field (including nested) in the provided iterator is a `timestampNtz`. 
-pub fn contains_timestampntz<'a>(mut fields: impl Iterator) -> bool { - fn has_timestamp(dtype: &DataType) -> bool { - match dtype { - &DataType::TIMESTAMP_NTZ => true, - DataType::Array(inner) => has_timestamp(inner.element_type()), - DataType::Struct(struct_type) => { - struct_type.fields().any(|f| has_timestamp(f.data_type())) - } - _ => false, - } - } - - fields.any(|field| has_timestamp(field.data_type())) -} diff --git a/crates/sail-delta-lake/src/kernel/snapshot/iterators.rs b/crates/sail-delta-lake/src/kernel/snapshot/iterators.rs deleted file mode 100644 index c90a3df00f..0000000000 --- a/crates/sail-delta-lake/src/kernel/snapshot/iterators.rs +++ /dev/null @@ -1,463 +0,0 @@ -// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt -// -// Copyright (2020) QP Hou and a number of other contributors. -// Portions Copyright (2025) LakeSail, Inc. -// Modified in 2025 by LakeSail, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// [Credit]: - -use std::borrow::Cow; -use std::collections::HashMap; -use std::str::FromStr; -use std::sync::LazyLock; - -// TODO: Stop depending on delta-rs StorageType. 
-use chrono::{DateTime, Utc}; -use datafusion::arrow::array::cast::AsArray; -use datafusion::arrow::array::types::Int64Type; -use datafusion::arrow::array::{Array, RecordBatch, StructArray}; -use datafusion::arrow::datatypes::{DataType as ArrowDataType, Int32Type}; -use delta_kernel::expressions::{Scalar, StructData}; -use delta_kernel::scan::scan_row_schema; -use delta_kernel::schema::DataType; -use percent_encoding::percent_decode_str; - -use crate::kernel::models::{Add, DeletionVectorDescriptor, Remove, ScalarExt, StorageType}; -use crate::kernel::{DeltaResult, DeltaTableError}; - -const FIELD_NAME_PATH: &str = "path"; -const FIELD_NAME_SIZE: &str = "size"; -const FIELD_NAME_MODIFICATION_TIME: &str = "modificationTime"; -const FIELD_NAME_STATS: &str = "stats"; -const FIELD_NAME_STATS_PARSED: &str = "stats_parsed"; -#[expect(dead_code)] -const FIELD_NAME_FILE_CONSTANT_VALUES: &str = "fileConstantValues"; -#[expect(dead_code)] -const FIELD_NAME_PARTITION_VALUES: &str = "partitionValues"; -const FIELD_NAME_PARTITION_VALUES_PARSED: &str = "partitionValues_parsed"; -const FIELD_NAME_DELETION_VECTOR: &str = "deletionVector"; - -const STATS_FIELD_NUM_RECORDS: &str = "numRecords"; -const STATS_FIELD_MIN_VALUES: &str = "minValues"; -const STATS_FIELD_MAX_VALUES: &str = "maxValues"; -const STATS_FIELD_NULL_COUNT: &str = "nullCount"; - -const DV_FIELD_STORAGE_TYPE: &str = "storageType"; -const DV_FIELD_PATH_OR_INLINE_DV: &str = "pathOrInlineDv"; -const DV_FIELD_SIZE_IN_BYTES: &str = "sizeInBytes"; -const DV_FIELD_CARDINALITY: &str = "cardinality"; -const DV_FIELD_OFFSET: &str = "offset"; - -static FIELD_INDICES: LazyLock> = LazyLock::new(|| { - let schema = scan_row_schema(); - let mut indices = HashMap::new(); - - if let Some(path_idx) = schema.index_of(FIELD_NAME_PATH) { - indices.insert(FIELD_NAME_PATH, path_idx); - } - - if let Some(size_idx) = schema.index_of(FIELD_NAME_SIZE) { - indices.insert(FIELD_NAME_SIZE, size_idx); - } - - if let Some(modification_time_idx) = 
schema.index_of(FIELD_NAME_MODIFICATION_TIME) { - indices.insert(FIELD_NAME_MODIFICATION_TIME, modification_time_idx); - } - - if let Some(stats_idx) = schema.index_of(FIELD_NAME_STATS) { - indices.insert(FIELD_NAME_STATS, stats_idx); - } - - indices -}); - -static DV_FIELD_INDICES: LazyLock> = LazyLock::new(|| { - let schema = scan_row_schema(); - let mut indices = HashMap::new(); - - if let Some(dv_field) = schema.field(FIELD_NAME_DELETION_VECTOR) { - if let DataType::Struct(dv_type) = dv_field.data_type() { - if let Some(storage_type_idx) = dv_type.index_of(DV_FIELD_STORAGE_TYPE) { - indices.insert(DV_FIELD_STORAGE_TYPE, storage_type_idx); - } - - if let Some(path_or_inline_dv_idx) = dv_type.index_of(DV_FIELD_PATH_OR_INLINE_DV) { - indices.insert(DV_FIELD_PATH_OR_INLINE_DV, path_or_inline_dv_idx); - } - - if let Some(size_in_bytes_idx) = dv_type.index_of(DV_FIELD_SIZE_IN_BYTES) { - indices.insert(DV_FIELD_SIZE_IN_BYTES, size_in_bytes_idx); - } - - if let Some(cardinality_idx) = dv_type.index_of(DV_FIELD_CARDINALITY) { - indices.insert(DV_FIELD_CARDINALITY, cardinality_idx); - } - } - } - - indices -}); - -/// Provides semantic, typed access to file metadata from Delta log replay. -/// -/// This struct wraps a RecordBatch containing file data and provides zero-copy -/// access to individual file entries through an index. It serves as a view into -/// the kernel's log replay results, offering convenient methods to extract -/// file properties without unnecessary data copies. -#[derive(Clone)] -pub struct LogicalFileView { - files: RecordBatch, - index: usize, -} - -impl LogicalFileView { - /// Creates a new view into the specified file entry. - pub(crate) fn new(files: RecordBatch, index: usize) -> Self { - Self { files, index } - } - - /// Returns the file path with URL decoding applied. 
- pub fn path(&self) -> Cow<'_, str> { - if let Some(&path_idx) = FIELD_INDICES.get(FIELD_NAME_PATH) { - if let Some(raw) = get_string_value(self.files.column(path_idx), self.index) { - return percent_decode_str(raw).decode_utf8_lossy(); - } - } - Cow::Borrowed("") - } - - /// Returns the file size in bytes. - pub fn size(&self) -> i64 { - if let Some(&size_idx) = FIELD_INDICES.get(FIELD_NAME_SIZE) { - self.files - .column(size_idx) - .as_primitive::() - .value(self.index) - } else { - 0 - } - } - - /// Returns the file modification time in milliseconds since Unix epoch. - pub fn modification_time(&self) -> i64 { - if let Some(&mod_time_idx) = FIELD_INDICES.get(FIELD_NAME_MODIFICATION_TIME) { - self.files - .column(mod_time_idx) - .as_primitive::() - .value(self.index) - } else { - 0 - } - } - - /// Returns the file modification time as a UTC DateTime. - pub fn modification_datetime(&self) -> DeltaResult> { - DateTime::from_timestamp_millis(self.modification_time()).ok_or_else(|| { - DeltaTableError::generic(format!( - "invalid modification_time: {:?}", - self.modification_time() - )) - }) - } - - /// Returns the raw JSON statistics string for this file, if available. - pub fn stats(&self) -> Option<&str> { - FIELD_INDICES - .get(FIELD_NAME_STATS) - .and_then(|&stats_idx| get_string_value(self.files.column(stats_idx), self.index)) - } - - /// Returns the parsed partition values as structured data. - pub fn partition_values(&self) -> Option { - self.files - .column_by_name(FIELD_NAME_PARTITION_VALUES_PARSED) - .and_then(|col| col.as_struct_opt()) - .and_then(|arr| { - arr.is_valid(self.index) - .then(|| match Scalar::from_array(arr, self.index) { - Some(Scalar::Struct(s)) => Some(s), - _ => None, - }) - .flatten() - }) - } - - /// Converts partition values to a map of column names to serialized values. 
- fn partition_values_map(&self) -> HashMap> { - self.partition_values() - .map(|data| { - data.fields() - .iter() - .zip(data.values().iter()) - .map(|(k, v)| { - ( - k.name().to_string(), - if v.is_null() { - None - } else { - Some(v.serialize().into_owned()) - }, - ) - }) - .collect() - }) - .unwrap_or_default() - } - - /// Returns the parsed statistics as a StructArray, if available. - fn stats_parsed(&self) -> Option<&StructArray> { - self.files - .column_by_name(FIELD_NAME_STATS_PARSED) - .and_then(|col| col.as_struct_opt()) - } - - /// Returns the number of records in this file. - pub fn num_records(&self) -> Option { - self.stats_parsed() - .and_then(|stats| stats.column_by_name(STATS_FIELD_NUM_RECORDS)) - .and_then(|col| col.as_primitive_opt::()) - .map(|a| a.value(self.index) as usize) - } - - /// Returns null counts for all columns in this file as structured data. - pub fn null_counts(&self) -> Option { - self.stats_parsed() - .and_then(|stats| stats.column_by_name(STATS_FIELD_NULL_COUNT)) - .and_then(|c| Scalar::from_array(c.as_ref(), self.index)) - } - - /// Returns minimum values for all columns with statics in this file as structured data. - pub fn min_values(&self) -> Option { - self.stats_parsed() - .and_then(|stats| stats.column_by_name(STATS_FIELD_MIN_VALUES)) - .and_then(|c| Scalar::from_array(c.as_ref(), self.index)) - } - - /// Returns maximum values for all columns in this file as structured data. - /// - /// For timestamp columns, values are rounded up to handle microsecond truncation - /// in checkpoint statistics. - pub fn max_values(&self) -> Option { - self.stats_parsed() - .and_then(|stats| stats.column_by_name(STATS_FIELD_MAX_VALUES)) - .and_then(|c| Scalar::from_array(c.as_ref(), self.index)) - .map(|s| round_ms_datetimes(s, &ceil_datetime)) - } - - /// Returns a view into the deletion vector for this file, if present. 
- fn deletion_vector(&self) -> Option> { - let dv_col = self - .files - .column_by_name(FIELD_NAME_DELETION_VECTOR) - .and_then(|col| col.as_struct_opt())?; - if dv_col.null_count() == dv_col.len() { - return None; - } - dv_col - .is_valid(self.index) - .then(|| { - DV_FIELD_INDICES - .get(DV_FIELD_STORAGE_TYPE) - .and_then(|&storage_idx| { - let storage_col = dv_col.column(storage_idx); - storage_col - .is_valid(self.index) - .then_some(DeletionVectorView { - data: dv_col, - index: self.index, - }) - }) - }) - .flatten() - } - - /// Converts this file view into an Add action for log operations. - pub(crate) fn add_action(&self) -> Add { - Add { - path: self.path().to_string(), - partition_values: self.partition_values_map(), - size: self.size(), - modification_time: self.modification_time(), - data_change: true, - stats: self.stats().map(|v| v.to_string()), - tags: None, - deletion_vector: self.deletion_vector().map(|dv| dv.descriptor()), - base_row_id: None, - default_row_commit_version: None, - clustering_provider: None, - commit_version: None, - commit_timestamp: None, - } - } - - /// Converts this file view into a Remove action for log operations. - pub fn remove_action(&self, data_change: bool) -> Remove { - Remove { - // TODO use the raw (still encoded) path here once we reconciled serde ... - path: self.path().to_string(), - data_change, - deletion_timestamp: Some(Utc::now().timestamp_millis()), - extended_file_metadata: Some(true), - size: Some(self.size()), - partition_values: Some(self.partition_values_map()), - deletion_vector: self.deletion_vector().map(|dv| dv.descriptor()), - tags: None, - base_row_id: None, - default_row_commit_version: None, - } - } -} - -/// Rounds up timestamp values to handle microsecond truncation in checkpoint statistics. -/// -/// When delta.checkpoint.writeStatsAsStruct is enabled, microsecond timestamps are -/// truncated to milliseconds. 
This function rounds up by 1ms to ensure correct -/// range queries when stats are parsed on-the-fly. -fn ceil_datetime(v: i64) -> i64 { - let remainder = v % 1000; - if remainder == 0 { - // if nanoseconds precision remainder is 0, we assume it was truncated - // else we use the exact stats - ((v as f64 / 1000.0).floor() as i64 + 1) * 1000 - } else { - v - } -} - -/// Recursively applies a rounding function to timestamp values in scalar data. -fn round_ms_datetimes(value: Scalar, func: &F) -> Scalar -where - F: Fn(i64) -> i64, -{ - match value { - Scalar::Timestamp(v) => Scalar::Timestamp(func(v)), - Scalar::TimestampNtz(v) => Scalar::TimestampNtz(func(v)), - Scalar::Struct(ref struct_data) => { - let mut fields = Vec::new(); - let mut scalars = Vec::new(); - - for (field, scalar_value) in - struct_data.fields().iter().zip(struct_data.values().iter()) - { - fields.push(field.clone()); - scalars.push(round_ms_datetimes(scalar_value.clone(), func)); - } - match StructData::try_new(fields, scalars) { - Ok(data) => Scalar::Struct(data), - Err(_) => value, // Return original value if struct creation fails - } - } - other => other, - } -} - -/// Provides typed access to deletion vector metadata from log data. -/// -/// This struct wraps a StructArray containing deletion vector information -/// and provides zero-copy access to individual fields through an index. -#[derive(Debug)] -struct DeletionVectorView<'a> { - data: &'a StructArray, - /// Index into the deletion vector data array. - index: usize, -} - -impl DeletionVectorView<'_> { - /// Converts this view into a DeletionVectorDescriptor. 
- fn descriptor(&self) -> DeletionVectorDescriptor { - let storage_type = - StorageType::from_str(self.storage_type()).unwrap_or(StorageType::UuidRelativePath); - DeletionVectorDescriptor { - storage_type, - path_or_inline_dv: self.path_or_inline_dv().to_string(), - size_in_bytes: self.size_in_bytes(), - cardinality: self.cardinality(), - offset: self.offset(), - } - } - - /// Returns the storage type of the deletion vector. - fn storage_type(&self) -> &str { - DV_FIELD_INDICES - .get(DV_FIELD_STORAGE_TYPE) - .and_then(|&idx| get_string_value(self.data.column(idx), self.index)) - .unwrap_or("") - } - - /// Returns the path or inline data for the deletion vector. - fn path_or_inline_dv(&self) -> &str { - DV_FIELD_INDICES - .get(DV_FIELD_PATH_OR_INLINE_DV) - .and_then(|&idx| get_string_value(self.data.column(idx), self.index)) - .unwrap_or("") - } - - /// Returns the size of the deletion vector in bytes. - fn size_in_bytes(&self) -> i32 { - DV_FIELD_INDICES - .get(DV_FIELD_SIZE_IN_BYTES) - .map(|&idx| { - self.data - .column(idx) - .as_primitive::() - .value(self.index) - }) - .unwrap_or(0) - } - - /// Returns the number of deleted rows represented by this deletion vector. - fn cardinality(&self) -> i64 { - DV_FIELD_INDICES - .get(DV_FIELD_CARDINALITY) - .map(|&idx| { - self.data - .column(idx) - .as_primitive::() - .value(self.index) - }) - .unwrap_or(0) - } - - /// Returns the offset within the deletion vector file, if applicable. - fn offset(&self) -> Option { - let col = self - .data - .column_by_name(DV_FIELD_OFFSET) - .map(|c| c.as_primitive::())?; - col.is_valid(self.index).then(|| col.value(self.index)) - } -} - -/// Extracts a string value from an Arrow array at the specified index. -/// -/// Handles different string array types (Utf8, LargeUtf8, Utf8View) and -/// returns None for null values or unsupported types. 
-fn get_string_value(data: &dyn Array, index: usize) -> Option<&str> { - match data.data_type() { - ArrowDataType::Utf8 => { - let arr = data.as_string::(); - arr.is_valid(index).then(|| arr.value(index)) - } - ArrowDataType::LargeUtf8 => { - let arr = data.as_string::(); - arr.is_valid(index).then(|| arr.value(index)) - } - ArrowDataType::Utf8View => { - let arr = data.as_string_view(); - arr.is_valid(index).then(|| arr.value(index)) - } - _ => None, - } -} diff --git a/crates/sail-delta-lake/src/kernel/snapshot/log_data.rs b/crates/sail-delta-lake/src/kernel/snapshot/log_data.rs deleted file mode 100644 index 2dfaa3c513..0000000000 --- a/crates/sail-delta-lake/src/kernel/snapshot/log_data.rs +++ /dev/null @@ -1,559 +0,0 @@ -// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt -// -// Copyright (2020) QP Hou and a number of other contributors. -// Portions Copyright (2025) LakeSail, Inc. -// Modified in 2025 by LakeSail, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -// [Credit]: - -use ::datafusion::arrow::array::{Array, RecordBatch, StringArray, StructArray}; -use delta_kernel::actions::{Metadata, Protocol}; -use delta_kernel::scan::scan_row_schema; -use delta_kernel::table_configuration::TableConfiguration; -use delta_kernel::table_properties::TableProperties; -use log::warn; - -use crate::kernel::snapshot::iterators::LogicalFileView; -use crate::kernel::{DeltaResult, DeltaTableError}; - -const COL_NUM_RECORDS: &str = "numRecords"; -const COL_MIN_VALUES: &str = "minValues"; -const COL_MAX_VALUES: &str = "maxValues"; -const COL_NULL_COUNT: &str = "nullCount"; - -/// Provides semanitc access to the log data. -/// -/// This is a helper struct that provides access to the log data in a more semantic way -/// to avid the necessiity of knowing the exact layout of the underlying log data. -#[derive(Clone)] -pub struct LogDataHandler<'a> { - data: &'a RecordBatch, - config: &'a TableConfiguration, -} - -impl<'a> LogDataHandler<'a> { - pub(crate) fn new(data: &'a RecordBatch, config: &'a TableConfiguration) -> Self { - Self { data, config } - } - - #[expect(dead_code)] - pub(crate) fn table_configuration(&self) -> &TableConfiguration { - self.config - } - - pub(crate) fn table_properties(&self) -> &TableProperties { - self.config.table_properties() - } - - pub(crate) fn protocol(&self) -> &Protocol { - self.config.protocol() - } - - #[expect(dead_code)] - pub(crate) fn metadata(&self) -> &Metadata { - self.config.metadata() - } - - /// The number of files in the log data. 
- pub fn num_files(&self) -> usize { - self.data.num_rows() - } - - pub fn iter(&self) -> impl Iterator { - let batch = self.data.clone(); - (0..batch.num_rows()).map(move |idx| LogicalFileView::new(batch.clone(), idx)) - } -} - -impl IntoIterator for LogDataHandler<'_> { - type Item = LogicalFileView; - type IntoIter = Box>; - - fn into_iter(self) -> Self::IntoIter { - let batch = self.data.clone(); - Box::new((0..self.data.num_rows()).map(move |idx| LogicalFileView::new(batch.clone(), idx))) - } -} - -mod datafusion { - use std::collections::HashSet; - use std::sync::{Arc, LazyLock}; - - use ::datafusion::arrow::array::{ArrayRef, BooleanArray, Int64Array, UInt64Array}; - use ::datafusion::arrow::compute::sum; - use ::datafusion::common::scalar::ScalarValue; - use ::datafusion::common::stats::{ColumnStatistics, Precision, Statistics}; - use ::datafusion::common::{Column, DataFusionError}; - use ::datafusion::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator}; - use ::datafusion::physical_optimizer::pruning::PruningStatistics; - use ::datafusion::physical_plan::Accumulator; - use arrow_schema::DataType as ArrowDataType; - use delta_kernel::expressions::Expression; - use delta_kernel::schema::{DataType, PrimitiveType}; - use delta_kernel::{EvaluationHandler, ExpressionEvaluator}; - - use super::*; - use crate::kernel::arrow::engine_ext::ExpressionEvaluatorExt as _; - use crate::kernel::ARROW_HANDLER; - - #[derive(Debug, Default, Clone)] - enum AccumulatorType { - Min, - Max, - #[default] - Unused, - } - // TODO validate this works with "wide and narrow" builds / stats - - /// Helper for processing data from the materialized Delta log. 
- struct FileStatsAccessor<'a> { - sizes: &'a Int64Array, - stats: &'a StructArray, - } - - impl<'a> FileStatsAccessor<'a> { - pub(crate) fn try_new(data: &'a RecordBatch) -> DeltaResult { - let sizes = batch_column::(data, "size")?; - let stats = batch_column::(data, "stats_parsed")?; - Ok(Self { sizes, stats }) - } - } - - impl FileStatsAccessor<'_> { - fn collect_count(&self, name: &str) -> Precision { - let num_records = struct_column_opt::(self.stats, name); - if let Some(num_records) = num_records { - if num_records.is_empty() { - Precision::Exact(0) - } else if let Some(null_count_mulls) = num_records.nulls() { - if null_count_mulls.null_count() > 0 { - Precision::Absent - } else { - sum(num_records) - .map(|s| Precision::Exact(s as usize)) - .unwrap_or(Precision::Absent) - } - } else { - sum(num_records) - .map(|s| Precision::Exact(s as usize)) - .unwrap_or(Precision::Absent) - } - } else { - Precision::Absent - } - } - - fn column_bounds( - &self, - path_step: &str, - name: &str, - fun_type: AccumulatorType, - ) -> Precision { - let mut path = name.split('.'); - let array = match nested_column(self.stats, path_step, &mut path) { - Ok(array) => array, - Err(_) => return Precision::Absent, - }; - let array_ref = array.as_ref(); - - if array_ref.data_type().is_primitive() { - let accumulator: Option> = match fun_type { - AccumulatorType::Min => MinAccumulator::try_new(array_ref.data_type()) - .map_or(None, |a| Some(Box::new(a))), - AccumulatorType::Max => MaxAccumulator::try_new(array_ref.data_type()) - .map_or(None, |a| Some(Box::new(a))), - _ => None, - }; - - if let Some(mut accumulator) = accumulator { - return accumulator - .update_batch(std::slice::from_ref(array)) - .ok() - .and_then(|_| accumulator.evaluate().ok()) - .map(Precision::Exact) - .unwrap_or(Precision::Absent); - } - - return Precision::Absent; - } - - match array_ref.data_type() { - ArrowDataType::Struct(fields) => fields - .iter() - .map(|f| { - self.column_bounds( - path_step, - 
&format!("{name}.{}", f.name()), - fun_type.clone(), - ) - }) - .map(|s| match s { - Precision::Exact(s) => Some(s), - _ => None, - }) - .collect::>>() - .map(|o| { - let arrays = match o.into_iter().map(|sv| sv.to_array()).collect::, - DataFusionError, - >>( - ) { - Ok(arrays) => arrays, - Err(_) => return Precision::Absent, - }; - let sa = StructArray::new(fields.clone(), arrays, None); - Precision::Exact(ScalarValue::Struct(Arc::new(sa))) - }) - .unwrap_or(Precision::Absent), - _ => Precision::Absent, - } - } - - fn num_records(&self) -> Precision { - self.collect_count(COL_NUM_RECORDS) - } - - fn total_size_files(&self) -> Precision { - let size = self - .sizes - .iter() - .flat_map(|s| s.map(|s| s as usize)) - .sum::(); - Precision::Inexact(size) - } - - fn column_stats(&self, name: impl AsRef) -> DeltaResult { - let null_count_col = format!("{COL_NULL_COUNT}.{}", name.as_ref()); - let null_count = self.collect_count(&null_count_col); - - let min_value = self.column_bounds(COL_MIN_VALUES, name.as_ref(), AccumulatorType::Min); - let min_value = match &min_value { - Precision::Exact(value) if value.is_null() => Precision::Absent, - // TODO this is a hack, we should not be casting here but rather when we read the checkpoint data. - // it seems sometimes the min/max values are stored as nanoseconds and sometimes as microseconds? 
- Precision::Exact(ScalarValue::TimestampNanosecond(a, b)) => Precision::Exact( - ScalarValue::TimestampMicrosecond(a.map(|v| v / 1000), b.clone()), - ), - _ => min_value, - }; - - let max_value = self.column_bounds(COL_MAX_VALUES, name.as_ref(), AccumulatorType::Max); - let max_value = match &max_value { - Precision::Exact(value) if value.is_null() => Precision::Absent, - Precision::Exact(ScalarValue::TimestampNanosecond(a, b)) => Precision::Exact( - ScalarValue::TimestampMicrosecond(a.map(|v| v / 1000), b.clone()), - ), - _ => max_value, - }; - - Ok(ColumnStatistics { - null_count, - max_value, - min_value, - sum_value: Precision::Absent, - distinct_count: Precision::Absent, - byte_size: Precision::Absent, - }) - } - } - - trait StatsExt { - fn add(&self, other: &Self) -> Self; - } - - impl StatsExt for ColumnStatistics { - fn add(&self, other: &Self) -> Self { - Self { - null_count: self.null_count.add(&other.null_count), - max_value: self.max_value.max(&other.max_value), - min_value: self.min_value.min(&other.min_value), - sum_value: Precision::Absent, - distinct_count: self.distinct_count.add(&other.distinct_count), - byte_size: self.byte_size.add(&other.byte_size), - } - } - } - - impl LogDataHandler<'_> { - fn num_records(&self) -> Precision { - FileStatsAccessor::try_new(self.data) - .map(|a| a.num_records()) - .into_iter() - .reduce(|acc, num_records| acc.add(&num_records)) - .unwrap_or(Precision::Absent) - } - - fn total_size_files(&self) -> Precision { - FileStatsAccessor::try_new(self.data) - .map(|a| a.total_size_files()) - .into_iter() - .reduce(|acc, size| acc.add(&size)) - .unwrap_or(Precision::Absent) - } - - pub(crate) fn column_stats(&self, name: impl AsRef) -> Option { - FileStatsAccessor::try_new(self.data) - .map(|a| a.column_stats(name.as_ref())) - .into_iter() - .collect::, _>>() - .ok()? 
- .iter() - .fold(None::, |acc, stats| match (acc, stats) { - (None, stats) => Some(stats.clone()), - (Some(acc), stats) => Some(acc.add(stats)), - }) - } - - pub(crate) fn statistics(&self) -> Option { - let num_rows = self.num_records(); - let total_byte_size = self.total_size_files(); - let column_statistics = self - .config - .schema() - .fields() - .map(|f| self.column_stats(f.name())) - .collect::>>()?; - Some(Statistics { - num_rows, - total_byte_size, - column_statistics, - }) - } - - fn pick_stats(&self, column: &Column, stats_field: &'static str) -> Option { - let schema = self.config.schema(); - let field = schema.field(&column.name)?; - // See issue #1214. Binary type does not support natural order which is required for Datafusion to prune - if field.data_type() == &DataType::Primitive(PrimitiveType::Binary) { - return None; - } - let expression = if self - .config - .metadata() - .partition_columns() - .contains(&column.name) - { - Expression::column(["partitionValues_parsed", &column.name]) - } else { - Expression::column(["stats_parsed", stats_field, &column.name]) - }; - // `nullCount` is always a Long/Int64 count in stats (independent of column type). 
- let output_type = match stats_field { - COL_NULL_COUNT => DataType::Primitive(PrimitiveType::Long), - _ => field.data_type().clone(), - }; - let evaluator = match ARROW_HANDLER.new_expression_evaluator( - scan_row_schema(), - Arc::new(expression), - output_type, - ) { - Ok(value) => value, - Err(err) => { - warn!( - "Failed to construct stats evaluator for column {} (field {stats_field}): {err}", - column.name() - ); - return None; - } - }; - let batch = match evaluator.evaluate_arrow(self.data.clone()) { - Ok(batch) => batch, - Err(err) => { - warn!( - "Failed to evaluate stats expression for column {} (field {stats_field}): {err}", - column.name() - ); - return None; - } - }; - batch.column_by_name("output").cloned() - } - } - - fn batch_column<'a, T: Array + 'static>( - batch: &'a RecordBatch, - name: &str, - ) -> DeltaResult<&'a T> { - batch - .column_by_name(name) - .and_then(|col| col.as_any().downcast_ref::()) - .ok_or_else(|| DeltaTableError::schema(format!("column {name} not found in log data"))) - } - - fn struct_column_opt<'a, T: Array + 'static>( - array: &'a StructArray, - name: &str, - ) -> Option<&'a T> { - array - .column_by_name(name) - .and_then(|col| col.as_any().downcast_ref::()) - } - - fn nested_column<'a>( - array: &'a StructArray, - root: &str, - path: &mut impl Iterator, - ) -> Result<&'a Arc, DeltaTableError> { - let mut current = array.column_by_name(root).ok_or_else(|| { - DeltaTableError::schema(format!("{root} column not found in stats struct")) - })?; - for segment in path { - let struct_array = current - .as_any() - .downcast_ref::() - .ok_or_else(|| { - DeltaTableError::schema(format!( - "Expected struct while accessing {segment} in stats" - )) - })?; - current = struct_array.column_by_name(segment).ok_or_else(|| { - DeltaTableError::schema(format!("{segment} column not found in stats struct")) - })?; - } - Ok(current) - } - - impl PruningStatistics for LogDataHandler<'_> { - /// return the minimum values for the named column, if 
known. - /// Note: the returned array must contain `num_containers()` rows - fn min_values(&self, column: &Column) -> Option { - self.pick_stats(column, "minValues") - } - - /// return the maximum values for the named column, if known. - /// Note: the returned array must contain `num_containers()` rows. - fn max_values(&self, column: &Column) -> Option { - self.pick_stats(column, "maxValues") - } - - /// return the number of containers (e.g. row groups) being - /// pruned with these statistics - fn num_containers(&self) -> usize { - self.data.num_rows() - } - - /// return the number of null values for the named column as an - /// `Option`. - /// - /// Note: the returned array must contain `num_containers()` rows. - fn null_counts(&self, column: &Column) -> Option { - if !self - .config - .metadata() - .partition_columns() - .contains(&column.name) - { - let counts = self.pick_stats(column, "nullCount")?; - return ::datafusion::arrow::compute::cast(counts.as_ref(), &ArrowDataType::UInt64) - .ok(); - } - let partition_values = self.pick_stats(column, "__dummy__")?; - let row_counts = self.row_counts(column)?; - let row_counts = row_counts.as_any().downcast_ref::()?; - let mut null_counts = Vec::with_capacity(partition_values.len()); - for i in 0..partition_values.len() { - let null_count = if partition_values.is_null(i) { - row_counts.value(i) - } else { - 0 - }; - null_counts.push(null_count); - } - Some(Arc::new(UInt64Array::from(null_counts))) - } - - /// return the number of rows for the named column in each container - /// as an `Option`. 
- /// - /// Note: the returned array must contain `num_containers()` rows - fn row_counts(&self, _column: &Column) -> Option { - static ROW_COUNTS_EVAL: LazyLock>> = - LazyLock::new(|| { - ARROW_HANDLER - .new_expression_evaluator( - scan_row_schema(), - Arc::new(Expression::column(["stats_parsed", "numRecords"])), - DataType::Primitive(PrimitiveType::Long), - ) - .ok() - }); - - let evaluator = ROW_COUNTS_EVAL.as_ref()?; - let batch = evaluator.evaluate_arrow(self.data.clone()).ok()?; - ::datafusion::arrow::compute::cast( - batch.column_by_name("output")?, - &ArrowDataType::UInt64, - ) - .ok() - } - - // This function is optional but will optimize partition column pruning - fn contained(&self, column: &Column, value: &HashSet) -> Option { - if value.is_empty() - || !self - .config - .metadata() - .partition_columns() - .contains(&column.name) - { - return None; - } - - // Retrieve the partition values for the column - let partition_values = self.pick_stats(column, "__dummy__")?; - - let partition_values = partition_values - .as_any() - .downcast_ref::() - .ok_or(DeltaTableError::generic( - "failed to downcast string result to StringArray.", - )) - .ok()?; - - let mut contains = Vec::with_capacity(partition_values.len()); - - // TODO: this was inspired by parquet's BloomFilter pruning, decide if we should - // just convert to Vec for a subset of column types and use .contains - fn check_scalar(pv: &str, value: &ScalarValue) -> bool { - match value { - ScalarValue::Utf8(Some(v)) - | ScalarValue::Utf8View(Some(v)) - | ScalarValue::LargeUtf8(Some(v)) => pv == v, - - ScalarValue::Dictionary(_, inner) => check_scalar(pv, inner), - // FIXME: is this a good enough default or should we sync this with - // expr_applicable_for_cols and bail out with None - _ => value.to_string() == pv, - } - } - - for i in 0..partition_values.len() { - if partition_values.is_null(i) { - // For IS NULL predicates, we want to include NULL partitions - let contains_null = 
value.iter().any(|scalar| scalar.is_null()); - contains.push(contains_null); - } else { - contains.push( - value - .iter() - .any(|scalar| check_scalar(partition_values.value(i), scalar)), - ); - } - } - - Some(BooleanArray::from(contains)) - } - } -} diff --git a/crates/sail-delta-lake/src/kernel/snapshot/materialize.rs b/crates/sail-delta-lake/src/kernel/snapshot/materialize.rs new file mode 100644 index 0000000000..737a2b66a6 --- /dev/null +++ b/crates/sail-delta-lake/src/kernel/snapshot/materialize.rs @@ -0,0 +1,301 @@ +use std::collections::HashMap; +use std::io::Cursor; +use std::sync::Arc; + +use datafusion::arrow::array::{ + new_empty_array, Array, ArrayRef, MapArray, StringArray, StructArray, +}; +use datafusion::arrow::datatypes::{Field, Fields, Schema as ArrowSchema}; +use datafusion::arrow::json::ReaderBuilder as JsonReaderBuilder; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::scalar::ScalarValue; + +use super::DeltaSnapshot; +use crate::conversion::parse_optional_partition_value; +use crate::schema::make_physical_arrow_schema; +use crate::spec::fields::{ + FIELD_NAME_PARTITION_VALUES_PARSED, FIELD_NAME_STATS, FIELD_NAME_STATS_PARSED, +}; +use crate::spec::{ + stats_schema, Add, ColumnMappingMode, DataType, DeltaError as DeltaTableError, DeltaResult, + StructType, +}; + +impl DeltaSnapshot { + pub(super) fn build_files_batch_from_adds(&self, adds: &[Add]) -> DeltaResult { + let raw = encode_snapshot_add_rows(adds)?; + parse_scan_row_columns(raw, self) + } + + pub(super) fn build_active_files_batch(&self) -> DeltaResult { + self.build_files_batch_from_adds(self.adds()) + } + + pub(super) fn build_empty_files_batch(&self) -> DeltaResult { + self.build_files_batch_from_adds(&[]) + } +} + +fn snapshot_add_fields() -> DeltaResult>> { + crate::spec::add_struct_type() + .fields() + .map(|field| { + Field::try_from(field).map(Arc::new).map_err(|e| { + DeltaTableError::generic(format!( + "snapshot add schema should convert to Arrow: 
{e}" + )) + }) + }) + .collect() +} + +fn encode_snapshot_add_rows(rows: &[Add]) -> DeltaResult { + let owned_rows = rows.to_vec(); + let fields = snapshot_add_fields()?; + serde_arrow::to_record_batch(&fields, &owned_rows).map_err(DeltaTableError::generic_err) +} + +fn build_partition_schema( + schema: &ArrowSchema, + partition_columns: &[String], +) -> DeltaResult> { + if partition_columns.is_empty() { + return Ok(None); + } + let fields = partition_columns + .iter() + .map(|col| { + schema + .field_with_name(col) + .cloned() + .map_err(|_| DeltaTableError::missing_column(col)) + }) + .collect::>>()?; + Ok(Some(ArrowSchema::new(fields))) +} + +fn build_stats_source_schema(snapshot: &DeltaSnapshot) -> DeltaResult { + let partition_columns = snapshot.metadata().partition_columns(); + let mode = snapshot.column_mapping_mode(); + let non_partition_fields: Vec = snapshot + .schema() + .fields() + .iter() + .filter(|field| !partition_columns.contains(field.name())) + .map(|field| field.as_ref().clone()) + .collect(); + let logical_non_partition = ArrowSchema::new(non_partition_fields); + Ok(make_physical_arrow_schema(&logical_non_partition, mode)) +} + +fn parse_scan_row_columns(raw: RecordBatch, snapshot: &DeltaSnapshot) -> DeltaResult { + let mut fields = raw.schema().fields().to_vec(); + let mut columns = raw.columns().to_vec(); + let mode = snapshot.column_mapping_mode(); + + if let Some((stats_idx, _)) = raw.schema_ref().column_with_name(FIELD_NAME_STATS) { + let stats_source_arrow = build_stats_source_schema(snapshot)?; + let stats_source_kernel = StructType::try_from(&stats_source_arrow)?; + let stats_schema = Arc::new(stats_schema( + &stats_source_kernel, + snapshot.table_properties(), + )?); + let arrow_stats_schema = Arc::new(ArrowSchema::try_from(stats_schema.as_ref())?); + let stats_batch = raw.project(&[stats_idx])?; + let stats_json = stats_batch + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DeltaTableError::schema("expected Utf8 stats 
column when parsing stats".to_string()) + })?; + let mut json_lines = String::new(); + for value in stats_json.iter() { + if let Some(value) = value { + json_lines.push_str(value); + } else { + json_lines.push_str("{}"); + } + json_lines.push('\n'); + } + let mut reader = JsonReaderBuilder::new(Arc::clone(&arrow_stats_schema)) + .with_batch_size(stats_batch.num_rows().max(1)) + .build(Cursor::new(json_lines)) + .map_err(DeltaTableError::generic_err)?; + let parsed_batch = match reader.next() { + Some(batch) => batch.map_err(DeltaTableError::generic_err)?, + None => RecordBatch::new_empty(arrow_stats_schema), + }; + let stats_array: Arc = Arc::new(parsed_batch.into()); + fields.push(Arc::new(Field::new( + FIELD_NAME_STATS_PARSED, + stats_array.data_type().to_owned(), + true, + ))); + columns.push(stats_array); + } + + if let Some(partition_schema_arrow) = + build_partition_schema(snapshot.schema(), snapshot.metadata().partition_columns())? + { + let partition_schema = StructType::try_from(&partition_schema_arrow)?; + let partition_array = + parse_partition_values_array(&raw, &partition_schema, "partitionValues", mode)?; + fields.push(Arc::new(Field::new( + FIELD_NAME_PARTITION_VALUES_PARSED, + partition_array.data_type().to_owned(), + false, + ))); + columns.push(Arc::new(partition_array)); + } + + Ok(RecordBatch::try_new( + Arc::new(ArrowSchema::new(fields)), + columns, + )?) 
+} + +fn parse_partition_values_array( + batch: &RecordBatch, + partition_schema: &StructType, + path: &str, + column_mapping_mode: ColumnMappingMode, +) -> DeltaResult { + let partitions = map_array_from_path(batch, path)?; + let num_rows = partitions.len(); + + let mut raw_collected: HashMap>> = partition_schema + .fields() + .map(|field| { + ( + field.physical_name(column_mapping_mode).to_string(), + Vec::with_capacity(num_rows), + ) + }) + .collect(); + + for row in 0..num_rows { + if partitions.is_null(row) { + return Err(DeltaTableError::generic( + "Expected partition values map, found null entry.", + )); + } + let raw_values = collect_partition_row(&partitions.value(row))?; + + for field in partition_schema.fields() { + if !matches!(field.data_type(), DataType::Primitive(_)) { + return Err(DeltaTableError::generic( + "nested partitioning values are not supported", + )); + } + let physical_name = field.physical_name(column_mapping_mode); + let value = raw_values + .get(physical_name) + .or_else(|| raw_values.get(field.name())) + .and_then(Clone::clone); + raw_collected + .get_mut(physical_name) + .ok_or_else(|| DeltaTableError::schema("partition field missing".to_string()))? 
+ .push(value); + } + } + + let arrow_fields: Fields = Fields::from( + partition_schema + .fields() + .map(Field::try_from) + .collect::, _>>()?, + ); + + let columns: Vec = partition_schema + .fields() + .zip(arrow_fields.iter()) + .map(|(field, arrow_field)| { + let physical_name = field.physical_name(column_mapping_mode); + let raw_values = raw_collected + .get(physical_name) + .ok_or_else(|| DeltaTableError::schema("partition field missing".to_string()))?; + let arrow_dt = arrow_field.data_type(); + let scalar_values: Vec = raw_values + .iter() + .map(|value| { + parse_optional_partition_value(value.as_deref(), arrow_dt).map_err(|e| { + DeltaTableError::generic(format!("partition value parse error: {e}")) + }) + }) + .collect::>>()?; + let array = if scalar_values.is_empty() { + new_empty_array(arrow_dt) + } else { + ScalarValue::iter_to_array(scalar_values) + .map_err(|e| DeltaTableError::generic(format!("scalar to array error: {e}")))? + }; + let array = if array.data_type() != arrow_dt { + datafusion::arrow::compute::cast(&array, arrow_dt) + .map_err(|e| DeltaTableError::generic(format!("cast error: {e}")))? + } else { + array + }; + Ok(Arc::new(array) as ArrayRef) + }) + .collect::>>()?; + + Ok(StructArray::try_new(arrow_fields, columns, None)?) 
+} + +fn map_array_from_path<'a>(batch: &'a RecordBatch, path: &str) -> DeltaResult<&'a MapArray> { + let mut segments = path.split('.'); + let first = segments + .next() + .ok_or_else(|| DeltaTableError::generic("partition column path must not be empty"))?; + + let mut current: &dyn Array = batch + .column_by_name(first) + .map(|column| column.as_ref()) + .ok_or_else(|| { + DeltaTableError::schema(format!("{first} column not found when parsing partitions")) + })?; + + for segment in segments { + let struct_array = current + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DeltaTableError::schema(format!("Expected struct column while traversing {path}")) + })?; + current = struct_array + .column_by_name(segment) + .map(|column| column.as_ref()) + .ok_or_else(|| { + DeltaTableError::schema(format!( + "{segment} column not found while traversing {path}" + )) + })?; + } + + current + .as_any() + .downcast_ref::() + .ok_or_else(|| DeltaTableError::schema(format!("Column {path} is not a map"))) +} + +fn collect_partition_row(value: &StructArray) -> DeltaResult>> { + let keys = value + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| DeltaTableError::schema("map key column is not Utf8".to_string()))?; + let values = value + .column(1) + .as_any() + .downcast_ref::() + .ok_or_else(|| DeltaTableError::schema("map value column is not Utf8".to_string()))?; + + let mut result = HashMap::with_capacity(keys.len()); + for (key, value) in keys.iter().zip(values.iter()) { + if let Some(key) = key { + result.insert(key.to_string(), value.map(|entry| entry.to_string())); + } + } + Ok(result) +} diff --git a/crates/sail-delta-lake/src/kernel/snapshot/mod.rs b/crates/sail-delta-lake/src/kernel/snapshot/mod.rs index 2009ff2c15..60a430b550 100644 --- a/crates/sail-delta-lake/src/kernel/snapshot/mod.rs +++ b/crates/sail-delta-lake/src/kernel/snapshot/mod.rs @@ -16,843 +16,612 @@ // See the License for the specific language governing permissions and // limitations under the 
License. -// [Credit]: +use std::collections::{BTreeMap, HashMap}; +use std::sync::Arc; -use std::collections::HashMap; -use std::io::{BufRead, BufReader, Cursor}; -use std::str::FromStr; -use std::sync::{Arc, LazyLock}; - -use datafusion::arrow::array::{ - Array, BooleanArray, Int32Array, Int64Array, MapArray, RecordBatch, StringArray, StructArray, +use chrono::Utc; +use datafusion::arrow::array::{ArrayRef, StructArray}; +use datafusion::arrow::datatypes::{ + Field, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, }; -use datafusion::arrow::compute::concat_batches; -use delta_kernel::actions::{Remove as KernelRemove, Sidecar}; -use delta_kernel::engine::arrow_conversion::TryIntoArrow; -use delta_kernel::engine::arrow_data::ArrowEngineData; -use delta_kernel::path::{LogPathFileType, ParsedLogPath}; -use delta_kernel::scan::scan_row_schema; -use delta_kernel::schema::derive_macro_utils::ToDataType; -use delta_kernel::schema::{SchemaRef, StructField}; -use delta_kernel::snapshot::Snapshot as KernelSnapshot; -use delta_kernel::table_configuration::TableConfiguration; -use delta_kernel::table_properties::TableProperties; -use delta_kernel::{PredicateRef, Version}; +use datafusion::arrow::record_batch::RecordBatch; use futures::stream::BoxStream; use futures::{StreamExt, TryStreamExt}; -use itertools::Itertools; -use object_store::path::Path; -use object_store::ObjectStore; -use percent_encoding::percent_decode_str; -use sail_common_datafusion::array::record_batch::cast_record_batch_relaxed_tz; -use tokio::task::spawn_blocking; +use log::debug; +use once_cell::sync::OnceCell; use url::Url; -use crate::kernel::arrow::engine_ext::{ScanExt, SnapshotExt}; -use crate::kernel::models::{ - Action, CommitInfo, DeletionVectorDescriptor, Metadata, Protocol, Remove, StorageType, - StructType, +use crate::kernel::checkpoints::{ + latest_replayable_version, load_replayed_table_header, load_replayed_table_state, + ReplayedTableState, +}; +use 
crate::kernel::log_segment::ReplayedTableHeader; +pub use crate::kernel::snapshot::stats::SnapshotPruningStats; +use crate::kernel::{DeltaTableConfig, SchemaRef}; +use crate::schema::{arrow_field_physical_name, arrow_schema_reorder_partitions}; +use crate::spec::fields::{ + FIELD_NAME_MODIFICATION_TIME, FIELD_NAME_PARTITION_VALUES_PARSED, FIELD_NAME_PATH, + FIELD_NAME_SIZE, FIELD_NAME_STATS_PARSED, STATS_FIELD_MAX_VALUES, STATS_FIELD_MIN_VALUES, + STATS_FIELD_NULL_COUNT, STATS_FIELD_NUM_RECORDS, +}; +use crate::spec::{ + Add, ColumnMappingMode, ColumnMetadataKey, DeltaError as DeltaTableError, DeltaResult, + Metadata, Protocol, Remove, TableFeature, TableProperties, Transaction, VersionChecksum, }; -use crate::kernel::snapshot::iterators::LogicalFileView; -pub use crate::kernel::snapshot::log_data::LogDataHandler; -use crate::kernel::snapshot::stream::{RecordBatchReceiverStreamBuilder, SendableRBStream}; -use crate::kernel::{DeltaResult, DeltaTableConfig, DeltaTableError}; use crate::storage::LogStore; -pub mod iterators; -pub mod log_data; -pub(crate) mod stream; - -pub(crate) static SCAN_ROW_ARROW_SCHEMA: LazyLock = LazyLock::new(|| { - Arc::new( - scan_row_schema() - .as_ref() - .try_into_arrow() - .unwrap_or_else(|_| { - // Fallback to an empty schema if conversion fails - arrow_schema::Schema::empty() - }), - ) -}); - -/// A snapshot of a Delta table -#[derive(Debug, Clone, PartialEq)] -pub struct Snapshot { - /// Log segment containing all log files in the snapshot - pub(crate) inner: Arc, - /// Configuration for the current session - config: DeltaTableConfig, - /// Logical table schema - schema: SchemaRef, - /// Fully qualified URL of the table +mod materialize; +mod stats; + +pub struct DeltaSnapshot { + version: i64, table_url: Url, + config: DeltaTableConfig, + protocol: Protocol, + metadata: Metadata, + table_properties: TableProperties, + arrow_schema: SchemaRef, + adds: Arc>, + removes: Arc>, + app_txns: Arc>, + commit_timestamps: Arc>, + files_batch: 
OnceCell, } -impl Snapshot { - /// Create a new [`Snapshot`] instance - pub async fn try_new( +impl std::fmt::Debug for DeltaSnapshot { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DeltaSnapshot") + .field("version", &self.version) + .field("table_url", &self.table_url) + .field("require_files", &self.config.require_files) + .field("adds_len", &self.adds.len()) + .field("removes_len", &self.removes.len()) + .finish() + } +} + +impl Clone for DeltaSnapshot { + fn clone(&self) -> Self { + let files_batch = OnceCell::new(); + if let Some(batch) = self.files_batch.get() { + let _ = files_batch.set(batch.clone()); + } + Self { + version: self.version, + table_url: self.table_url.clone(), + config: self.config.clone(), + protocol: self.protocol.clone(), + metadata: self.metadata.clone(), + table_properties: self.table_properties.clone(), + arrow_schema: Arc::clone(&self.arrow_schema), + adds: Arc::clone(&self.adds), + removes: Arc::clone(&self.removes), + app_txns: Arc::clone(&self.app_txns), + commit_timestamps: Arc::clone(&self.commit_timestamps), + files_batch, + } + } +} + +impl DeltaSnapshot { + pub(crate) async fn try_new( log_store: &dyn LogStore, config: DeltaTableConfig, version: Option, + replay_hint: Option<&ReplayedTableHeader>, ) -> DeltaResult { - // TODO: bundle operation_id with logstore ... - let engine = log_store.engine(None); - let mut table_root = log_store.config().location.clone(); - let version = version.map(|v| v as u64); - - // NB: kernel engine uses Url::join to construct paths, - // if the path does not end with a slash, the would override the entire path. - // So we need to be extra sure its ends with a slash. 
- if !table_root.path().ends_with('/') { - table_root.set_path(&format!("{}/", table_root.path())); - } - let snapshot = match spawn_blocking(move || { - let mut builder = KernelSnapshot::builder_for(table_root); - if let Some(v) = version { - builder = builder.at_version(v); - } - builder.build(engine.as_ref()) - }) - .await - .map_err(|e| DeltaTableError::generic(e.to_string()))? - { - Ok(snapshot) => snapshot, - Err(e) => { - // TODO: we should have more handling-friendly errors upstream in kernel. - if e.to_string().contains("No files in log segment") { - return Err(DeltaTableError::invalid_table_location(e.to_string())); - } else { - return Err(e.into()); + let target_version = match version { + Some(v) => v, + None => match latest_replayable_version(log_store).await { + Ok(v) => v, + Err(crate::spec::DeltaError::MissingVersion) => { + return Err(DeltaTableError::invalid_table_location( + "No commit files found in _delta_log", + )) } - } + Err(err) => return Err(err), + }, }; - let schema = snapshot.table_configuration().schema(); + if !config.require_files { + match load_replayed_table_header(target_version, log_store, replay_hint).await { + Ok(Some(replayed)) => { + return Self::from_replayed_header(log_store, config, replayed) + } + Ok(None) => {} + Err(err) => { + debug!( + "Failed to load table header fast-path for version {}: {}; falling back to full replay", + target_version, err + ); + } + } + } + + let replayed = load_replayed_table_state(target_version, log_store).await?; + Self::from_replayed_state(log_store, config, replayed) + } + + fn from_replayed_state( + log_store: &dyn LogStore, + config: DeltaTableConfig, + replayed: ReplayedTableState, + ) -> DeltaResult { + Self::from_replayed_parts( + log_store, + config, + replayed.version, + replayed.protocol, + replayed.metadata, + replayed.adds, + replayed.removes, + replayed.txns, + replayed.commit_timestamps, + ) + } + + fn from_replayed_header( + log_store: &dyn LogStore, + config: 
DeltaTableConfig, + replayed: ReplayedTableHeader, + ) -> DeltaResult { + let arrow_schema = Arc::new(replayed.metadata.parse_schema_arrow()?); + let table_properties = TableProperties::from(replayed.metadata.configuration().iter()); Ok(Self { - inner: snapshot, + version: replayed.version, + table_url: log_store.config().location.clone(), config, - schema, + protocol: replayed.protocol, + metadata: replayed.metadata, + table_properties, + arrow_schema, + adds: Arc::new(Vec::new()), + removes: Arc::new(Vec::new()), + app_txns: replayed.txns, + commit_timestamps: replayed.commit_timestamps, + files_batch: OnceCell::new(), + }) + } + + #[expect(clippy::too_many_arguments)] + fn from_replayed_parts( + log_store: &dyn LogStore, + config: DeltaTableConfig, + version: i64, + protocol: Protocol, + metadata: Metadata, + adds: Vec, + removes: Vec, + txns: HashMap, + commit_timestamps: BTreeMap, + ) -> DeltaResult { + let arrow_schema = Arc::new(metadata.parse_schema_arrow()?); + let table_properties = TableProperties::from(metadata.configuration().iter()); + + Ok(Self { + version, table_url: log_store.config().location.clone(), + config, + protocol, + metadata, + table_properties, + arrow_schema, + adds: Arc::new(adds), + removes: Arc::new(removes), + app_txns: Arc::new(txns), + commit_timestamps: Arc::new(commit_timestamps), + files_batch: OnceCell::new(), }) } - /// Update the snapshot to the given version + fn replay_hint(&self) -> ReplayedTableHeader { + ReplayedTableHeader { + version: self.version, + protocol: self.protocol.clone(), + metadata: self.metadata.clone(), + txns: Arc::clone(&self.app_txns), + commit_timestamps: Arc::clone(&self.commit_timestamps), + } + } + pub async fn update( &mut self, log_store: &dyn LogStore, target_version: Option, ) -> DeltaResult<()> { - if let Some(version) = target_version { - if version == self.version() as u64 { - return Ok(()); - } - if version < self.version() as u64 { - return Err(DeltaTableError::generic("Cannot downgrade 
snapshot")); - } - } - - // TODO: bundle operation id with log store ... - let engine = log_store.engine(None); - let current = self.inner.clone(); - let snapshot = spawn_blocking(move || { - let mut builder = KernelSnapshot::builder_from(current); - if let Some(v) = target_version { - builder = builder.at_version(v); - } - builder.build(engine.as_ref()) - }) - .await - .map_err(|e| DeltaTableError::generic(e.to_string()))??; + let target_version = match target_version { + Some(v) => i64::try_from(v) + .map_err(|_| DeltaTableError::generic("target version overflows i64"))?, + None => log_store.get_latest_version(self.version()).await?, + }; - self.inner = snapshot; - self.schema = self.inner.table_configuration().schema(); + if target_version == self.version() { + return Ok(()); + } + if target_version < self.version() { + return Err(DeltaTableError::generic("Cannot downgrade snapshot")); + } + let replay_hint = (!self.config.require_files).then(|| self.replay_hint()); + *self = Self::try_new( + log_store, + self.config.clone(), + Some(target_version), + replay_hint.as_ref(), + ) + .await?; Ok(()) } - /// Get the table version of the snapshot pub fn version(&self) -> i64 { - self.inner.version() as i64 + self.version + } + + pub fn version_timestamp(&self, version: i64) -> Option { + self.commit_timestamps.get(&version).copied() + } + + pub fn in_commit_timestamps_enabled(&self) -> bool { + self.protocol() + .is_in_commit_timestamps_enabled(self.table_properties()) + } + + pub fn in_commit_timestamp_enablement(&self) -> Option<(i64, i64)> { + self.in_commit_timestamps_enabled() + .then(|| self.table_properties().in_commit_timestamp_enablement()) + .flatten() + } + + pub fn table_url(&self) -> &Url { + &self.table_url } - /// Get the table schema of the snapshot - pub fn schema(&self) -> &StructType { - self.schema.as_ref() + pub fn schema(&self) -> &ArrowSchema { + self.arrow_schema.as_ref() } - /// Get the table metadata of the snapshot pub fn metadata(&self) -> 
&Metadata { - self.inner.table_configuration().metadata() + &self.metadata } - /// Get the table protocol of the snapshot pub fn protocol(&self) -> &Protocol { - self.inner.table_configuration().protocol() + &self.protocol } - /// Get the table config which is loaded with of the snapshot pub fn load_config(&self) -> &DeltaTableConfig { &self.config } - /// Get the table root of the snapshot - pub(crate) fn table_root_path(&self) -> DeltaResult { - Ok(Path::from_url_path(self.table_url.path())?) - } - - /// Well known properties of the table pub fn table_properties(&self) -> &TableProperties { - self.inner.table_properties() - } - - pub fn table_configuration(&self) -> &TableConfiguration { - self.inner.table_configuration() - } - - /// Get the active files for the current snapshot. - /// - /// This method returns a stream of record batches where each row - /// represents an active file for the current snapshot. - /// - /// The files can be filtered using the provided predicate. This is a - /// best effort to skip files that are excluded by the predicate. Individual - /// files may still contain data that is not relevant to the predicate. - /// - /// ## Arguments - /// - /// * `log_store` - The log store to use for reading the snapshot. - /// * `predicate` - An optional predicate to filter the files. - /// - /// ## Returns - /// - /// A stream of active files for the current snapshot. - pub fn files( - &self, - log_store: &dyn LogStore, - predicate: Option, - ) -> SendableRBStream { - let scan = match self - .inner - .clone() - .scan_builder() - .with_predicate(predicate) - .build() - { - Ok(scan) => scan, - Err(err) => return Box::pin(futures::stream::once(async { Err(err.into()) })), - }; + &self.table_properties + } - // TODO: which capacity to choose? - let mut builder = RecordBatchReceiverStreamBuilder::new(100); - let tx = builder.tx(); - // TODO: bundle operation id with log store ... 
- let engine = log_store.engine(None); - let inner = self.inner.clone(); - - builder.spawn_blocking(move || { - let scan_iter = scan.scan_metadata_arrow(engine.as_ref())?; - for res in scan_iter { - let batch = res?.scan_files; - // Be tolerant of malformed or empty stats JSON - let batch = match inner.parse_stats_column(&batch) { - Ok(parsed) => parsed, - Err(_) => batch, - }; - if tx.blocking_send(Ok(batch)).is_err() { - break; - } - } - Ok(()) - }); + pub fn config(&self) -> &TableProperties { + self.table_properties() + } - builder.build() - } - - /// Get the commit infos in the snapshot - /// - /// ## Parameters - /// - /// * `log_store`: The log store to use. - /// * `limit`: The maximum number of commit infos to return (optional). - /// - /// ## Returns - /// - /// A stream of commit infos. - // TODO: move outer error into stream. - #[expect(dead_code)] - pub(crate) async fn commit_infos( - &self, - log_store: &dyn LogStore, - limit: Option, - ) -> DeltaResult>>> { - let store = log_store.root_object_store(None); - - let log_root = self.table_root_path()?.child("_delta_log"); - let start_from = log_root.child( - format!( - "{:020}", - limit - .map(|l| (self.version() - l as i64 + 1).max(0)) - .unwrap_or(0) - ) - .as_str(), - ); - - let dummy_url = url::Url::parse("memory:///") - .map_err(|e| DeltaTableError::generic(format!("Failed to parse dummy URL: {}", e)))?; - let mut commit_files = Vec::new(); - for meta in store - .list_with_offset(Some(&log_root), &start_from) - .try_collect::>() - .await? - { - // safety: object store path are always valid urls paths. - let dummy_path = dummy_url - .join(meta.location.as_ref()) - .map_err(|e| DeltaTableError::generic(format!("Failed to join URL path: {}", e)))?; - if let Some(parsed_path) = ParsedLogPath::try_from(dummy_path)? 
{ - if matches!(parsed_path.file_type, LogPathFileType::Commit) { - commit_files.push(meta); - } + pub fn column_mapping_mode(&self) -> ColumnMappingMode { + self.table_properties + .column_mapping_mode + .unwrap_or(ColumnMappingMode::None) + } + + pub fn effective_column_mapping_mode(&self) -> ColumnMappingMode { + let explicit = self.column_mapping_mode(); + if matches!(explicit, ColumnMappingMode::None) { + let has_annotations = self.schema().fields().iter().any(|field| { + field + .metadata() + .contains_key(ColumnMetadataKey::ColumnMappingPhysicalName.as_ref()) + && field + .metadata() + .contains_key(ColumnMetadataKey::ColumnMappingId.as_ref()) + }); + if has_annotations { + return ColumnMappingMode::Name; } } - commit_files.sort_unstable_by(|a, b| b.location.cmp(&a.location)); - Ok(futures::stream::iter(commit_files) - .map(move |meta| { - let store = store.clone(); - async move { - let commit_log_bytes = store.get(&meta.location).await?.bytes().await?; - let reader = BufReader::new(Cursor::new(commit_log_bytes)); - for line in reader.lines() { - let action: Action = serde_json::from_str(line?.as_str())?; - if let Action::CommitInfo(commit_info) = action { - return Ok::<_, DeltaTableError>(Some(commit_info)); - } - } - Ok(None) - } - }) - .buffered(self.config.log_buffer_size) - .boxed()) + explicit } - pub(crate) fn tombstones( - &self, - log_store: &dyn LogStore, - ) -> BoxStream<'_, DeltaResult> { - static TOMBSTONE_SCHEMA: LazyLock> = LazyLock::new(|| { - Arc::new( - StructType::try_new(vec![ - StructField::nullable("remove", KernelRemove::to_data_type()), - StructField::nullable("sidecar", Sidecar::to_data_type()), - ]) - .unwrap_or_else(|_| empty_struct_type()), - ) - }); - - // TODO: which capacity to choose? - let mut builder = RecordBatchReceiverStreamBuilder::new(100); - let tx = builder.tx(); - - // TODO: bundle operation id with log store ... 
- let engine = log_store.engine(None); - - let remove_data = match self.inner.log_segment().read_actions( - engine.as_ref(), - TOMBSTONE_SCHEMA.clone(), - None, - ) { - Ok(data) => data, - Err(err) => return Box::pin(futures::stream::once(async { Err(err.into()) })), - }; - - builder.spawn_blocking(move || { - for res in remove_data { - let batch: RecordBatch = - ArrowEngineData::try_from_engine_data(res?.actions)?.into(); - if tx.blocking_send(Ok(batch)).is_err() { - break; - } - } - Ok(()) - }); - - builder - .build() - .map(|maybe_batch| maybe_batch.and_then(|batch| read_removes(&batch))) - .map_ok(|removes| { - futures::stream::iter(removes.into_iter().map(Ok::<_, DeltaTableError>)) - }) - .try_flatten() - .boxed() + pub fn logical_arrow_schema(&self) -> DeltaResult { + self.input_schema() } - /// Fetch the latest version of the provided application_id for this snapshot. - /// - /// Filters the txn based on the SetTransactionRetentionDuration property and lastUpdated - async fn application_transaction_version( - &self, - log_store: &dyn LogStore, - app_id: String, - ) -> DeltaResult> { - // TODO: bundle operation id with log store ... 
- let engine = log_store.engine(None); - let inner = self.inner.clone(); - let version = spawn_blocking(move || inner.get_app_id_version(&app_id, engine.as_ref())) - .await - .map_err(DeltaTableError::generic_err)??; - Ok(version) + pub fn arrow_schema(&self) -> DeltaResult { + arrow_schema_reorder_partitions(self.schema(), self.metadata().partition_columns(), true) } -} - -fn empty_struct_type() -> StructType { - StructType::try_new(Vec::::new()) - .unwrap_or_else(|_| unreachable!("empty struct type is always valid")) -} - -fn read_removes(batch: &RecordBatch) -> DeltaResult> { - let Some(remove_col) = batch - .column_by_name("remove") - .and_then(|col| col.as_any().downcast_ref::()) - else { - return Ok(vec![]); - }; - - if remove_col.null_count() == remove_col.len() { - return Ok(vec![]); - } - - let path_col = required_string_field(remove_col, "path")?; - let data_change_col = required_bool_field(remove_col, "dataChange")?; - let deletion_ts_col = required_i64_field(remove_col, "deletionTimestamp")?; - let extended_file_metadata_col = optional_bool_field(remove_col, "extendedFileMetadata"); - let partition_values_col = optional_map_field(remove_col, "partitionValues"); - let size_col = optional_i64_field(remove_col, "size"); - let tags_col = optional_map_field(remove_col, "tags"); - let dv_struct = optional_struct_field(remove_col, "deletionVector"); - - let dv_storage_type = dv_struct - .and_then(|c| c.column_by_name("storageType")) - .and_then(|col| col.as_any().downcast_ref::()); - let dv_path = dv_struct - .and_then(|c| c.column_by_name("pathOrInlineDv")) - .and_then(|col| col.as_any().downcast_ref::()); - let dv_offset = dv_struct - .and_then(|c| c.column_by_name("offset")) - .and_then(|col| col.as_any().downcast_ref::()); - let dv_size_in_bytes = dv_struct - .and_then(|c| c.column_by_name("sizeInBytes")) - .and_then(|col| col.as_any().downcast_ref::()); - let dv_cardinality = dv_struct - .and_then(|c| c.column_by_name("cardinality")) - .and_then(|col| 
col.as_any().downcast_ref::()); - - let mut removes = Vec::with_capacity(remove_col.len()); - for idx in 0..remove_col.len() { - if !remove_col.is_valid(idx) { - continue; - } - - let raw_path = read_str(path_col, idx)?; - let path = percent_decode_str(raw_path) - .decode_utf8() - .map_err(|_| DeltaTableError::generic("illegal path encoding"))? - .to_string(); - - let deletion_vector = if let ( - Some(struct_array), - Some(storage), - Some(path), - Some(size), - Some(cardinality), - ) = ( - dv_struct, - dv_storage_type, - dv_path, - dv_size_in_bytes, - dv_cardinality, - ) { - if struct_array.is_valid(idx) { - let storage_type = - StorageType::from_str(read_str(storage, idx)?).map_err(|_| { - DeltaTableError::generic("failed to parse deletion vector storage type") - })?; - let path_or_inline_dv = read_str(path, idx)?.to_string(); - let size_in_bytes = read_i32(size, idx)?; - let cardinality = read_i64(cardinality, idx)?; - let offset = dv_offset.and_then(|arr| read_i32_opt(arr, idx)); - Some(DeletionVectorDescriptor { - storage_type, - path_or_inline_dv, - size_in_bytes, - cardinality, - offset, - }) - } else { - None - } - } else { - None - }; - let partition_values = map_to_hash_map(partition_values_col, idx)?; - let tags = map_to_hash_map(tags_col, idx)?; - - removes.push(Remove { - path, - data_change: read_bool(data_change_col, idx)?, - deletion_timestamp: read_i64_opt(deletion_ts_col, idx), - extended_file_metadata: extended_file_metadata_col - .and_then(|col| read_bool_opt(col, idx)), - partition_values, - size: size_col.and_then(|col| read_i64_opt(col, idx)), - tags, - deletion_vector, - base_row_id: None, - default_row_commit_version: None, - }); + pub fn input_schema(&self) -> DeltaResult { + arrow_schema_reorder_partitions(self.schema(), self.metadata().partition_columns(), false) } - Ok(removes) -} - -fn required_string_field<'a>(array: &'a StructArray, name: &str) -> DeltaResult<&'a StringArray> { - array - .column_by_name(name) - .and_then(|col| 
col.as_any().downcast_ref::()) - .ok_or_else(|| DeltaTableError::schema(format!("{name} column not found on remove struct"))) -} - -fn required_bool_field<'a>(array: &'a StructArray, name: &str) -> DeltaResult<&'a BooleanArray> { - array - .column_by_name(name) - .and_then(|col| col.as_any().downcast_ref::()) - .ok_or_else(|| DeltaTableError::schema(format!("{name} column not found on remove struct"))) -} - -fn required_i64_field<'a>(array: &'a StructArray, name: &str) -> DeltaResult<&'a Int64Array> { - array - .column_by_name(name) - .and_then(|col| col.as_any().downcast_ref::()) - .ok_or_else(|| DeltaTableError::schema(format!("{name} column not found on remove struct"))) -} - -fn optional_bool_field<'a>(array: &'a StructArray, name: &str) -> Option<&'a BooleanArray> { - array - .column_by_name(name) - .and_then(|col| col.as_any().downcast_ref::()) -} - -fn optional_map_field<'a>(array: &'a StructArray, name: &str) -> Option<&'a MapArray> { - array - .column_by_name(name) - .and_then(|col| col.as_any().downcast_ref::()) -} - -fn optional_i64_field<'a>(array: &'a StructArray, name: &str) -> Option<&'a Int64Array> { - array - .column_by_name(name) - .and_then(|col| col.as_any().downcast_ref::()) -} - -fn optional_struct_field<'a>(array: &'a StructArray, name: &str) -> Option<&'a StructArray> { - array - .column_by_name(name) - .and_then(|col| col.as_any().downcast_ref::()) -} - -fn read_str(array: &StringArray, idx: usize) -> DeltaResult<&str> { - array - .is_valid(idx) - .then(|| array.value(idx)) - .ok_or_else(|| DeltaTableError::generic("missing string value")) -} - -fn read_bool(array: &BooleanArray, idx: usize) -> DeltaResult { - array - .is_valid(idx) - .then(|| array.value(idx)) - .ok_or_else(|| DeltaTableError::generic("missing boolean value")) -} - -fn read_bool_opt(array: &BooleanArray, idx: usize) -> Option { - array.is_valid(idx).then(|| array.value(idx)) -} - -fn read_i64(array: &Int64Array, idx: usize) -> DeltaResult { - array - .is_valid(idx) - 
.then(|| array.value(idx)) - .ok_or_else(|| DeltaTableError::generic("missing i64 value")) -} - -fn read_i64_opt(array: &Int64Array, idx: usize) -> Option { - array.is_valid(idx).then(|| array.value(idx)) -} + pub fn adds(&self) -> &[Add] { + self.adds.as_ref() + } -fn read_i32(array: &Int32Array, idx: usize) -> DeltaResult { - array - .is_valid(idx) - .then(|| array.value(idx)) - .ok_or_else(|| DeltaTableError::generic("missing i32 value")) -} + pub fn removes(&self) -> &[Remove] { + self.removes.as_ref() + } -fn read_i32_opt(array: &Int32Array, idx: usize) -> Option { - array.is_valid(idx).then(|| array.value(idx)) -} + fn has_unknown_table_features(&self) -> bool { + self.protocol() + .reader_features() + .into_iter() + .flatten() + .chain(self.protocol().writer_features().into_iter().flatten()) + .any(|feature| matches!(feature, TableFeature::Unknown)) + } -fn map_to_hash_map( - map: Option<&MapArray>, - idx: usize, -) -> DeltaResult>>> { - match map { - Some(array) if array.is_valid(idx) => { - let entries = collect_map(&array.value(idx))?; - Ok(Some(entries.into_iter().collect::>())) - } - _ => Ok(None), + fn has_deletion_vectors(&self) -> bool { + self.adds().iter().any(|add| add.deletion_vector.is_some()) + || self + .removes() + .iter() + .any(|remove| remove.deletion_vector.is_some()) } -} -fn collect_map(val: &StructArray) -> DeltaResult)>> { - let keys = val - .column(0) - .as_any() - .downcast_ref::() - .ok_or_else(|| DeltaTableError::schema("map key column is not Utf8".to_string()))?; - let values = val - .column(1) - .as_any() - .downcast_ref::() - .ok_or_else(|| DeltaTableError::schema("map value column is not Utf8".to_string()))?; - - let mut entries = Vec::with_capacity(keys.len()); - for (key, value) in keys.iter().zip(values.iter()) { - if let Some(k) = key { - entries.push((k.to_string(), value.map(|v| v.to_string()))); + pub fn build_version_checksum( + &self, + txn_id: Option, + in_commit_timestamp_opt: Option, + ) -> DeltaResult> { + // 
TODO: Remove these coarse skips once replay retains the latest + // DomainMetadata actions and reconciles files with deletion-vector identity. + if self.has_unknown_table_features() { + debug!( + "Skipping version checksum for version {} because the protocol includes unsupported features", + self.version() + ); + return Ok(None); + } + if self.has_deletion_vectors() { + debug!( + "Skipping version checksum for version {} because the snapshot includes deletion vectors", + self.version() + ); + return Ok(None); } - } - Ok(entries) -} -/// A snapshot of a Delta table that has been eagerly loaded into memory. -#[derive(Debug, Clone, PartialEq)] -pub struct EagerSnapshot { - snapshot: Snapshot, - // logical files in the snapshot - pub(crate) files: RecordBatch, -} + let mut num_files: i64 = 0; + let mut table_size_bytes: i64 = 0; -impl EagerSnapshot { - /// Create a new [`EagerSnapshot`] instance - pub async fn try_new( - log_store: &dyn LogStore, - config: DeltaTableConfig, - version: Option, - ) -> DeltaResult { - let snapshot = Snapshot::try_new(log_store, config.clone(), version).await?; + for add in self.adds() { + num_files = num_files + .checked_add(1) + .ok_or_else(|| DeltaTableError::generic("Version checksum file count overflow"))?; + table_size_bytes = table_size_bytes + .checked_add(add.size) + .ok_or_else(|| DeltaTableError::generic("Version checksum table size overflow"))?; + } - let files = match config.require_files { - true => snapshot.files(log_store, None).try_collect().await?, - false => vec![], - }; + let mut set_transactions = self.app_txns.values().cloned().collect::>(); + set_transactions.sort_by(|left, right| { + left.app_id + .cmp(&right.app_id) + .then(left.version.cmp(&right.version)) + }); - let scan_row_schema = snapshot.inner.scan_row_parsed_schema_arrow()?; - let files = files - .into_iter() - .map(|batch| { - if batch.schema().as_ref() == scan_row_schema.as_ref() { - Ok(batch) - } else { - // Align row batches to the canonical scan 
schema before concatenation. - cast_record_batch_relaxed_tz(&batch, &scan_row_schema) - .map_err(|e| DeltaTableError::generic(e.to_string())) - } + Ok(Some(VersionChecksum { + txn_id, + table_size_bytes, + num_files, + num_metadata: 1, + num_protocol: 1, + in_commit_timestamp_opt, + set_transactions: (!set_transactions.is_empty()).then_some(set_transactions), + // TODO(protocol-hardening): Populate from reconciled snapshot state once replay keeps + // the latest DomainMetadata actions alongside metadata/protocol/txns. + domain_metadata: None, + metadata: self.metadata.clone(), + protocol: self.protocol.clone(), + // TODO(protocol-hardening): Populate optional protocol fields when we can do so + // deterministically without synthesizing partial state. + file_size_histogram: None, + all_files: None, + })) + } + + pub fn physical_partition_columns(&self) -> Vec<(String, String)> { + let mode = self.effective_column_mapping_mode(); + self.metadata() + .partition_columns() + .iter() + .map(|logical| { + let physical = self + .schema() + .field_with_name(logical) + .map(|field| arrow_field_physical_name(field, mode).to_string()) + .unwrap_or_else(|_| logical.clone()); + (logical.clone(), physical) }) - .collect::>>()?; - let files = concat_batches(&scan_row_schema, &files)?; - - Ok(Self { snapshot, files }) + .collect() } - /// Update the snapshot to the given version - pub(crate) async fn update( - &mut self, - log_store: &dyn LogStore, - target_version: Option, - ) -> DeltaResult<()> { - let current_version = self.version() as u64; - if Some(current_version) == target_version { - return Ok(()); - } - - self.snapshot.update(log_store, target_version).await?; - - let scan = self.snapshot.inner.clone().scan_builder().build()?; - let engine = log_store.engine(None); - let current_files = self.files.clone(); - let files: Vec<_> = spawn_blocking(move || { - scan.scan_metadata_from_arrow( - engine.as_ref(), - current_version, - Box::new(std::iter::once(current_files)), - 
None, - )? - .map_ok(|s| s.scan_files) - .try_collect() + pub fn files_batch(&self) -> DeltaResult<&RecordBatch> { + self.files_batch.get_or_try_init(|| { + if self.config.require_files { + self.build_active_files_batch() + } else { + self.build_empty_files_batch() + } }) - .await - .map_err(|e| DeltaTableError::generic(e.to_string()))??; - - let files = concat_batches(&SCAN_ROW_ARROW_SCHEMA, &files)?; - let files = match self.snapshot.inner.parse_stats_column(&files) { - Ok(parsed) => parsed, - Err(_) => files, - }; - - self.files = files; - - Ok(()) } - /// Get the underlying snapshot - pub(crate) fn snapshot(&self) -> &Snapshot { - &self.snapshot + pub fn pruning_stats(&self) -> DeltaResult> { + SnapshotPruningStats::try_new(self.files_batch()?, self) } - /// Get the table version of the snapshot - pub fn version(&self) -> i64 { - self.snapshot.version() + pub async fn all_tombstones( + &self, + log_store: &dyn LogStore, + ) -> DeltaResult> { + Ok(self + .tombstones(log_store) + .try_collect::>() + .await? 
+ .into_iter()) } - /// Get the timestamp of the given version - pub fn version_timestamp(&self, version: i64) -> Option { - for path in &self.snapshot.inner.log_segment().ascending_commit_files { - if path.version as i64 == version { - return Some(path.location.last_modified); + pub async fn unexpired_tombstones( + &self, + log_store: &dyn LogStore, + ) -> DeltaResult> { + let retention_timestamp = Utc::now().timestamp_millis() + - self + .table_properties() + .deleted_file_retention_duration() + .as_millis() as i64; + let tombstones = self.all_tombstones(log_store).await?.collect::>(); + Ok(tombstones + .into_iter() + .filter(move |t| t.deletion_timestamp.unwrap_or(0) > retention_timestamp)) + } + + pub fn add_actions_table(&self, flatten: bool) -> Result { + let actions = self.files_batch()?; + let mut fields: Vec = Vec::new(); + let mut columns: Vec = Vec::new(); + + push_renamed_column( + actions, + FIELD_NAME_PATH, + FIELD_NAME_PATH, + &mut fields, + &mut columns, + )?; + push_renamed_column( + actions, + FIELD_NAME_SIZE, + "size_bytes", + &mut fields, + &mut columns, + )?; + push_renamed_column( + actions, + FIELD_NAME_MODIFICATION_TIME, + "modification_time", + &mut fields, + &mut columns, + )?; + + if let Some(stats) = struct_column(actions, FIELD_NAME_STATS_PARSED) { + let (num_records, nullable) = required_struct_child(stats, STATS_FIELD_NUM_RECORDS)?; + fields.push(Arc::new(Field::new( + "num_records", + num_records.data_type().clone(), + nullable, + ))); + columns.push(num_records); + + if let Some((null_count, nullable)) = + optional_struct_child(stats, STATS_FIELD_NULL_COUNT) + { + fields.push(Arc::new(Field::new( + "null_count", + null_count.data_type().clone(), + nullable, + ))); + columns.push(null_count); + } + if let Some((min_values, nullable)) = + optional_struct_child(stats, STATS_FIELD_MIN_VALUES) + { + fields.push(Arc::new(Field::new( + "min", + min_values.data_type().clone(), + nullable, + ))); + columns.push(min_values); + } + if let 
Some((max_values, nullable)) = + optional_struct_child(stats, STATS_FIELD_MAX_VALUES) + { + fields.push(Arc::new(Field::new( + "max", + max_values.data_type().clone(), + nullable, + ))); + columns.push(max_values); } } - None - } - /// Get the table schema of the snapshot - pub fn schema(&self) -> &StructType { - self.snapshot.schema() - } - - /// Get the table metadata of the snapshot - pub fn metadata(&self) -> &Metadata { - self.snapshot.metadata() - } - - /// Get the table protocol of the snapshot - pub fn protocol(&self) -> &Protocol { - self.snapshot.protocol() - } + if !self.metadata().partition_columns().is_empty() { + push_renamed_column( + actions, + FIELD_NAME_PARTITION_VALUES_PARSED, + "partition", + &mut fields, + &mut columns, + )?; + } - /// Get the table config which is loaded with of the snapshot - pub fn load_config(&self) -> &DeltaTableConfig { - self.snapshot.load_config() + let result = RecordBatch::try_new(Arc::new(ArrowSchema::new(fields)), columns)?; + if flatten { + Ok(result.normalize(".", None)?) 
+ } else { + Ok(result) + } } - /// Well known table configuration - pub fn table_properties(&self) -> &TableProperties { - self.snapshot.table_properties() + pub(crate) fn tombstones( + &self, + _log_store: &dyn LogStore, + ) -> BoxStream<'_, DeltaResult> { + futures::stream::iter(self.removes.iter().cloned().map(Ok::<_, DeltaTableError>)).boxed() } - /// Well known table configuration (alias for table_properties) - pub fn config(&self) -> &TableProperties { - self.table_properties() + pub fn transaction_version(&self, app_id: impl ToString) -> DeltaResult> { + Ok(self + .app_txns + .get(&app_id.to_string()) + .map(|txn| txn.version)) } +} - pub fn table_configuration(&self) -> &TableConfiguration { - self.snapshot.table_configuration() - } +fn push_renamed_column( + batch: &RecordBatch, + input_name: &str, + output_name: &str, + fields: &mut Vec, + columns: &mut Vec, +) -> DeltaResult<()> { + let schema = batch.schema(); + let index = schema.index_of(input_name).map_err(|_| { + DeltaTableError::schema(format!("column {input_name} not found in add actions")) + })?; + let field = schema.field(index); + fields.push(Arc::new(Field::new( + output_name, + field.data_type().clone(), + field.is_nullable(), + ))); + columns.push(batch.column(index).clone()); + Ok(()) +} - /// Get a [`LogDataHandler`] for the snapshot to inspect the currently loaded state of the log. - pub fn log_data(&self) -> LogDataHandler<'_> { - LogDataHandler::new(&self.files, self.snapshot.table_configuration()) - } +fn struct_column<'a>(batch: &'a RecordBatch, name: &str) -> Option<&'a StructArray> { + batch + .column_by_name(name) + .and_then(|array| array.as_any().downcast_ref::()) +} - /// Stream the active files in the snapshot - /// - /// This function returns a stream of [`LogicalFileView`] objects, - /// which represent the active files in the snapshot. - /// - /// # Arguments - /// - /// * `log_store` - A reference to a [`LogStore`] implementation. 
- /// * `predicate` - An optional predicate to filter the files. - /// - /// # Returns - /// - /// A stream of [`LogicalFileView`] objects. - pub fn files( - &self, - log_store: &dyn LogStore, - predicate: Option, - ) -> BoxStream<'_, DeltaResult> { - // TODO: the logic in this function would be more suitable as an async fn rather than - // a stream. However as we are moving from an eager to a cached snapshot, this should be - // a stream just like on the Snapshot. So we swallow the awkward error handling for now - // knowing that we will be able to clean this up soon (TM). - let data = if let Some(predicate) = predicate { - let scan = match self - .snapshot - .inner - .clone() - .scan_builder() - .with_predicate(predicate) - .build() - { - Ok(scan) => scan, - Err(err) => return Box::pin(futures::stream::once(async { Err(err.into()) })), - }; - let engine = log_store.engine(None); - let current_files = self.files.clone(); - let current_version = self.version() as u64; - - // TODO: while we are always re-processing the cached files, we are confident that no IO - // is performed when processing, so for now we are not spawning this on a blocking thread. - // As we continue refactoring, we need to move this onto an actual stream. 
- let files_iter = match scan.scan_metadata_from_arrow( - engine.as_ref(), - current_version, - Box::new(std::iter::once(current_files)), - None, - ) { - Ok(files_iter) => files_iter, - Err(err) => return Box::pin(futures::stream::once(async { Err(err.into()) })), - }; - - let files: Vec<_> = match files_iter.map_ok(|s| s.scan_files).try_collect() { - Ok(files) => files, - Err(err) => return Box::pin(futures::stream::once(async { Err(err.into()) })), - }; - - match concat_batches(&SCAN_ROW_ARROW_SCHEMA, &files) - .map_err(DeltaTableError::from) - .and_then(|batch| self.snapshot.inner.parse_stats_column(&batch)) - { - Ok(files) => files, - Err(err) => return Box::pin(futures::stream::once(async { Err(err) })), - } - } else { - self.files.clone() - }; - let iter = (0..data.num_rows()).map(move |i| Ok(LogicalFileView::new(data.clone(), i))); - futures::stream::iter(iter).boxed() - } +fn required_struct_child( + array: &StructArray, + name: &str, +) -> Result<(ArrayRef, bool), DeltaTableError> { + optional_struct_child(array, name) + .ok_or_else(|| DeltaTableError::schema(format!("{name} field not found in struct column"))) +} - /// Iterate over all latest app transactions - pub async fn transaction_version( - &self, - log_store: &dyn LogStore, - app_id: impl ToString, - ) -> DeltaResult> { - self.snapshot - .application_transaction_version(log_store, app_id.to_string()) - .await - } +fn optional_struct_child(array: &StructArray, name: &str) -> Option<(ArrayRef, bool)> { + let column = array.column_by_name(name)?.clone(); + let nullable = array + .fields() + .iter() + .find(|f| f.name() == name) + .map(|f| f.is_nullable()) + .unwrap_or(true); + Some((column, nullable)) } diff --git a/crates/sail-delta-lake/src/kernel/snapshot/stats.rs b/crates/sail-delta-lake/src/kernel/snapshot/stats.rs new file mode 100644 index 0000000000..0c3e2ada78 --- /dev/null +++ b/crates/sail-delta-lake/src/kernel/snapshot/stats.rs @@ -0,0 +1,427 @@ +// 
https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt +// +// Copyright (2020) QP Hou and a number of other contributors. +// Portions Copyright 2025-2026 LakeSail, Inc. +// Modified in 2026 by LakeSail, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// [Credit]: + +use std::collections::HashSet; +use std::sync::Arc; + +use ::datafusion::common::stats::{ColumnStatistics, Precision, Statistics}; +use arrow_schema::DataType as ArrowDataType; +use datafusion::arrow::array::{ + Array, ArrayRef, BooleanArray, Int64Array, RecordBatch, StringArray, StructArray, UInt64Array, +}; +use datafusion::arrow::compute::sum; +use datafusion::common::scalar::ScalarValue; +use datafusion::common::{Column, DataFusionError}; +use datafusion::functions_aggregate::min_max::{MaxAccumulator, MinAccumulator}; +use datafusion::physical_optimizer::pruning::PruningStatistics; +use datafusion::physical_plan::Accumulator; +use log::warn; + +use super::DeltaSnapshot; +use crate::spec::fields::{ + FIELD_NAME_PARTITION_VALUES_PARSED, FIELD_NAME_SIZE, FIELD_NAME_STATS_PARSED, + STATS_FIELD_MAX_VALUES, STATS_FIELD_MIN_VALUES, STATS_FIELD_NULL_COUNT, + STATS_FIELD_NUM_RECORDS, +}; +use crate::spec::{DeltaError as DeltaTableError, DeltaResult}; + +#[derive(Debug, Clone)] +enum AccumulatorType { + Min, + Max, +} + +// TODO validate this works with "wide and narrow" builds / stats + +/// Pruning/statistics view over a materialized snapshot file 
batch. +#[derive(Clone)] +pub struct SnapshotPruningStats<'a> { + data: &'a RecordBatch, + snapshot: &'a DeltaSnapshot, + sizes: &'a Int64Array, + stats: &'a StructArray, +} + +impl<'a> SnapshotPruningStats<'a> { + pub(crate) fn try_new(data: &'a RecordBatch, snapshot: &'a DeltaSnapshot) -> DeltaResult { + let sizes = batch_column::(data, FIELD_NAME_SIZE)?; + let stats = batch_column::(data, FIELD_NAME_STATS_PARSED)?; + Ok(Self { + data, + snapshot, + sizes, + stats, + }) + } + + /// The number of files in the log data. + pub fn num_files(&self) -> usize { + self.data.num_rows() + } + + fn collect_count(&self, name: &str) -> Precision { + let num_records = nested_struct_column_exact_or_path(self.stats, name) + .and_then(|col| col.as_any().downcast_ref::()); + if let Some(num_records) = num_records { + if num_records.is_empty() { + Precision::Exact(0) + } else if let Some(null_count_mulls) = num_records.nulls() { + if null_count_mulls.null_count() > 0 { + Precision::Absent + } else { + sum(num_records) + .map(|s| Precision::Exact(s as usize)) + .unwrap_or(Precision::Absent) + } + } else { + sum(num_records) + .map(|s| Precision::Exact(s as usize)) + .unwrap_or(Precision::Absent) + } + } else { + Precision::Absent + } + } + + fn column_bounds( + &self, + path_step: &str, + name: &str, + fun_type: AccumulatorType, + ) -> Precision { + let array = match nested_column(self.stats, path_step, name) { + Ok(array) => array, + Err(_) => return Precision::Absent, + }; + let array_ref = array.as_ref(); + + if array_ref.data_type().is_primitive() { + let accumulator: Option> = match fun_type { + AccumulatorType::Min => MinAccumulator::try_new(array_ref.data_type()) + .map_or(None, |a| Some(Box::new(a))), + AccumulatorType::Max => MaxAccumulator::try_new(array_ref.data_type()) + .map_or(None, |a| Some(Box::new(a))), + }; + + if let Some(mut accumulator) = accumulator { + return accumulator + .update_batch(std::slice::from_ref(array)) + .ok() + .and_then(|_| 
accumulator.evaluate().ok()) + .map(Precision::Exact) + .unwrap_or(Precision::Absent); + } + + return Precision::Absent; + } + + match array_ref.data_type() { + ArrowDataType::Struct(fields) => fields + .iter() + .map(|f| { + self.column_bounds(path_step, &format!("{name}.{}", f.name()), fun_type.clone()) + }) + .map(|s| match s { + Precision::Exact(s) => Some(s), + _ => None, + }) + .collect::>>() + .map(|o| { + let arrays = match o + .into_iter() + .map(|sv| sv.to_array()) + .collect::, DataFusionError>>() + { + Ok(arrays) => arrays, + Err(_) => return Precision::Absent, + }; + let sa = StructArray::new(fields.clone(), arrays, None); + Precision::Exact(ScalarValue::Struct(Arc::new(sa))) + }) + .unwrap_or(Precision::Absent), + _ => Precision::Absent, + } + } + + fn num_records(&self) -> Precision { + self.collect_count(STATS_FIELD_NUM_RECORDS) + } + + fn total_size_files(&self) -> Precision { + let size = self + .sizes + .iter() + .flat_map(|s| s.map(|s| s as usize)) + .sum::(); + Precision::Inexact(size) + } + + fn build_column_stats(&self, name: impl AsRef) -> DeltaResult { + let null_count_col = format!("{STATS_FIELD_NULL_COUNT}.{}", name.as_ref()); + let null_count = self.collect_count(&null_count_col); + + let min_value = + self.column_bounds(STATS_FIELD_MIN_VALUES, name.as_ref(), AccumulatorType::Min); + let min_value = match &min_value { + Precision::Exact(value) if value.is_null() => Precision::Absent, + // TODO this is a hack, we should not be casting here but rather when we read the checkpoint data. + // it seems sometimes the min/max values are stored as nanoseconds and sometimes as microseconds? 
+ Precision::Exact(ScalarValue::TimestampNanosecond(a, b)) => Precision::Exact( + ScalarValue::TimestampMicrosecond(a.map(|v| v / 1000), b.clone()), + ), + _ => min_value, + }; + + let max_value = + self.column_bounds(STATS_FIELD_MAX_VALUES, name.as_ref(), AccumulatorType::Max); + let max_value = match &max_value { + Precision::Exact(value) if value.is_null() => Precision::Absent, + Precision::Exact(ScalarValue::TimestampNanosecond(a, b)) => Precision::Exact( + ScalarValue::TimestampMicrosecond(a.map(|v| v / 1000), b.clone()), + ), + _ => max_value, + }; + + Ok(ColumnStatistics { + null_count, + max_value, + min_value, + sum_value: Precision::Absent, + distinct_count: Precision::Absent, + byte_size: Precision::Absent, + }) + } + + pub(crate) fn column_stats(&self, name: impl AsRef) -> Option { + self.build_column_stats(name).ok() + } + + pub(crate) fn statistics(&self) -> Option { + let num_rows = self.num_records(); + let total_byte_size = self.total_size_files(); + let column_statistics = self + .snapshot + .schema() + .fields() + .iter() + .map(|field| self.column_stats(field.name())) + .collect::>>()?; + Some(Statistics { + num_rows, + total_byte_size, + column_statistics, + }) + } + + fn pick_stats(&self, column: &Column, stats_field: &'static str) -> Option { + let schema = self.snapshot.schema(); + let field = schema.field_with_name(&column.name).ok()?; + // See issue #1214. 
Binary type does not support natural order which is required for Datafusion to prune + if matches!( + field.data_type(), + ArrowDataType::Binary | ArrowDataType::LargeBinary | ArrowDataType::BinaryView + ) { + return None; + } + if self + .snapshot + .metadata() + .partition_columns() + .contains(&column.name) + { + let partition_values = + match batch_column::(self.data, FIELD_NAME_PARTITION_VALUES_PARSED) { + Ok(values) => values, + Err(err) => { + warn!( + "Failed to access partitionValues_parsed for column {}: {err}", + column.name() + ); + return None; + } + }; + return nested_struct_column_exact_or_path(partition_values, &column.name).cloned(); + } + + nested_column(self.stats, stats_field, &column.name) + .ok() + .cloned() + } +} + +fn batch_column<'a, T: Array + 'static>(batch: &'a RecordBatch, name: &str) -> DeltaResult<&'a T> { + batch + .column_by_name(name) + .and_then(|col| col.as_any().downcast_ref::()) + .ok_or_else(|| DeltaTableError::schema(format!("column {name} not found in log data"))) +} + +fn nested_column<'a>( + array: &'a StructArray, + root: &str, + name: &str, +) -> Result<&'a Arc, DeltaTableError> { + let current = array.column_by_name(root).ok_or_else(|| { + DeltaTableError::schema(format!("{root} column not found in stats struct")) + })?; + let struct_array = current + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DeltaTableError::schema(format!("Expected struct column for {root} in stats struct")) + })?; + nested_struct_column_exact_or_path(struct_array, name) + .ok_or_else(|| DeltaTableError::schema(format!("{name} column not found in stats struct"))) +} + +fn nested_struct_column_exact_or_path<'a>( + array: &'a StructArray, + name: &str, +) -> Option<&'a Arc> { + if let Some(current) = array.column_by_name(name) { + return Some(current); + } + + let mut path_iter = name.split('.'); + let first = path_iter.next()?; + let mut current = array.column_by_name(first)?; + for segment in path_iter { + let struct_array = 
current.as_any().downcast_ref::()?; + current = struct_array.column_by_name(segment)?; + } + Some(current) +} + +impl PruningStatistics for SnapshotPruningStats<'_> { + /// return the minimum values for the named column, if known. + /// Note: the returned array must contain `num_containers()` rows + fn min_values(&self, column: &Column) -> Option { + self.pick_stats(column, STATS_FIELD_MIN_VALUES) + } + + /// return the maximum values for the named column, if known. + /// Note: the returned array must contain `num_containers()` rows. + fn max_values(&self, column: &Column) -> Option { + self.pick_stats(column, STATS_FIELD_MAX_VALUES) + } + + /// return the number of containers (e.g. row groups) being + /// pruned with these statistics + fn num_containers(&self) -> usize { + self.data.num_rows() + } + + /// return the number of null values for the named column as an + /// `Option`. + /// + /// Note: the returned array must contain `num_containers()` rows. + fn null_counts(&self, column: &Column) -> Option { + if !self + .snapshot + .metadata() + .partition_columns() + .contains(&column.name) + { + let counts = self.pick_stats(column, STATS_FIELD_NULL_COUNT)?; + return ::datafusion::arrow::compute::cast(counts.as_ref(), &ArrowDataType::UInt64) + .ok(); + } + let partition_values = self.pick_stats(column, "__dummy__")?; + let row_counts = self.row_counts(column)?; + let row_counts = row_counts.as_any().downcast_ref::()?; + let mut null_counts = Vec::with_capacity(partition_values.len()); + for i in 0..partition_values.len() { + let null_count = if partition_values.is_null(i) { + row_counts.value(i) + } else { + 0 + }; + null_counts.push(null_count); + } + Some(Arc::new(UInt64Array::from(null_counts))) + } + + /// return the number of rows for the named column in each container + /// as an `Option`. 
+ /// + /// Note: the returned array must contain `num_containers()` rows + fn row_counts(&self, _column: &Column) -> Option { + let row_counts = + nested_struct_column_exact_or_path(self.stats, STATS_FIELD_NUM_RECORDS)?.clone(); + ::datafusion::arrow::compute::cast(row_counts.as_ref(), &ArrowDataType::UInt64).ok() + } + + // This function is optional but will optimize partition column pruning + fn contained(&self, column: &Column, value: &HashSet) -> Option { + if value.is_empty() + || !self + .snapshot + .metadata() + .partition_columns() + .contains(&column.name) + { + return None; + } + + // Retrieve the partition values for the column + let partition_values = self.pick_stats(column, "__dummy__")?; + + let partition_values = partition_values + .as_any() + .downcast_ref::() + .ok_or(DeltaTableError::generic( + "failed to downcast string result to StringArray.", + )) + .ok()?; + + let mut contains = Vec::with_capacity(partition_values.len()); + + // TODO: this was inspired by parquet's BloomFilter pruning, decide if we should + // just convert to Vec for a subset of column types and use .contains + fn check_scalar(pv: &str, value: &ScalarValue) -> bool { + match value { + ScalarValue::Utf8(Some(v)) + | ScalarValue::Utf8View(Some(v)) + | ScalarValue::LargeUtf8(Some(v)) => pv == v, + + ScalarValue::Dictionary(_, inner) => check_scalar(pv, inner), + // FIXME: is this a good enough default or should we sync this with + // expr_applicable_for_cols and bail out with None + _ => value.to_string() == pv, + } + } + + for i in 0..partition_values.len() { + if partition_values.is_null(i) { + // For IS NULL predicates, we want to include NULL partitions + let contains_null = value.iter().any(|scalar| scalar.is_null()); + contains.push(contains_null); + } else { + contains.push( + value + .iter() + .any(|scalar| check_scalar(partition_values.value(i), scalar)), + ); + } + } + + Some(BooleanArray::from(contains)) + } +} diff --git 
a/crates/sail-delta-lake/src/kernel/snapshot/stream.rs b/crates/sail-delta-lake/src/kernel/snapshot/stream.rs deleted file mode 100644 index c91945da57..0000000000 --- a/crates/sail-delta-lake/src/kernel/snapshot/stream.rs +++ /dev/null @@ -1,200 +0,0 @@ -// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt -// -// Copyright (2020) QP Hou and a number of other contributors. -// Portions Copyright (2025) LakeSail, Inc. -// Modified in 2025 by LakeSail, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// [Credit]: - -//! the code in this file is hoisted from datafusion with only slight modifications -//! - -use std::pin::Pin; - -use datafusion::arrow::datatypes::SchemaRef; -use datafusion::arrow::record_batch::RecordBatch; -use futures::stream::BoxStream; -use futures::{Future, Stream, StreamExt}; -use tokio::sync::mpsc::{Receiver, Sender}; -use tokio::task::JoinSet; - -use crate::kernel::models::Add; -use crate::kernel::{DeltaResult, DeltaTableError}; - -/// Trait for types that stream [RecordBatch] -/// -/// See [`SendableRecordBatchStream`] for more details. -#[expect(dead_code)] -pub trait RecordBatchStream: Stream> { - /// Returns the schema of this `RecordBatchStream`. - /// - /// Implementation of this trait should guarantee that all `RecordBatch`'s returned by this - /// stream should have the same schema as returned from this method. 
- fn schema(&self) -> SchemaRef; -} - -/// Trait for a [`Stream`] of [`RecordBatch`]es that can be passed between threads -/// -/// This trait is used to retrieve the results of DataFusion execution plan nodes. -/// -/// The trait is a specialized Rust Async [`Stream`] that also knows the schema -/// of the data it will return (even if the stream has no data). Every -/// `RecordBatch` returned by the stream should have the same schema as returned -/// by [`schema`](`RecordBatchStream::schema`). -/// -/// # See Also -/// -/// * [`RecordBatchStreamAdapter`] to convert an existing [`Stream`] -/// to [`SendableRecordBatchStream`] -/// -/// [`RecordBatchStreamAdapter`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/stream/struct.RecordBatchStreamAdapter.html -/// -/// # Error Handling -/// -/// Once a stream returns an error, it should not be polled again (the caller -/// should stop calling `next`) and handle the error. -/// -/// However, returning `Ready(None)` (end of stream) is likely the safest -/// behavior after an error. Like [`Stream`]s, `RecordBatchStream`s should not -/// be polled after end of stream or returning an error. However, also like -/// [`Stream`]s there is no mechanism to prevent callers polling so returning -/// `Ready(None)` is recommended. -#[expect(dead_code)] -pub type SendableRecordBatchStream = Pin>; - -pub type SendableRBStream = Pin> + Send>>; - -#[expect(dead_code)] -pub type SendableAddStream = Pin> + Send>>; - -/// Creates a stream from a collection of producing tasks, routing panics to the stream. -/// -/// Note that this is similar to [`ReceiverStream` from tokio-stream], with the differences being: -/// -/// 1. Methods to bound and "detach" tasks (`spawn()` and `spawn_blocking()`). -/// -/// 2. Propagates panics, whereas the `tokio` version doesn't propagate panics to the receiver. -/// -/// 3. Automatically cancels any outstanding tasks when the receiver stream is dropped. 
-/// -/// [`ReceiverStream` from tokio-stream]: https://docs.rs/tokio-stream/latest/tokio_stream/wrappers/struct.ReceiverStream.html -pub(crate) struct ReceiverStreamBuilder { - tx: Sender>, - rx: Receiver>, - join_set: JoinSet>, -} - -impl ReceiverStreamBuilder { - /// Create new channels with the specified buffer size - pub fn new(capacity: usize) -> Self { - let (tx, rx) = tokio::sync::mpsc::channel(capacity); - - Self { - tx, - rx, - join_set: JoinSet::new(), - } - } - - /// Get a handle for sending data to the output - pub fn tx(&self) -> Sender> { - self.tx.clone() - } - - /// Spawn task that will be aborted if this builder (or the stream - /// built from it) are dropped - #[expect(dead_code)] - pub fn spawn(&mut self, task: F) - where - F: Future>, - F: Send + 'static, - { - self.join_set.spawn(task); - } - - /// Spawn a blocking task that will be aborted if this builder (or the stream - /// built from it) are dropped. - /// - /// This is often used to spawn tasks that write to the sender - /// retrieved from `Self::tx`. 
- pub fn spawn_blocking(&mut self, f: F) - where - F: FnOnce() -> DeltaResult<()>, - F: Send + 'static, - { - self.join_set.spawn_blocking(f); - } - - /// Create a stream of all data written to `tx` - pub fn build(self) -> BoxStream<'static, DeltaResult> { - let Self { - tx, - rx, - mut join_set, - } = self; - - // Doesn't need tx - drop(tx); - - // future that checks the result of the join set, and propagates panic if seen - let check = async move { - while let Some(result) = join_set.join_next().await { - match result { - Ok(task_result) => { - match task_result { - // Nothing to report - Ok(_) => continue, - // This means a blocking task error - Err(error) => return Some(Err(error)), - } - } - // This means a tokio task error, likely a panic - Err(e) => { - if e.is_panic() { - // resume on the main thread - std::panic::resume_unwind(e.into_panic()); - } else { - // This should only occur if the task is - // cancelled, which would only occur if - // the JoinSet were aborted, which in turn - // would imply that the receiver has been - // dropped and this code is not running - return Some(Err(DeltaTableError::generic(format!( - "Non Panic Task error: {e}" - )))); - } - } - } - } - None - }; - - let check_stream = futures::stream::once(check) - // unwrap Option / only return the error - .filter_map(|item| async move { item }); - - // Convert the receiver into a stream - let rx_stream = futures::stream::unfold(rx, |mut rx| async move { - let next_item = rx.recv().await; - next_item.map(|next_item| (next_item, rx)) - }); - - // Merge the streams together so whichever is ready first - // produces the batch - futures::stream::select(rx_stream, check_stream).boxed() - } -} - -pub(crate) type RecordBatchReceiverStreamBuilder = ReceiverStreamBuilder; diff --git a/crates/sail-delta-lake/src/kernel/statistics.rs b/crates/sail-delta-lake/src/kernel/statistics.rs deleted file mode 100644 index beae6aa3a9..0000000000 --- a/crates/sail-delta-lake/src/kernel/statistics.rs +++ 
/dev/null @@ -1,101 +0,0 @@ -use std::collections::HashMap; - -use serde::{Deserialize, Serialize}; - -/// Column statistics stored in `Stats`. -#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] -#[serde(untagged)] -pub enum ColumnValueStat { - Column(HashMap), - Value(serde_json::Value), -} - -impl ColumnValueStat { - pub fn as_column(&self) -> Option<&HashMap> { - match self { - ColumnValueStat::Column(m) => Some(m), - _ => None, - } - } - - pub fn as_value(&self) -> Option<&serde_json::Value> { - match self { - ColumnValueStat::Value(v) => Some(v), - _ => None, - } - } -} - -/// Column null-count statistics stored in `Stats`. -#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] -#[serde(untagged)] -pub enum ColumnCountStat { - Column(HashMap), - Value(i64), -} - -impl ColumnCountStat { - pub fn as_column(&self) -> Option<&HashMap> { - match self { - ColumnCountStat::Column(m) => Some(m), - _ => None, - } - } - - pub fn as_value(&self) -> Option { - match self { - ColumnCountStat::Value(v) => Some(*v), - _ => None, - } - } -} - -/// Statistics associated with an Add action. 
-#[derive(Serialize, Deserialize, Debug, Default, PartialEq, Eq)] -#[serde(rename_all = "camelCase")] -pub struct Stats { - pub num_records: i64, - pub min_values: HashMap, - pub max_values: HashMap, - pub null_count: HashMap, -} - -impl Stats { - pub fn from_json_str(value: &str) -> Result { - serde_json::from_str::(value).map(|stats| stats.into_stats()) - } - - pub fn from_json_opt(value: Option<&str>) -> Result, serde_json::error::Error> { - value.map(Self::from_json_str).transpose() - } - - pub fn to_json_string(&self) -> Result { - serde_json::to_string(self) - } -} - -#[derive(Serialize, Deserialize, Debug, Default, PartialEq, Eq)] -#[serde(rename_all = "camelCase")] -struct PartialStats { - pub num_records: i64, - pub min_values: Option>, - pub max_values: Option>, - pub null_count: Option>, -} - -impl PartialStats { - fn into_stats(self) -> Stats { - let PartialStats { - num_records, - min_values, - max_values, - null_count, - } = self; - Stats { - num_records, - min_values: min_values.unwrap_or_default(), - max_values: max_values.unwrap_or_default(), - null_count: null_count.unwrap_or_default(), - } - } -} diff --git a/crates/sail-delta-lake/src/kernel/table_properties.rs b/crates/sail-delta-lake/src/kernel/table_properties.rs deleted file mode 100644 index a3f3427cd3..0000000000 --- a/crates/sail-delta-lake/src/kernel/table_properties.rs +++ /dev/null @@ -1,72 +0,0 @@ -// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt -// -// Copyright (2020) QP Hou and a number of other contributors. -// Portions Copyright (2025) LakeSail, Inc. -// Modified in 2025 by LakeSail, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// [Credit]: -use std::num::NonZeroU64; -use std::time::Duration; - -use delta_kernel::table_properties::{IsolationLevel, TableProperties}; - -/// Convenience helpers for accessing common table properties with sensible defaults. -pub trait TablePropertiesExt { - fn append_only(&self) -> bool; - fn log_retention_duration(&self) -> Duration; - fn enable_expired_log_cleanup(&self) -> bool; - fn checkpoint_interval(&self) -> NonZeroU64; - fn deleted_file_retention_duration(&self) -> Duration; - fn isolation_level(&self) -> IsolationLevel; -} - -const SECONDS_PER_MINUTE: u64 = 60; -const SECONDS_PER_HOUR: u64 = 60 * SECONDS_PER_MINUTE; -const SECONDS_PER_DAY: u64 = 24 * SECONDS_PER_HOUR; -const SECONDS_PER_WEEK: u64 = 7 * SECONDS_PER_DAY; - -const DEFAULT_LOG_RETENTION_SECS: u64 = 30 * SECONDS_PER_DAY; -const DEFAULT_DELETED_FILE_RETENTION_SECS: u64 = SECONDS_PER_WEEK; -const DEFAULT_CHECKPOINT_INTERVAL: NonZeroU64 = - NonZeroU64::new(100).expect("non-zero checkpoint interval"); - -impl TablePropertiesExt for TableProperties { - fn append_only(&self) -> bool { - self.append_only.unwrap_or(false) - } - - fn log_retention_duration(&self) -> Duration { - self.log_retention_duration - .unwrap_or(Duration::from_secs(DEFAULT_LOG_RETENTION_SECS)) - } - - fn enable_expired_log_cleanup(&self) -> bool { - self.enable_expired_log_cleanup.unwrap_or(true) - } - - fn checkpoint_interval(&self) -> NonZeroU64 { - self.checkpoint_interval - .unwrap_or(DEFAULT_CHECKPOINT_INTERVAL) - } - - fn deleted_file_retention_duration(&self) -> Duration { - 
self.deleted_file_retention_duration - .unwrap_or(Duration::from_secs(DEFAULT_DELETED_FILE_RETENTION_SECS)) - } - - fn isolation_level(&self) -> IsolationLevel { - self.isolation_level.unwrap_or_default() - } -} diff --git a/crates/sail-delta-lake/src/kernel/transaction/conflict_checker.rs b/crates/sail-delta-lake/src/kernel/transaction/conflict_checker.rs index 1facc8ec27..285a2e7fa7 100644 --- a/crates/sail-delta-lake/src/kernel/transaction/conflict_checker.rs +++ b/crates/sail-delta-lake/src/kernel/transaction/conflict_checker.rs @@ -22,55 +22,13 @@ use std::collections::HashSet; -use delta_kernel::table_properties::IsolationLevel; -use delta_kernel::Error as KernelError; -use thiserror::Error; - -use crate::kernel::models::{Action, Add, CommitInfo, Metadata, Protocol, Remove, Transaction}; -use crate::kernel::snapshot::LogDataHandler; -use crate::kernel::{DeltaOperation, DeltaResult, TablePropertiesExt}; +use crate::kernel::DeltaOperation; +use crate::spec::{ + Action, Add, CommitConflictError, CommitInfo, DeltaError, DeltaResult, IsolationLevel, + Metadata, Protocol, Remove, Transaction, +}; use crate::storage::{get_actions, LogStore}; - -/// Exceptions raised during commit conflict resolution. -#[derive(Error, Debug)] -pub enum CommitConflictError { - #[error("Commit failed: a concurrent transactions added new data.\nHelp: This transaction's query must be rerun to include the new data. Also, if you don't care to require this check to pass in the future, the isolation level can be set to Snapshot Isolation.")] - ConcurrentAppend, - - #[error("Commit failed: a concurrent transaction deleted data this operation read.\nHelp: This transaction's query must be rerun to exclude the removed data. 
Also, if you don't care to require this check to pass in the future, the isolation level can be set to Snapshot Isolation.")] - ConcurrentDeleteRead, - - #[error("Commit failed: a concurrent transaction deleted the same data your transaction deletes.\nHelp: you should retry this write operation. If it was based on data contained in the table, you should rerun the query generating the data.")] - ConcurrentDeleteDelete, - - #[error("Metadata changed since last commit.")] - MetadataChanged, - - #[error("Concurrent transaction failed.")] - ConcurrentTransaction, - - #[error("Protocol changed since last commit: {0}")] - ProtocolChanged(String), - - #[error("Delta-rs does not support writer version {0}")] - UnsupportedWriterVersion(i32), - - #[error("Delta-rs does not support reader version {0}")] - UnsupportedReaderVersion(i32), - - #[error("Snapshot is corrupted: {source}")] - CorruptedState { - source: Box, - }, - - #[error("Error evaluating predicate: {source}")] - Predicate { - source: Box, - }, - - #[error("No metadata found, please make sure table is loaded.")] - NoMetadata, -} +use crate::table::DeltaSnapshot; /// A struct representing different attributes of current transaction needed for conflict detection. 
#[expect(unused)] @@ -80,15 +38,15 @@ pub(crate) struct TransactionInfo<'a> { read_app_ids: HashSet, /// delta log actions that the transaction wants to commit actions: &'a [Action], - /// read [`DeltaTableState`] used for the transaction - read_snapshot: LogDataHandler<'a>, + /// read snapshot used for the transaction + read_snapshot: &'a DeltaSnapshot, /// Whether the transaction tainted the whole table read_whole_table: bool, } impl<'a> TransactionInfo<'a> { pub fn try_new( - read_snapshot: LogDataHandler<'a>, + read_snapshot: &'a DeltaSnapshot, actions: &'a [Action], read_whole_table: bool, ) -> DeltaResult { @@ -103,7 +61,7 @@ impl<'a> TransactionInfo<'a> { } pub fn new( - read_snapshot: LogDataHandler<'a>, + read_snapshot: &'a DeltaSnapshot, actions: &'a [Action], read_whole_table: bool, ) -> Self { @@ -149,7 +107,7 @@ impl<'a> TransactionInfo<'a> { /// Files read by the transaction pub fn read_files(&self) -> Result + '_, CommitConflictError> { - Ok(self.read_snapshot.iter().map(|f| f.add_action())) + Ok(self.read_snapshot.adds().iter().cloned()) } /// Whether the whole table was read during the transaction @@ -207,7 +165,7 @@ impl WinningCommitSummary { commit_info, }) } - None => Err(KernelError::MissingVersion.into()), + None => Err(DeltaError::MissingVersion), } } diff --git a/crates/sail-delta-lake/src/kernel/transaction/mod.rs b/crates/sail-delta-lake/src/kernel/transaction/mod.rs index 2150840474..6786321dd3 100644 --- a/crates/sail-delta-lake/src/kernel/transaction/mod.rs +++ b/crates/sail-delta-lake/src/kernel/transaction/mod.rs @@ -24,25 +24,22 @@ use std::sync::Arc; use async_trait::async_trait; use bytes::Bytes; use chrono::Utc; -use delta_kernel::table_features::TableFeature; -use delta_kernel::table_properties::TableProperties; use futures::future::BoxFuture; use log::*; -use object_store::path::Path; -use object_store::Error as ObjectStoreError; +use object_store::{Error as ObjectStoreError, ObjectStoreExt, PutMode, PutOptions}; use 
serde::{Deserialize, Serialize}; use serde_json::Value; -use thiserror::Error; use uuid::Uuid; -use crate::error::{DeltaError, KernelError}; -use crate::kernel::checkpoints::{cleanup_expired_logs_for, create_checkpoint_for}; -use crate::kernel::models::{Action, Metadata, Protocol, Transaction}; -use crate::kernel::snapshot::EagerSnapshot; +use crate::delta_log::cleanup::cleanup_expired_delta_log_files; +use crate::delta_log::{resolve_effective_protocol_and_metadata, resolve_version_timestamp}; +use crate::kernel::checkpoints::create_checkpoint_for; use crate::kernel::transaction::conflict_checker::{TransactionInfo, WinningCommitSummary}; -use crate::kernel::{DeltaOperation, DeltaResult, TablePropertiesExt}; +use crate::kernel::DeltaOperation; +use crate::spec::{checksum_path, temp_commit_path, Action, DeltaError, DeltaResult, Transaction}; +pub use crate::spec::{CommitConflictError, TransactionError}; use crate::storage::{CommitOrBytes, LogStoreRef, ObjectStoreRef}; -use crate::table::DeltaTableState; +use crate::table::DeltaSnapshot; mod conflict_checker; mod protocol; @@ -50,7 +47,6 @@ mod protocol; use conflict_checker::ConflictChecker; pub use protocol::INSTANCE as PROTOCOL; -const DELTA_LOG_FOLDER: &str = "_delta_log"; pub(crate) const DEFAULT_RETRIES: usize = 15; #[derive(Default, Debug, PartialEq, Clone, Serialize, Deserialize)] @@ -74,7 +70,7 @@ pub struct Metrics { pub num_log_files_cleaned_up: u64, } -#[derive(Default, Debug, PartialEq, Clone)] +#[derive(Default, Debug, PartialEq, Clone, Serialize, Deserialize)] pub struct OperationMetrics { pub num_files: Option, pub num_output_rows: Option, @@ -82,6 +78,10 @@ pub struct OperationMetrics { pub execution_time_ms: Option, pub num_removed_files: Option, pub num_added_files: Option, + pub num_output_files: Option, + pub num_added_bytes: Option, + pub num_removed_bytes: Option, + pub write_time_ms: Option, pub extra: HashMap, } @@ -106,8 +106,42 @@ impl OperationMetrics { if let Some(v) = self.num_added_files 
{ out.insert("numAddedFiles".to_string(), Value::from(v)); } + if let Some(v) = self.num_output_files { + out.insert("numOutputFiles".to_string(), Value::from(v)); + } + if let Some(v) = self.num_added_bytes { + out.insert("numAddedBytes".to_string(), Value::from(v)); + } + if let Some(v) = self.num_removed_bytes { + out.insert("numRemovedBytes".to_string(), Value::from(v)); + } + if let Some(v) = self.write_time_ms { + out.insert("writeTimeMs".to_string(), Value::from(v)); + } out } + + pub fn merge(&mut self, other: Self) { + fn merge_opt(target: &mut Option, source: Option) { + if let Some(source) = source { + let merged = target.unwrap_or_default().saturating_add(source); + *target = Some(merged); + } + } + + merge_opt(&mut self.num_files, other.num_files); + merge_opt(&mut self.num_output_rows, other.num_output_rows); + merge_opt(&mut self.num_output_bytes, other.num_output_bytes); + merge_opt(&mut self.execution_time_ms, other.execution_time_ms); + merge_opt(&mut self.num_removed_files, other.num_removed_files); + merge_opt(&mut self.num_added_files, other.num_added_files); + merge_opt(&mut self.num_output_files, other.num_output_files); + merge_opt(&mut self.num_added_bytes, other.num_added_bytes); + merge_opt(&mut self.num_removed_bytes, other.num_removed_bytes); + merge_opt(&mut self.write_time_ms, other.write_time_ms); + + self.extra.extend(other.extra); + } } impl From> for OperationMetrics { @@ -129,6 +163,10 @@ impl From> for OperationMetrics { let execution_time_ms = take_u64(&mut value, "executionTimeMs"); let num_removed_files = take_u64(&mut value, "numRemovedFiles"); let num_added_files = take_u64(&mut value, "numAddedFiles"); + let num_output_files = take_u64(&mut value, "numOutputFiles"); + let num_added_bytes = take_u64(&mut value, "numAddedBytes"); + let num_removed_bytes = take_u64(&mut value, "numRemovedBytes"); + let write_time_ms = take_u64(&mut value, "writeTimeMs"); Self { num_files, @@ -137,48 +175,25 @@ impl From> for OperationMetrics 
{ execution_time_ms, num_removed_files, num_added_files, + num_output_files, + num_added_bytes, + num_removed_bytes, + write_time_ms, extra: value, } } } -#[derive(Error, Debug)] -pub enum TransactionError { - #[error("Tried committing existing table version: {0}")] - VersionAlreadyExists(i64), - - #[error("Error serializing commit log to json: {json_err}")] - SerializeLogJson { json_err: serde_json::error::Error }, - - #[error("Log storage error: {source}")] - ObjectStore { - #[from] - source: ObjectStoreError, - }, - - #[error("Failed to commit transaction: {0}")] - CommitConflict(#[from] conflict_checker::CommitConflictError), - - #[error("Failed to commit transaction: {0}")] - MaxCommitAttempts(i32), - - #[error( - "The transaction includes Remove action with data change but Delta table is append-only" - )] - DeltaTableAppendOnly, - - #[error("Unsupported table features required: {0:?}")] - UnsupportedTableFeatures(Vec), - - #[error("Table features must be specified, please specify: {0:?}")] - TableFeaturesRequired(TableFeature), - - #[error("Transaction failed: {msg}")] - LogStoreError { - msg: String, - #[source] - source: Box, - }, +fn actions_to_log_bytes(actions: &[Action]) -> Result { + let mut buf: Vec = Vec::new(); + for (index, action) in actions.iter().enumerate() { + if index > 0 { + buf.push(b'\n'); + } + serde_json::to_writer(&mut buf, action) + .map_err(|e| TransactionError::SerializeLogJson { json_err: e })?; + } + Ok(Bytes::from(buf)) } #[derive(Debug)] @@ -196,46 +211,50 @@ impl CommitData { app_transactions: Vec, ) -> Self { let is_blind_append = Self::is_blind_append(&actions, &operation); - - let mut has_commit_info = false; - for action in actions.iter_mut() { - if let Action::CommitInfo(info) = action { - info.is_blind_append = Some(is_blind_append); - has_commit_info = true; - } + let mut commit_info = actions + .iter() + .find_map(|action| match action { + Action::CommitInfo(info) => Some(info.clone()), + _ => None, + }) + 
.unwrap_or_else(|| operation.get_commit_info()); + if commit_info.in_commit_timestamp.is_none() { + commit_info.in_commit_timestamp = commit_info + .info + .remove("inCommitTimestamp") + .and_then(|value| value.as_i64()); + } else { + commit_info.info.remove("inCommitTimestamp"); } - - if !has_commit_info { - let mut commit_info = operation.get_commit_info(); - commit_info.timestamp = Some(Utc::now().timestamp_millis()); - commit_info.is_blind_append = Some(is_blind_append); - app_metadata.insert( - "clientVersion".to_string(), - Value::String(format!("sail-delta-lake.{}", env!("CARGO_PKG_VERSION"))), + commit_info.is_blind_append = Some(is_blind_append); + app_metadata + .entry("clientVersion".to_string()) + .or_insert_with(|| { + Value::String(format!("sail-delta-lake.{}", env!("CARGO_PKG_VERSION"))) + }); + // Merge operationMetrics into the final commitInfo.info. + // If the caller also provided `operationMetrics` in app metadata, merge both. + let mut merged_operation_metrics: HashMap = HashMap::new(); + if let Some(Value::Object(obj)) = commit_info.info.get("operationMetrics").cloned() { + merged_operation_metrics.extend(obj); + } + if let Some(Value::Object(obj)) = app_metadata.get("operationMetrics").cloned() { + merged_operation_metrics.extend(obj); + } + merged_operation_metrics.extend(operation_metrics.into_map()); + + // Merge base info + app metadata (app metadata wins on conflicts). + let mut merged_info = commit_info.info.clone(); + merged_info.extend(app_metadata.clone()); + if !merged_operation_metrics.is_empty() { + merged_info.insert( + "operationMetrics".to_string(), + Value::Object(merged_operation_metrics.into_iter().collect()), ); - // Merge operationMetrics into the final commitInfo.info. - // If the caller also provided `operationMetrics` in app metadata, merge both. 
- let mut merged_operation_metrics: HashMap = HashMap::new(); - if let Some(Value::Object(obj)) = commit_info.info.get("operationMetrics").cloned() { - merged_operation_metrics.extend(obj); - } - if let Some(Value::Object(obj)) = app_metadata.get("operationMetrics").cloned() { - merged_operation_metrics.extend(obj); - } - merged_operation_metrics.extend(operation_metrics.into_map()); - - // Merge base info + app metadata (app metadata wins on conflicts). - let mut merged_info = commit_info.info.clone(); - merged_info.extend(app_metadata.clone()); - if !merged_operation_metrics.is_empty() { - merged_info.insert( - "operationMetrics".to_string(), - Value::Object(merged_operation_metrics.into_iter().collect()), - ); - } - commit_info.info = merged_info; - actions.push(Action::CommitInfo(commit_info)); } + commit_info.info = merged_info; + actions.retain(|action| !matches!(action, Action::CommitInfo(_))); + actions.insert(0, Action::CommitInfo(commit_info)); for txn in &app_transactions { actions.push(Action::Txn(txn.clone())); @@ -244,29 +263,156 @@ impl CommitData { Self { actions, operation } } - pub fn get_bytes(&self) -> Result { - // Write newline-delimited JSON without building intermediate Strings. 
- let mut buf: Vec = Vec::new(); - for (i, action) in self.actions.iter().enumerate() { - if i > 0 { - buf.push(b'\n'); - } - serde_json::to_writer(&mut buf, action) - .map_err(|e| TransactionError::SerializeLogJson { json_err: e })?; - } - Ok(Bytes::from(buf)) + fn commit_info(&self) -> Option<&crate::spec::CommitInfo> { + self.actions.iter().find_map(|action| match action { + Action::CommitInfo(info) => Some(info), + _ => None, + }) + } + + fn version_checksum_txn_id(&self) -> Option { + self.commit_info().and_then(|info| { + info.info + .get("txnId") + .and_then(Value::as_str) + .map(str::to_owned) + }) + } + + fn version_checksum_in_commit_timestamp(&self) -> Option { + self.commit_info().and_then(|info| info.in_commit_timestamp) } fn is_blind_append(actions: &[Action], operation: &DeltaOperation) -> bool { match operation { - DeltaOperation::Write { predicate, .. } if predicate.is_none() => actions - .iter() - .all(|action| matches!(action, Action::Add(_) | Action::Txn(_))), + DeltaOperation::Write { predicate, .. 
} if predicate.is_none() => { + actions.iter().all(|action| { + matches!( + action, + Action::Add(_) | Action::Txn(_) | Action::CommitInfo(_) + ) + }) + } _ => false, } } } +async fn write_tmp_commit(log_entry: Bytes, store: ObjectStoreRef) -> DeltaResult { + let token = uuid::Uuid::new_v4().to_string(); + let path = temp_commit_path(&token); + store.put(&path, log_entry.into()).await?; + Ok(CommitOrBytes::TmpCommit(path)) +} + +async fn prepare_commit_or_bytes( + log_store: &LogStoreRef, + operation_id: Uuid, + actions: &[Action], +) -> DeltaResult { + let log_entry = actions_to_log_bytes(actions)?; + if ["LakeFSLogStore", "DefaultLogStore"].contains(&log_store.name().as_str()) { + Ok(CommitOrBytes::LogBytes(log_entry)) + } else { + write_tmp_commit(log_entry, log_store.object_store(Some(operation_id))).await + } +} + +async fn previous_effective_commit_timestamp( + log_store: &LogStoreRef, + snapshot: Option<&Arc>, +) -> DeltaResult> { + let Some(snapshot) = snapshot else { + return Ok(None); + }; + resolve_version_timestamp( + log_store.as_ref(), + snapshot.version(), + snapshot.version_timestamp(snapshot.version()), + snapshot.protocol(), + snapshot.metadata(), + ) + .await + .map(Some) +} + +fn finalized_commit_info(actions: &mut [Action]) -> &mut crate::spec::CommitInfo { + match actions.first_mut() { + Some(Action::CommitInfo(info)) => info, + _ => unreachable!("commit actions must be normalized with commitInfo at index 0"), + } +} + +fn finalize_attempt_actions( + base_actions: &[Action], + read_snapshot: Option<&Arc>, + version: i64, + previous_commit_timestamp: Option, + now_ms: i64, +) -> DeltaResult> { + let mut finalized_actions = base_actions.to_vec(); + let old_in_commit_timestamps_enabled = read_snapshot + .map(|snapshot| snapshot.in_commit_timestamps_enabled()) + .unwrap_or(false); + let effective_protocol_and_metadata = resolve_effective_protocol_and_metadata( + read_snapshot.map(|snapshot| snapshot.protocol()), + read_snapshot.map(|snapshot| 
snapshot.metadata()), + &finalized_actions, + ); + let new_in_commit_timestamps_enabled = effective_protocol_and_metadata + .as_ref() + .map(|(protocol, metadata)| { + let table_properties = + crate::spec::TableProperties::from(metadata.configuration().iter()); + protocol.is_in_commit_timestamps_enabled(&table_properties) + }) + .unwrap_or(false); + let mut in_commit_timestamp = None; + + { + let commit_info = finalized_commit_info(&mut finalized_actions); + commit_info.timestamp = Some(now_ms); + commit_info.info.remove("inCommitTimestamp"); + if new_in_commit_timestamps_enabled { + let min_timestamp = previous_commit_timestamp + .map(|timestamp| timestamp.saturating_add(1)) + .unwrap_or(now_ms); + in_commit_timestamp = Some(now_ms.max(min_timestamp)); + commit_info.in_commit_timestamp = in_commit_timestamp; + } else { + commit_info.in_commit_timestamp = None; + } + } + + if read_snapshot.is_some() + && !old_in_commit_timestamps_enabled + && new_in_commit_timestamps_enabled + { + if let Some(in_commit_timestamp) = in_commit_timestamp { + if let Some(metadata) = finalized_actions + .iter_mut() + .find_map(|action| match action { + Action::Metadata(metadata) => Some(metadata), + _ => None, + }) + { + *metadata = metadata + .clone() + .add_config_key( + "delta.inCommitTimestampEnablementVersion".to_string(), + version.to_string(), + ) + .add_config_key( + "delta.inCommitTimestampEnablementTimestamp".to_string(), + in_commit_timestamp.to_string(), + ); + } + } + } + + Ok(finalized_actions) +} + #[async_trait] pub trait CustomExecuteHandler: Send + Sync { async fn before_post_commit_hook( @@ -284,58 +430,6 @@ pub trait CustomExecuteHandler: Send + Sync { ) -> DeltaResult<()>; } -/// Reference to some structure that contains mandatory attributes for performing a commit. 
-pub trait TableReference: Send + Sync { - /// Well known table configuration - fn config(&self) -> &TableProperties; - - /// Get the table protocol of the snapshot - fn protocol(&self) -> &Protocol; - - /// Get the table metadata of the snapshot - #[expect(dead_code)] - fn metadata(&self) -> &Metadata; - - /// Try to cast this table reference to a `EagerSnapshot` - fn eager_snapshot(&self) -> &EagerSnapshot; -} - -impl TableReference for EagerSnapshot { - fn protocol(&self) -> &Protocol { - EagerSnapshot::protocol(self) - } - - fn metadata(&self) -> &Metadata { - EagerSnapshot::metadata(self) - } - - fn config(&self) -> &TableProperties { - self.table_properties() - } - - fn eager_snapshot(&self) -> &EagerSnapshot { - self - } -} - -impl TableReference for DeltaTableState { - fn config(&self) -> &TableProperties { - self.table_properties() - } - - fn protocol(&self) -> &Protocol { - EagerSnapshot::protocol(self) - } - - fn metadata(&self) -> &Metadata { - EagerSnapshot::metadata(self) - } - - fn eager_snapshot(&self) -> &EagerSnapshot { - self - } -} - #[derive(Clone, Debug, Copy)] /// Properties for post commit hook. 
pub struct PostCommitHookProperties { @@ -465,7 +559,7 @@ impl Default for CommitBuilder { } } -impl<'a> CommitBuilder { +impl CommitBuilder { /// Actions to be included in the commit pub fn with_actions(mut self, actions: Vec) -> Self { self.actions = actions; @@ -508,10 +602,10 @@ impl<'a> CommitBuilder { /// Prepare a Commit operation using the configured builder pub fn build( self, - table_data: Option<&'a dyn TableReference>, + table_data: Option>, log_store: LogStoreRef, operation: DeltaOperation, - ) -> PreCommit<'a> { + ) -> PreCommit { let data = CommitData::new( self.actions, operation, @@ -532,9 +626,9 @@ impl<'a> CommitBuilder { } /// Represents a commit that has not yet started but all details are finalized -pub struct PreCommit<'a> { +pub struct PreCommit { log_store: LogStoreRef, - table_data: Option<&'a dyn TableReference>, + table_data: Option>, data: CommitData, max_retries: usize, post_commit_hook: Option, @@ -542,55 +636,31 @@ pub struct PreCommit<'a> { operation_id: Uuid, } -impl<'a> std::future::IntoFuture for PreCommit<'a> { +impl std::future::IntoFuture for PreCommit { type Output = DeltaResult; - type IntoFuture = BoxFuture<'a, Self::Output>; + type IntoFuture = BoxFuture<'static, Self::Output>; fn into_future(self) -> Self::IntoFuture { Box::pin(async move { self.into_prepared_commit_future().await?.await?.await }) } } -impl<'a> PreCommit<'a> { +impl PreCommit { /// Prepare the commit but do not finalize it - pub fn into_prepared_commit_future(self) -> BoxFuture<'a, DeltaResult>> { + pub fn into_prepared_commit_future(self) -> BoxFuture<'static, DeltaResult> { let this = self; - // Write delta log entry as temporary file to storage. For the actual commit, - // the temporary file is moved (atomic rename) to the delta log folder within `commit` function. 
- async fn write_tmp_commit( - log_entry: Bytes, - store: ObjectStoreRef, - ) -> DeltaResult { - let token = uuid::Uuid::new_v4().to_string(); - let path = Path::from_iter([DELTA_LOG_FOLDER, &format!("_commit_{token}.json.tmp")]); - store.put(&path, log_entry.into()).await?; - Ok(CommitOrBytes::TmpCommit(path)) - } - Box::pin(async move { let local_actions: Vec<_> = this.data.actions.to_vec(); - if let Some(table_reference) = this.table_data { - PROTOCOL.can_commit(table_reference, &local_actions, &this.data.operation)?; + if let Some(table_reference) = &this.table_data { + PROTOCOL.can_commit( + table_reference.as_ref(), + &local_actions, + &this.data.operation, + )?; } - let log_entry = this.data.get_bytes()?; - - // With the DefaultLogStore & LakeFSLogstore, we just pass the bytes around, since we use conditionalPuts - // Other stores will use tmp_commits - let commit_or_bytes = if ["LakeFSLogStore", "DefaultLogStore"] - .contains(&this.log_store.name().as_str()) - { - CommitOrBytes::LogBytes(log_entry) - } else { - write_tmp_commit( - log_entry, - this.log_store.object_store(Some(this.operation_id)), - ) - .await? 
- }; Ok(PreparedCommit { - commit_or_bytes, log_store: this.log_store, table_data: this.table_data, max_retries: this.max_retries, @@ -604,11 +674,10 @@ impl<'a> PreCommit<'a> { } /// Represents a inflight commit -pub struct PreparedCommit<'a> { - commit_or_bytes: CommitOrBytes, +pub struct PreparedCommit { log_store: LogStoreRef, data: CommitData, - table_data: Option<&'a dyn TableReference>, + table_data: Option>, max_retries: usize, post_commit: Option, post_commit_hook_handler: Option>, @@ -622,15 +691,14 @@ pub struct PreparedCommit<'a> { // } // } -impl<'a> std::future::IntoFuture for PreparedCommit<'a> { +impl std::future::IntoFuture for PreparedCommit { type Output = DeltaResult; - type IntoFuture = BoxFuture<'a, Self::Output>; + type IntoFuture = BoxFuture<'static, Self::Output>; fn into_future(self) -> Self::IntoFuture { let this = self; Box::pin(async move { - let mut commit_or_bytes = this.commit_or_bytes; let mut local_actions: Vec<_> = this.data.actions.to_vec(); let creation_intent = this.table_data.is_none(); let creation_protocol = local_actions.iter().find_map(|a| match a { @@ -655,34 +723,27 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { }; let total_retries = effective_max_retries + 1; - let mut read_snapshot: Option = this - .table_data - .map(|table_ref| table_ref.eager_snapshot().clone()); + let mut read_snapshot: Option> = this.table_data.clone(); let mut creation_actions_stripped = false; for attempt_number in 1..=total_retries { let snapshot_version = read_snapshot.as_ref().map(|s| s.version()).unwrap_or(-1); let latest_version = match this.log_store.get_latest_version(snapshot_version).await { Ok(v) => Some(v), - Err(DeltaError::Kernel(KernelError::MissingVersion)) => None, + Err(DeltaError::MissingVersion) => None, Err(err) => return Err(err), }; if let Some(latest_version) = latest_version { - // Ensure we have a snapshot aligned to the latest version. 
- if read_snapshot - .as_ref() - .map(|s| s.version() < latest_version) - .unwrap_or(true) - { - let snapshot = EagerSnapshot::try_new( + if read_snapshot.is_none() { + let snapshot = DeltaSnapshot::try_new( this.log_store.as_ref(), Default::default(), Some(latest_version), + None, ) .await?; - - read_snapshot = Some(snapshot); + read_snapshot = Some(Arc::new(snapshot)); } if let Some(snapshot) = &read_snapshot { @@ -692,7 +753,7 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { if let Some(txn_protocol) = creation_protocol.as_ref() { if txn_protocol != snapshot.protocol() { return Err(TransactionError::CommitConflict( - conflict_checker::CommitConflictError::ProtocolChanged( + CommitConflictError::ProtocolChanged( "protocol changed".into(), ), ) @@ -712,7 +773,7 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { }); if !metadata_compatible { return Err(TransactionError::CommitConflict( - conflict_checker::CommitConflictError::MetadataChanged, + CommitConflictError::MetadataChanged, ) .into()); } @@ -731,16 +792,6 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { info.is_blind_append = Some(true); } } - - let mut jsons = Vec::::new(); - for action in &local_actions { - let json = serde_json::to_string(action).map_err(|e| { - TransactionError::SerializeLogJson { json_err: e } - })?; - jsons.push(json); - } - commit_or_bytes = - CommitOrBytes::LogBytes(Bytes::from(jsons.join("\n"))); creation_actions_stripped = true; } } @@ -768,7 +819,7 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { ) .await?; let transaction_info = TransactionInfo::try_new( - snapshot.log_data(), + snapshot, &local_actions, this.data.operation.read_whole_table(), )?; @@ -787,7 +838,7 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { } // Update snapshot to latest version after successful conflict check if let Some(snapshot) = &mut read_snapshot { - snapshot + Arc::make_mut(snapshot) .update(this.log_store.as_ref(), Some(latest_version as 
u64)) .await?; } @@ -795,6 +846,19 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { } } let version: i64 = latest_version.map(|v| v + 1).unwrap_or(0); + let previous_commit_timestamp = + previous_effective_commit_timestamp(&this.log_store, read_snapshot.as_ref()) + .await?; + let finalized_actions = finalize_attempt_actions( + &local_actions, + read_snapshot.as_ref(), + version, + previous_commit_timestamp, + Utc::now().timestamp_millis(), + )?; + let commit_or_bytes = + prepare_commit_or_bytes(&this.log_store, this.operation_id, &finalized_actions) + .await?; match this .log_store @@ -804,7 +868,10 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { Ok(()) => { return Ok(PostCommit { version, - data: this.data, + data: CommitData { + actions: finalized_actions, + operation: this.data.operation.clone(), + }, create_checkpoint: this .post_commit .map(|v| v.create_checkpoint) @@ -814,8 +881,7 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { .map(|v| v.cleanup_expired_logs) .unwrap_or_default(), log_store: this.log_store, - table_data: read_snapshot - .map(|snapshot| Box::new(snapshot) as Box), + table_data: read_snapshot, custom_execute_handler: this.post_commit_hook_handler, metrics: CommitMetrics { num_retries: attempt_number as u64 - 1, @@ -823,6 +889,9 @@ impl<'a> std::future::IntoFuture for PreparedCommit<'a> { }); } Err(TransactionError::VersionAlreadyExists(version)) => { + this.log_store + .abort_commit_entry(version, commit_or_bytes, this.operation_id) + .await?; error!("The transaction {version} already exists, will retry!"); continue; } @@ -845,63 +914,144 @@ pub struct PostCommit { /// The winning version number of the commit pub version: i64, /// The data that was committed to the log store - #[expect(unused)] pub data: CommitData, create_checkpoint: bool, cleanup_expired_logs: Option, log_store: LogStoreRef, - table_data: Option>, + table_data: Option>, custom_execute_handler: Option>, metrics: CommitMetrics, } impl 
PostCommit { + async fn write_version_checksum(&self, table_state: &DeltaSnapshot, operation_id: Uuid) { + if !table_state.table_properties().write_checksum_file_enabled() { + debug!( + "Skipping version checksum for version {} because delta.writeChecksumFile.enabled=false", + self.version + ); + return; + } + + let checksum = match table_state.build_version_checksum( + self.data.version_checksum_txn_id(), + self.data.version_checksum_in_commit_timestamp(), + ) { + Ok(Some(checksum)) => checksum, + Ok(None) => return, + Err(err) => { + warn!( + "Failed to build version checksum for version {}: {}", + self.version, err + ); + return; + } + }; + + let crc_path = checksum_path(self.version); + let checksum_bytes = match serde_json::to_vec(&checksum) { + Ok(bytes) => bytes, + Err(err) => { + warn!( + "Failed to serialize version checksum for version {}: {}", + self.version, err + ); + return; + } + }; + + let put_result = self + .log_store + .object_store(Some(operation_id)) + .put_opts( + &crc_path, + Bytes::from(checksum_bytes).into(), + PutOptions { + mode: PutMode::Create, + ..Default::default() + }, + ) + .await; + + match put_result { + Ok(_) => { + debug!( + "Wrote version checksum for version {} to {}", + self.version, crc_path + ); + } + Err(ObjectStoreError::AlreadyExists { .. }) => { + warn!( + "Version checksum already exists for version {} at {}", + self.version, crc_path + ); + } + Err(err) => { + warn!( + "Failed to write version checksum for version {} to {}: {}", + self.version, crc_path, err + ); + } + } + } + /// Runs the post commit activities - async fn run_post_commit_hook(&self) -> DeltaResult<(DeltaTableState, PostCommitMetrics)> { + async fn run_post_commit_hook(&self) -> DeltaResult<(Arc, PostCommitMetrics)> { let post_commit_operation_id = Uuid::new_v4(); // Always construct a state for the committed version so checkpoint + cleanup can run // even when `table_data` isn't available (e.g. planner didn't provide a snapshot). 
- let mut state = if let Some(table) = &self.table_data { - let mut snapshot = table.eager_snapshot().clone(); + let mut state = if let Some(snapshot) = &self.table_data { + let mut snapshot = Arc::clone(snapshot); if self.version != snapshot.version() { - snapshot + Arc::make_mut(&mut snapshot) .update(self.log_store.as_ref(), Some(self.version as u64)) .await?; } - DeltaTableState { snapshot } + snapshot } else { - DeltaTableState::try_new( - self.log_store.as_ref(), - Default::default(), - Some(self.version), + Arc::new( + DeltaSnapshot::try_new( + self.log_store.as_ref(), + Default::default(), + Some(self.version), + None, + ) + .await?, ) - .await? }; + self.write_version_checksum(state.as_ref(), post_commit_operation_id) + .await; + let cleanup_logs = if let Some(cleanup_logs) = self.cleanup_expired_logs { cleanup_logs } else { state.table_properties().enable_expired_log_cleanup() }; + let will_create_checkpoint = self.create_checkpoint + && should_create_checkpoint( + self.version, + state.table_properties().checkpoint_interval().get() as i64, + ); // Run arbitrary before_post_commit_hook code if let Some(custom_execute_handler) = &self.custom_execute_handler { custom_execute_handler .before_post_commit_hook( &self.log_store, - cleanup_logs || self.create_checkpoint, + will_create_checkpoint, post_commit_operation_id, ) .await? 
} let mut new_checkpoint_created = false; - if self.create_checkpoint { + if will_create_checkpoint { // Execute create checkpoint hook new_checkpoint_created = self .create_checkpoint( - &state, + state.as_ref(), &self.log_store, self.version, post_commit_operation_id, @@ -910,26 +1060,32 @@ impl PostCommit { } let mut num_log_files_cleaned_up: u64 = 0; - if cleanup_logs { + if cleanup_logs && new_checkpoint_created { + let retention_millis = state + .table_properties() + .log_retention_duration() + .as_millis() as i64; + let cutoff_timestamp = (Utc::now().timestamp_millis() - retention_millis) + .div_euclid(24 * 60 * 60 * 1000) + * (24 * 60 * 60 * 1000); // Execute clean up logs hook - num_log_files_cleaned_up = cleanup_expired_logs_for( - self.version, + num_log_files_cleaned_up = cleanup_expired_delta_log_files( + state.as_ref(), self.log_store.as_ref(), - Utc::now().timestamp_millis() - - state - .table_properties() - .log_retention_duration() - .as_millis() as i64, + cutoff_timestamp, Some(post_commit_operation_id), ) .await? as u64; if num_log_files_cleaned_up > 0 { - state = DeltaTableState::try_new( - self.log_store.as_ref(), - state.load_config().clone(), - Some(self.version), - ) - .await?; + state = Arc::new( + DeltaSnapshot::try_new( + self.log_store.as_ref(), + state.load_config().clone(), + Some(self.version), + None, + ) + .await?, + ); } } @@ -938,7 +1094,7 @@ impl PostCommit { custom_execute_handler .after_post_commit_hook( &self.log_store, - cleanup_logs || self.create_checkpoint, + new_checkpoint_created, post_commit_operation_id, ) .await? 
@@ -954,7 +1110,7 @@ impl PostCommit { } async fn create_checkpoint( &self, - table_state: &DeltaTableState, + table_state: &DeltaSnapshot, log_store: &LogStoreRef, version: i64, operation_id: Uuid, @@ -966,9 +1122,8 @@ impl PostCommit { debug!("table_state.load_config().require_files=false; creating checkpoint via kernel snapshot anyway"); } - let checkpoint_interval = table_state.config().checkpoint_interval().get() as i64; - // TODO: SQL `TBLPROPERTIES(delta.checkpointInterval)` isn't plumbed into `metaData.configuration` yet. - if version >= 0 && (version % checkpoint_interval) == 0 { + let checkpoint_interval = table_state.table_properties().checkpoint_interval().get() as i64; + if should_create_checkpoint(version, checkpoint_interval) { info!("Creating checkpoint for version {version}"); create_checkpoint_for(version, log_store.as_ref(), operation_id).await?; Ok(true) @@ -978,10 +1133,14 @@ impl PostCommit { } } +fn should_create_checkpoint(version: i64, checkpoint_interval: i64) -> bool { + version != 0 && version % checkpoint_interval == 0 +} + /// A commit that successfully completed pub struct FinalizedCommit { /// The new table state after a commit - pub snapshot: DeltaTableState, + pub snapshot: Arc, /// Version of the finalized commit pub version: i64, @@ -992,7 +1151,7 @@ pub struct FinalizedCommit { impl FinalizedCommit { /// The new table state after a commit #[expect(dead_code)] - pub fn snapshot(&self) -> DeltaTableState { + pub fn snapshot(&self) -> Arc { self.snapshot.clone() } /// Version of the finalized commit @@ -1025,3 +1184,219 @@ impl std::future::IntoFuture for PostCommit { }) } } + +#[cfg(test)] +#[expect(clippy::unwrap_used)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use object_store::memory::InMemory; + use object_store::ObjectStore; + use url::Url; + + use super::*; + use crate::schema::protocol_for_create; + use crate::spec::{ + checksum_path, Action, CommitInfo, DataType, DeltaError, Metadata, 
SaveMode, StructField, + StructType, VersionChecksum, + }; + use crate::storage::{default_logstore, get_actions, StorageConfig}; + + fn test_log_store(store: Arc) -> LogStoreRef { + default_logstore( + store.clone(), + store, + &Url::parse("memory:///").unwrap(), + &StorageConfig, + ) + } + + fn test_metadata( + configuration: impl IntoIterator, + ) -> Metadata { + Metadata::try_new( + None, + None, + StructType::try_new([StructField::not_null("id", DataType::LONG)]).unwrap(), + Vec::new(), + 0, + configuration + .into_iter() + .map(|(key, value)| (key.to_string(), value.to_string())) + .collect(), + ) + .unwrap() + } + + async fn read_commit_actions(log_store: &LogStoreRef, version: i64) -> Vec { + let bytes = log_store.read_commit_entry(version).await.unwrap().unwrap(); + get_actions(version, &bytes).unwrap() + } + + async fn read_version_checksum(log_store: &LogStoreRef, version: i64) -> VersionChecksum { + let bytes = log_store + .object_store(None) + .get(&checksum_path(version)) + .await + .unwrap() + .bytes() + .await + .unwrap(); + serde_json::from_slice(&bytes).unwrap() + } + + fn commit_info(actions: &[Action]) -> DeltaResult<&CommitInfo> { + match actions.first() { + Some(Action::CommitInfo(info)) => Ok(info), + _ => Err(DeltaError::generic("expected commitInfo action at index 0")), + } + } + + #[tokio::test] + async fn commit_writes_commit_info_first_monotonic_ict_and_checksum() -> DeltaResult<()> { + let store: Arc = Arc::new(InMemory::new()); + let log_store = test_log_store(store); + let protocol = protocol_for_create(false, false, true)?; + let metadata = test_metadata([("delta.enableInCommitTimestamps", "true")]); + + let created = CommitBuilder::default() + .with_actions(vec![ + Action::Protocol(protocol.clone()), + Action::Metadata(metadata.clone()), + ]) + .build( + None, + log_store.clone(), + DeltaOperation::Create { + mode: SaveMode::ErrorIfExists, + location: "memory:///".to_string(), + protocol: Box::new(protocol), + metadata: 
Box::new(metadata), + }, + ) + .await?; + let first_actions = read_commit_actions(&log_store, 0).await; + let first_commit_info = commit_info(&first_actions)?; + let first_ict = first_commit_info.in_commit_timestamp.ok_or_else(|| { + DeltaError::generic("ICT-enabled create commit should write inCommitTimestamp") + })?; + assert!(matches!(first_actions.first(), Some(Action::CommitInfo(_)))); + assert_eq!( + read_version_checksum(&log_store, 0) + .await + .in_commit_timestamp_opt, + Some(first_ict) + ); + + let appended = CommitBuilder::default() + .with_actions(vec![]) + .build( + Some(created.snapshot.clone()), + log_store.clone(), + DeltaOperation::Write { + mode: SaveMode::Append, + partition_by: None, + predicate: None, + }, + ) + .await?; + let second_actions = read_commit_actions(&log_store, appended.version).await; + let second_commit_info = commit_info(&second_actions)?; + let second_ict = second_commit_info.in_commit_timestamp.ok_or_else(|| { + DeltaError::generic("ICT-enabled append commit should write inCommitTimestamp") + })?; + + assert!(matches!( + second_actions.first(), + Some(Action::CommitInfo(_)) + )); + assert!(second_ict > first_ict); + assert_eq!( + read_version_checksum(&log_store, appended.version) + .await + .in_commit_timestamp_opt, + Some(second_ict) + ); + Ok(()) + } + + #[tokio::test] + async fn finalize_attempt_actions_backfills_enablement_metadata() -> DeltaResult<()> { + let store: Arc = Arc::new(InMemory::new()); + let log_store = test_log_store(store); + let protocol = protocol_for_create(false, false, false)?; + let metadata = test_metadata([]); + let created = CommitBuilder::default() + .with_actions(vec![ + Action::Protocol(protocol.clone()), + Action::Metadata(metadata.clone()), + ]) + .build( + None, + log_store.clone(), + DeltaOperation::Create { + mode: SaveMode::ErrorIfExists, + location: "memory:///".to_string(), + protocol: Box::new(protocol), + metadata: Box::new(metadata), + }, + ) + .await?; + let previous_timestamp = 
created.snapshot.version_timestamp(0).ok_or_else(|| { + DeltaError::generic("non-ICT tables still track pre-enable commit timestamps") + })?; + + let upgrade_protocol = protocol_for_create(false, false, true)?; + let upgrade_metadata = test_metadata([("delta.enableInCommitTimestamps", "true")]); + let base_actions = CommitData::new( + vec![ + Action::Protocol(upgrade_protocol), + Action::Metadata(upgrade_metadata), + ], + DeltaOperation::Write { + mode: SaveMode::Append, + partition_by: None, + predicate: None, + }, + HashMap::new(), + OperationMetrics::default(), + vec![], + ) + .actions; + + let finalized_actions = finalize_attempt_actions( + &base_actions, + Some(&created.snapshot), + 1, + Some(previous_timestamp), + previous_timestamp.saturating_sub(10), + )?; + let commit_info = commit_info(&finalized_actions)?; + let upgrade_timestamp = commit_info + .in_commit_timestamp + .ok_or_else(|| DeltaError::generic("upgrade commit should assign inCommitTimestamp"))?; + assert_eq!(upgrade_timestamp, previous_timestamp + 1); + + let metadata = finalized_actions + .iter() + .find_map(|action| match action { + Action::Metadata(metadata) => Some(metadata), + _ => None, + }) + .ok_or_else(|| DeltaError::generic("upgrade commit should keep metadata action"))?; + assert_eq!( + metadata + .configuration() + .get("delta.inCommitTimestampEnablementVersion"), + Some(&"1".to_string()) + ); + assert_eq!( + metadata + .configuration() + .get("delta.inCommitTimestampEnablementTimestamp"), + Some(&upgrade_timestamp.to_string()) + ); + Ok(()) + } +} diff --git a/crates/sail-delta-lake/src/kernel/transaction/protocol.rs b/crates/sail-delta-lake/src/kernel/transaction/protocol.rs index a2f26ce256..5764f67a0f 100644 --- a/crates/sail-delta-lake/src/kernel/transaction/protocol.rs +++ b/crates/sail-delta-lake/src/kernel/transaction/protocol.rs @@ -21,13 +21,11 @@ use std::collections::HashSet; use std::sync::LazyLock; -use delta_kernel::table_features::TableFeature; - -use 
super::{TableReference, TransactionError}; -use crate::kernel::models::{contains_timestampntz, Action, Protocol, Schema}; -use crate::kernel::snapshot::EagerSnapshot; -use crate::kernel::{DeltaOperation, TablePropertiesExt}; -use crate::table::DeltaTableState; +use crate::kernel::DeltaOperation; +use crate::spec::{ + contains_timestampntz, Action, Protocol, Schema, TableFeature, TransactionError, +}; +use crate::table::DeltaSnapshot; static READER_V2: LazyLock> = LazyLock::new(|| HashSet::from_iter([TableFeature::ColumnMapping])); @@ -98,7 +96,7 @@ impl ProtocolChecker { /// Check append-only at the high level (operation level) #[expect(unused)] - pub fn check_append_only(&self, snapshot: &EagerSnapshot) -> Result<(), TransactionError> { + pub fn check_append_only(&self, snapshot: &DeltaSnapshot) -> Result<(), TransactionError> { if snapshot.table_properties().append_only() { return Err(TransactionError::DeltaTableAppendOnly); } @@ -109,7 +107,7 @@ impl ProtocolChecker { #[expect(unused)] pub fn check_can_write_timestamp_ntz( &self, - snapshot: &DeltaTableState, + snapshot: &DeltaSnapshot, schema: &Schema, ) -> Result<(), TransactionError> { let contains_timestampntz = contains_timestampntz(schema.fields()); @@ -136,7 +134,7 @@ impl ProtocolChecker { } /// Check if delta-rs can read form the given delta table. 
- pub fn can_read_from(&self, snapshot: &dyn TableReference) -> Result<(), TransactionError> { + pub fn can_read_from(&self, snapshot: &DeltaSnapshot) -> Result<(), TransactionError> { self.can_read_from_protocol(snapshot.protocol()) } @@ -144,8 +142,14 @@ impl ProtocolChecker { let required_features: Option> = match protocol.min_reader_version() { 0 | 1 => None, 2 => Some(READER_V2.clone()), - // _ => protocol.reader_features_set(), - _ => Some(HashSet::new()), + _ => Some( + protocol + .reader_features() + .unwrap_or_default() + .iter() + .cloned() + .collect(), + ), }; if let Some(features) = required_features { let mut diff = features.difference(&self.reader_features).peekable(); @@ -159,7 +163,7 @@ impl ProtocolChecker { } /// Check if delta-rs can write to the given delta table. - pub fn can_write_to(&self, snapshot: &dyn TableReference) -> Result<(), TransactionError> { + pub fn can_write_to(&self, snapshot: &DeltaSnapshot) -> Result<(), TransactionError> { // NOTE: writers must always support all required reader features self.can_read_from(snapshot)?; let min_writer_version = snapshot.protocol().min_writer_version(); @@ -171,8 +175,15 @@ impl ProtocolChecker { 4 => Some(WRITER_V4.clone()), 5 => Some(WRITER_V5.clone()), 6 => Some(WRITER_V6.clone()), - // _ => snapshot.protocol().writer_features_set(), - _ => Some(HashSet::new()), + _ => Some( + snapshot + .protocol() + .writer_features() + .unwrap_or_default() + .iter() + .cloned() + .collect(), + ), }; if let Some(features) = required_features { @@ -188,7 +199,7 @@ impl ProtocolChecker { pub fn can_commit( &self, - snapshot: &dyn TableReference, + snapshot: &DeltaSnapshot, actions: &[Action], operation: &DeltaOperation, ) -> Result<(), TransactionError> { @@ -198,16 +209,12 @@ impl ProtocolChecker { let append_only_enabled = if snapshot.protocol().min_writer_version() < 2 { false } else if snapshot.protocol().min_writer_version() < 7 { - snapshot.config().append_only() + 
snapshot.table_properties().append_only() } else { snapshot .protocol() - .writer_features() - .ok_or(TransactionError::TableFeaturesRequired( - TableFeature::AppendOnly, - ))? - .contains(&TableFeature::AppendOnly) - && snapshot.config().append_only() + .has_writer_feature(&TableFeature::AppendOnly) + && snapshot.table_properties().append_only() }; if append_only_enabled { match operation { @@ -241,6 +248,7 @@ pub static INSTANCE: LazyLock = LazyLock::new(|| { let mut writer_features = HashSet::new(); writer_features.insert(TableFeature::AppendOnly); + writer_features.insert(TableFeature::InCommitTimestamp); writer_features.insert(TableFeature::TimestampWithoutTimezone); { writer_features.insert(TableFeature::ChangeDataFeed); diff --git a/crates/sail-delta-lake/src/lib.rs b/crates/sail-delta-lake/src/lib.rs index 59095ecff6..9e4aa5ff30 100644 --- a/crates/sail-delta-lake/src/lib.rs +++ b/crates/sail-delta-lake/src/lib.rs @@ -12,7 +12,7 @@ pub mod conversion; pub mod datasource; -pub mod error; +mod delta_log; mod kernel; pub mod logical; pub mod operations; @@ -21,6 +21,8 @@ pub mod physical; pub mod physical_plan; pub mod planner; pub mod schema; +pub mod session_extension; +pub mod spec; pub mod storage; pub mod table; pub mod table_format; @@ -32,7 +34,7 @@ use sail_physical_plan::{register_format_type, FormatTag}; pub use table::{create_delta_provider, create_delta_source}; pub use table_format::DeltaTableFormat; -pub use crate::error::{DeltaError, DeltaError as DeltaTableError, DeltaResult, KernelError}; +pub use crate::spec::{DeltaError, DeltaError as DeltaTableError, DeltaResult}; static INIT: Once = Once::new(); @@ -41,6 +43,7 @@ pub fn init_delta_types() { let _ = register_format_type::(FormatTag::Delta); let _ = register_format_type::(FormatTag::Delta); let _ = register_format_type::(FormatTag::Delta); + let _ = register_format_type::(FormatTag::Delta); let _ = register_format_type::(FormatTag::Delta); let _ = register_format_type::(FormatTag::Delta); 
let _ = register_format_type::(FormatTag::Delta); diff --git a/crates/sail-delta-lake/src/logical/handle.rs b/crates/sail-delta-lake/src/logical/handle.rs index ea2f0be55b..51ab33b7b8 100644 --- a/crates/sail-delta-lake/src/logical/handle.rs +++ b/crates/sail-delta-lake/src/logical/handle.rs @@ -6,13 +6,13 @@ use datafusion::arrow::datatypes::SchemaRef; use crate::datasource::DeltaScanConfig; use crate::storage::LogStoreRef; -use crate::table::DeltaTableState; +use crate::table::DeltaSnapshot; #[derive(Clone, Debug)] pub struct DeltaTableHandle(Arc); pub struct DeltaTableHandleInner { - pub snapshot: DeltaTableState, + pub snapshot: Arc, pub log_store: LogStoreRef, pub config: DeltaScanConfig, pub schema: SchemaRef, diff --git a/crates/sail-delta-lake/src/logical/table_source.rs b/crates/sail-delta-lake/src/logical/table_source.rs index f4760089c3..cfd41d195b 100644 --- a/crates/sail-delta-lake/src/logical/table_source.rs +++ b/crates/sail-delta-lake/src/logical/table_source.rs @@ -1,4 +1,5 @@ use std::any::Any; +use std::sync::Arc; use datafusion::arrow::datatypes::SchemaRef; use datafusion::common::Result; @@ -6,7 +7,7 @@ use datafusion::logical_expr::{Expr, TableProviderFilterPushDown, TableSource}; use crate::datasource::{df_logical_schema, get_pushdown_filters, DeltaScanConfig}; use crate::storage::LogStoreRef; -use crate::table::DeltaTableState; +use crate::table::DeltaSnapshot; use crate::DeltaResult; /// Logical-only Delta table source used in DataFusion logical plans. @@ -15,7 +16,7 @@ use crate::DeltaResult; /// behavior. Physical planning is handled by rewriting scans to an extension node. 
#[derive(Clone)] pub struct DeltaTableSource { - snapshot: DeltaTableState, + snapshot: Arc, log_store: LogStoreRef, config: DeltaScanConfig, schema: SchemaRef, @@ -41,12 +42,12 @@ impl std::fmt::Debug for DeltaTableSource { impl DeltaTableSource { pub fn try_new( - snapshot: DeltaTableState, + snapshot: Arc, log_store: LogStoreRef, config: DeltaScanConfig, ) -> DeltaResult { let schema = df_logical_schema( - &snapshot, + snapshot.as_ref(), &config.file_column_name, &config.commit_version_column_name, &config.commit_timestamp_column_name, @@ -60,11 +61,7 @@ impl DeltaTableSource { }) } - pub fn try_with_config(&self, config: DeltaScanConfig) -> DeltaResult { - Self::try_new(self.snapshot.clone(), self.log_store.clone(), config) - } - - pub fn snapshot(&self) -> &DeltaTableState { + pub fn snapshot(&self) -> &Arc { &self.snapshot } diff --git a/crates/sail-delta-lake/src/operations/write/partitioning.rs b/crates/sail-delta-lake/src/operations/write/partitioning.rs index e7be00b9e6..ff5e3acdd5 100644 --- a/crates/sail-delta-lake/src/operations/write/partitioning.rs +++ b/crates/sail-delta-lake/src/operations/write/partitioning.rs @@ -1,17 +1,17 @@ use datafusion::arrow::array::{ArrayRef, RecordBatch}; use datafusion::arrow::row::{RowConverter, SortField}; -use delta_kernel::expressions::Scalar; +use datafusion::common::scalar::ScalarValue; use indexmap::IndexMap; -use crate::kernel::models::ScalarExt; -use crate::kernel::DeltaTableError; +use crate::conversion::ScalarExt; +use crate::spec::DeltaError as DeltaTableError; /// A contiguous range of rows that share the same partition values. #[derive(Debug)] pub struct PartitionRange { pub start: usize, pub end: usize, - pub partition_values: IndexMap, + pub partition_values: IndexMap, } /// Detect contiguous partition ranges from an input batch. 
@@ -111,7 +111,7 @@ fn push_partition_range( .iter() .map(|&idx| { let col = values.column(idx); - Scalar::from_array(&col.slice(start, 1), 0) + ScalarValue::from_array(col.as_ref(), start) .ok_or_else(|| DeltaTableError::generic("failed to parse partition value")) }) .collect::, _>>()?; diff --git a/crates/sail-delta-lake/src/operations/write/stats.rs b/crates/sail-delta-lake/src/operations/write/stats.rs index a7b20a9afb..26910e71cc 100644 --- a/crates/sail-delta-lake/src/operations/write/stats.rs +++ b/crates/sail-delta-lake/src/operations/write/stats.rs @@ -24,7 +24,7 @@ use std::ops::{AddAssign, Not}; use std::sync::Arc; use std::time::{SystemTime, UNIX_EPOCH}; -use delta_kernel::expressions::Scalar; +use datafusion::common::scalar::ScalarValue; use indexmap::IndexMap; use log::warn; use parquet::basic::{LogicalType, TimeUnit, Type}; @@ -33,12 +33,14 @@ use parquet::file::statistics::Statistics; use parquet::schema::types::{ColumnDescriptor, SchemaDescriptor}; use sail_common::spec::SAIL_LIST_FIELD_NAME; -use crate::kernel::models::{Add, ColumnCountStat, ColumnValueStat, ScalarExt, Stats}; -use crate::kernel::DeltaTableError; +use crate::conversion::ScalarExt; +use crate::spec::{ + Add, ColumnCountStat, ColumnValueStat, DeltaError as DeltaTableError, StatValue, Stats, +}; /// Creates an [`Add`] log action struct with statistics. 
pub fn create_add( - partition_values: &IndexMap, + partition_values: &IndexMap, path: String, size: i64, file_metadata: &ParquetMetaData, @@ -91,7 +93,7 @@ pub fn create_add( } /// Creates stats from parquet metadata already in memory pub fn stats_from_parquet_metadata( - partition_values: &IndexMap, + partition_values: &IndexMap, parquet_metadata: &ParquetMetaData, num_indexed_cols: i32, stats_columns: &Option>, @@ -111,7 +113,7 @@ pub fn stats_from_parquet_metadata( } fn stats_from_file_metadata( - partition_values: &IndexMap, + partition_values: &IndexMap, file_metadata: &ParquetMetaData, num_indexed_cols: i32, stats_columns: &Option>, @@ -130,7 +132,7 @@ fn stats_from_file_metadata( } fn stats_from_metadata( - partition_values: &IndexMap, + partition_values: &IndexMap, schema_descriptor: Arc, row_group_metadata: Vec, num_rows: i64, @@ -393,23 +395,29 @@ pub fn sign_extend_be(b: &[u8]) -> [u8; N] { result } -impl From for serde_json::Value { +impl From for StatValue { fn from(scalar: StatsScalar) -> Self { match scalar { - StatsScalar::Boolean(v) => serde_json::Value::Bool(v), - StatsScalar::Int32(v) => serde_json::Value::from(v), - StatsScalar::Int64(v) => serde_json::Value::from(v), - StatsScalar::Float32(v) => serde_json::Value::from(v), - StatsScalar::Float64(v) => serde_json::Value::from(v), - StatsScalar::Date(v) => serde_json::Value::from(v.format("%Y-%m-%d").to_string()), + StatsScalar::Boolean(v) => StatValue::Boolean(v), + StatsScalar::Int32(v) => StatValue::Number(v.into()), + StatsScalar::Int64(v) => StatValue::Number(v.into()), + StatsScalar::Float32(v) => serde_json::Number::from_f64(v as f64) + .map(StatValue::Number) + .unwrap_or_else(|| StatValue::String(v.to_string())), + StatsScalar::Float64(v) => serde_json::Number::from_f64(v) + .map(StatValue::Number) + .unwrap_or_else(|| StatValue::String(v.to_string())), + StatsScalar::Date(v) => StatValue::String(v.format("%Y-%m-%d").to_string()), StatsScalar::Timestamp(v) => { - 
serde_json::Value::from(v.format("%Y-%m-%dT%H:%M:%S%.fZ").to_string()) + StatValue::String(v.format("%Y-%m-%dT%H:%M:%S%.fZ").to_string()) } StatsScalar::TimestampNtz(v) => { - serde_json::Value::from(v.format("%Y-%m-%dT%H:%M:%S%.f").to_string()) + StatValue::String(v.format("%Y-%m-%dT%H:%M:%S%.f").to_string()) } - StatsScalar::Decimal(v) => serde_json::Value::from(v), - StatsScalar::String(v) => serde_json::Value::from(v), + StatsScalar::Decimal(v) => serde_json::Number::from_f64(v) + .map(StatValue::Number) + .unwrap_or_else(|| StatValue::String(v.to_string())), + StatsScalar::String(v) => StatValue::String(v), StatsScalar::Bytes(v) => { let escaped_bytes = v .into_iter() @@ -417,13 +425,19 @@ impl From for serde_json::Value { .collect::>(); // escape_default always produces valid ASCII so we can use from_utf8_lossy here let escaped_string = String::from_utf8_lossy(escaped_bytes.as_slice()).into_owned(); - serde_json::Value::from(escaped_string) + StatValue::String(escaped_string) } - StatsScalar::Uuid(v) => serde_json::Value::from(v.hyphenated().to_string()), + StatsScalar::Uuid(v) => StatValue::String(v.hyphenated().to_string()), } } } +impl From for serde_json::Value { + fn from(scalar: StatsScalar) -> Self { + StatValue::from(scalar).into() + } +} + /// Aggregated stats from multiple row groups struct AggregatedStats { pub min: Option, diff --git a/crates/sail-delta-lake/src/operations/write/writer.rs b/crates/sail-delta-lake/src/operations/write/writer.rs index 895d2b1983..25b82fd5f1 100644 --- a/crates/sail-delta-lake/src/operations/write/writer.rs +++ b/crates/sail-delta-lake/src/operations/write/writer.rs @@ -24,10 +24,10 @@ use std::sync::Arc; use bytes::Bytes; use datafusion::arrow::array::RecordBatch; use datafusion::arrow::datatypes::{Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; -use delta_kernel::expressions::Scalar; +use datafusion::common::scalar::ScalarValue; use indexmap::IndexMap; use object_store::path::Path; -use 
object_store::ObjectStore; +use object_store::{ObjectStore, ObjectStoreExt}; use parquet::arrow::AsyncArrowWriter; use parquet::basic::Compression; use parquet::file::metadata::ParquetMetaData; @@ -38,8 +38,8 @@ use uuid::Uuid; use super::async_utils::AsyncShareableBuffer; use super::partitioning::partition_ranges; use super::stats::create_add; -use crate::kernel::models::{Add, ScalarExt}; -use crate::kernel::DeltaTableError; +use crate::conversion::ScalarExt; +use crate::spec::{Add, DeltaError as DeltaTableError}; /// Trait for creating hive partition paths from partition values pub trait PartitionsExt { @@ -47,7 +47,7 @@ pub trait PartitionsExt { fn hive_partition_segments(&self) -> Vec; } -impl PartitionsExt for IndexMap { +impl PartitionsExt for IndexMap { fn hive_partition_path(&self) -> String { self.hive_partition_segments().join("/") } @@ -210,7 +210,7 @@ impl DeltaWriter { async fn switch_partition_if_needed( &mut self, partition_key: String, - partition_values: IndexMap, + partition_values: IndexMap, ) -> Result<(), DeltaTableError> { if self.current_partition_key.as_deref() == Some(partition_key.as_str()) && self.current_writer.is_some() @@ -281,7 +281,7 @@ pub struct PartitionWriterConfig { /// Partition path segments pub partition_segments: Vec, /// Values for all partition columns - pub partition_values: IndexMap, + pub partition_values: IndexMap, /// Properties passed to underlying parquet writer pub writer_properties: WriterProperties, /// Size above which we will write a buffered parquet file to disk @@ -294,7 +294,7 @@ impl PartitionWriterConfig { pub fn new( table_path: Path, file_schema: ArrowSchemaRef, - partition_values: IndexMap, + partition_values: IndexMap, writer_properties: WriterProperties, target_file_size: u64, write_batch_size: usize, @@ -462,9 +462,9 @@ impl PartitionWriter { let mut full_path = self.config.table_path.clone(); for segment in &self.config.partition_segments { - full_path = full_path.child(segment.as_str()); + 
full_path = full_path.join(segment.as_str()); } - full_path = full_path.child(file_name.as_str()); + full_path = full_path.join(file_name.as_str()); let relative_path = if self.config.partition_segments.is_empty() { file_name @@ -553,7 +553,7 @@ mod tests { use object_store::ObjectStore; use super::{DeltaWriter, WriterConfig}; - use crate::kernel::DeltaTableError; + use crate::spec::DeltaError as DeltaTableError; fn make_batch(values: Vec, parts: Vec<&str>) -> Result { let schema = Arc::new(Schema::new(vec![ diff --git a/crates/sail-delta-lake/src/options.rs b/crates/sail-delta-lake/src/options.rs index 31060a6eb8..353ccd50ca 100644 --- a/crates/sail-delta-lake/src/options.rs +++ b/crates/sail-delta-lake/src/options.rs @@ -12,8 +12,8 @@ use serde::{Deserialize, Serialize}; -/// Options that control the behavior of Delta Lake tables. -#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)] +/// Execution-time options that control a single Delta Lake read or write operation. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] pub struct TableDeltaOptions { pub replace_where: Option, pub merge_schema: bool, @@ -24,16 +24,43 @@ pub struct TableDeltaOptions { pub version_as_of: Option, pub timestamp_as_of: Option, - /// Column mapping mode for new tables (dataframe API only) - #[serde(default)] - pub column_mapping_mode: ColumnMappingModeOption, + /// Enable metadata-as-data read path (avoid loading file list on driver; use log replay + discovery). + pub metadata_as_data_read: bool, + + /// Strategy for Delta log replay in metadata-as-data path. + pub delta_log_replay_strategy: DeltaLogReplayStrategyOption, + + /// Max commit JSON file count to use hash-no-sort replay when strategy is `Auto`. 
+ #[serde(default = "default_delta_log_replay_hash_threshold")] + pub delta_log_replay_hash_threshold: usize, +} + +impl Default for TableDeltaOptions { + fn default() -> Self { + Self { + replace_where: None, + merge_schema: false, + overwrite_schema: false, + target_file_size: 0, + write_batch_size: 0, + version_as_of: None, + timestamp_as_of: None, + metadata_as_data_read: false, + delta_log_replay_strategy: DeltaLogReplayStrategyOption::Auto, + delta_log_replay_hash_threshold: default_delta_log_replay_hash_threshold(), + } + } +} + +pub fn default_delta_log_replay_hash_threshold() -> usize { + 100 } #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)] #[serde(rename_all = "camelCase")] -pub enum ColumnMappingModeOption { +pub enum DeltaLogReplayStrategyOption { #[default] - None, - Name, - Id, + Auto, + Sort, + Hash, } diff --git a/crates/sail-delta-lake/src/physical/scan_planner.rs b/crates/sail-delta-lake/src/physical/scan_planner.rs index d446a2ec76..b2a84c11cd 100644 --- a/crates/sail-delta-lake/src/physical/scan_planner.rs +++ b/crates/sail-delta-lake/src/physical/scan_planner.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use datafusion::arrow::datatypes::Schema as ArrowSchema; @@ -7,22 +7,30 @@ use datafusion::common::{Result, ToDFSchema}; use datafusion::datasource::source::DataSourceExec; use datafusion::logical_expr::utils::conjunction; use datafusion::logical_expr::Expr; -use datafusion::physical_plan::ExecutionPlan; -use delta_kernel::table_features::ColumnMappingMode; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::projection::ProjectionExec; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::{ExecutionPlan, Partitioning}; use sail_common_datafusion::rename::physical_plan::rename_projected_physical_plan; -use 
crate::datasource::scan::{build_file_scan_config, FileScanParams}; -use crate::datasource::{ - df_logical_schema, prune_files, simplify_expr, DataFusionMixins, DeltaScanConfig, +use crate::datasource::scan::{build_file_scan_config, FileScanParams, TableStatsMode}; +use crate::datasource::{df_logical_schema, simplify_expr, DeltaScanConfig}; +use crate::options::TableDeltaOptions; +use crate::physical_plan::planner::metadata_predicate::{ + build_metadata_filter, predicate_requires_stats, }; -use crate::kernel::models::Add; +use crate::physical_plan::planner::utils::LogReplayOptions; +use crate::physical_plan::planner::{DeltaTableConfig as PlannerTableConfig, PlannerContext}; +use crate::physical_plan::{DeltaDiscoveryExec, DeltaScanByAddsExec}; use crate::schema::get_physical_schema; +use crate::spec::{Add, ColumnMappingMode, StructType}; use crate::storage::LogStoreRef; -use crate::table::DeltaTableState; +use crate::table::DeltaSnapshot; pub(crate) async fn plan_delta_scan( session: &dyn Session, - snapshot: &DeltaTableState, + snapshot: &DeltaSnapshot, log_store: &LogStoreRef, config: &DeltaScanConfig, files: Option>>, @@ -45,7 +53,7 @@ pub(crate) async fn plan_delta_scan( &config.commit_timestamp_column_name, Some(schema.clone()), )?; - + let table_partition_cols = snapshot.metadata().partition_columns().clone(); let logical_schema = if let Some(used_columns) = projection { let mut fields = vec![]; for idx in used_columns { @@ -63,7 +71,6 @@ pub(crate) async fn plan_delta_scan( } } // Ensure all partition columns are included in logical schema - let table_partition_cols = snapshot.metadata().partition_columns(); for partition_col in table_partition_cols.iter() { if let Ok(idx) = full_logical_schema.index_of(partition_col.as_str()) { if !used_columns.contains(&idx) && !fields.iter().any(|f| f.name() == partition_col) @@ -77,10 +84,33 @@ pub(crate) async fn plan_delta_scan( Arc::clone(&full_logical_schema) }; + let (scan_projection, projection_prefix_len) = if let 
Some(used_columns) = projection { + let mut scan_projection = used_columns.clone(); + let filter_expr = conjunction(filters.iter().cloned()); + if let Some(expr) = &filter_expr { + for c in expr.column_refs() { + let idx = full_logical_schema.index_of(c.name.as_str())?; + if !scan_projection.contains(&idx) { + scan_projection.push(idx); + } + } + } + for partition_col in table_partition_cols.iter() { + if let Ok(idx) = full_logical_schema.index_of(partition_col.as_str()) { + if !scan_projection.contains(&idx) { + scan_projection.push(idx); + } + } + } + (Some(scan_projection), Some(used_columns.len())) + } else { + (None, None) + }; + // Separate filters for pruning vs pushdown. // // Exact and Inexact filters are used for pruning; Inexact are additionally pushed down. - let partition_cols = snapshot.metadata().partition_columns(); + let partition_cols = &table_partition_cols; let predicates: Vec<&Expr> = filters.iter().collect(); let pushdown_filters = crate::datasource::get_pushdown_filters(&predicates, partition_cols.as_slice()); @@ -100,46 +130,55 @@ pub(crate) async fn plan_delta_scan( } } - let (files, pruning_mask) = match files { - Some(files) => (files, None), - None => { - let result = prune_files( - snapshot, - log_store, - session, - &pruning_filters, - limit, - logical_schema.clone(), - ) - .await?; - (Arc::new(result.files), result.pruning_mask) - } - }; + let table_schema = snapshot + .input_schema() + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; - // Prepare pushdown filter for Parquet. - let pushdown_filter = if !parquet_pushdown_filters.is_empty() { + let pruning_expr = conjunction(pruning_filters); + let pruning_predicate = if let Some(expr) = pruning_expr.as_ref() { let df_schema = logical_schema.clone().to_dfschema()?; - let pushdown_expr = conjunction(parquet_pushdown_filters); - pushdown_expr - .map(|expr| simplify_expr(session, &df_schema, expr)) - .transpose()? 
+ Some( + simplify_expr(session, &df_schema, expr.clone()).map_err(|e| { + datafusion::common::DataFusionError::Plan(format!( + "failed to simplify scan pruning filter: {e}" + )) + })?, + ) } else { None }; + let (files, pruning_mask): (Option>>, Option>) = match files { + Some(files) => { + if let Some(predicate) = pruning_predicate.as_ref() { + let source_files = files.as_ref().clone(); + let pruning_mask = crate::datasource::pruning::prune_adds_by_physical_predicate( + source_files.clone(), + table_schema.clone(), + Arc::clone(predicate), + )?; + let pruned_files = source_files + .into_iter() + .zip(pruning_mask.iter().copied()) + .filter_map(|(add, keep)| keep.then_some(add)) + .collect::>(); + (Some(Arc::new(pruned_files)), Some(pruning_mask)) + } else { + (Some(files), None) + } + } + None => (None, None), + }; + // Build physical file schema (non-partition columns) - let table_partition_cols = snapshot.metadata().partition_columns(); let kmode: ColumnMappingMode = snapshot.effective_column_mapping_mode(); - let kschema_arc = snapshot.snapshot().table_configuration().schema(); - let physical_arrow: ArrowSchema = get_physical_schema(&kschema_arc, kmode); - let physical_partition_cols: HashSet = table_partition_cols - .iter() - .map(|col| { - kschema_arc - .field(col) - .map(|f| f.physical_name(kmode).to_string()) - .unwrap_or_else(|| col.clone()) - }) + let kschema_arc = snapshot.schema(); + let logical_kernel = StructType::try_from(kschema_arc)?; + let physical_arrow: ArrowSchema = get_physical_schema(&logical_kernel, kmode); + let physical_partition_cols: HashSet = snapshot + .physical_partition_columns() + .into_iter() + .map(|(_, physical)| physical) .collect(); let file_fields = physical_arrow @@ -150,30 +189,150 @@ pub(crate) async fn plan_delta_scan( .collect::>(); let file_schema = Arc::new(ArrowSchema::new(file_fields)); - let file_scan_config = build_file_scan_config( - snapshot, - log_store, - &files, - &config, - FileScanParams { - pruning_mask: 
pruning_mask.as_deref(), - projection, + // Prepare pushdown filter for Parquet. + let pushdown_filter = if !parquet_pushdown_filters.is_empty() { + let df_schema = full_logical_schema.clone().to_dfschema()?; + let pushdown_expr = conjunction(parquet_pushdown_filters); + pushdown_expr + .map(|expr| { + simplify_expr(session, &df_schema, expr).map_err(|e| { + datafusion::common::DataFusionError::Plan(format!( + "failed to simplify parquet pushdown filter: {e}" + )) + }) + }) + .transpose()? + } else { + None + }; + + if let Some(files) = files { + let file_scan_config = build_file_scan_config( + snapshot, + log_store, + &files, + &config, + FileScanParams { + pruning_mask: pruning_mask.as_deref(), + projection, + limit, + pushdown_filter, + sort_order: None, + table_stats_mode: TableStatsMode::Snapshot, + }, + session, + file_schema, + )?; + + let scan_exec = DataSourceExec::from_data_source(file_scan_config); + + // Rename columns from physical back to logical names expected by `schema` + let logical_names = full_logical_schema + .fields() + .iter() + .map(|f| f.name().clone()) + .collect::>(); + let renamed = rename_projected_physical_plan(scan_exec, &logical_names, projection)?; + return Ok(renamed); + } + + // Metadata-as-data path: log scan -> replay -> discovery -> scan by adds. 
+ let table_url = log_store.config().location.clone(); + + let planner_options = TableDeltaOptions { + delta_log_replay_strategy: config.delta_log_replay_strategy, + delta_log_replay_hash_threshold: config.delta_log_replay_hash_threshold, + ..TableDeltaOptions::default() + }; + + let planner_ctx = PlannerContext::new( + session, + PlannerTableConfig::new( + table_url.clone(), + planner_options, + HashMap::new(), + table_partition_cols.clone(), + None, + true, + ), + ); + let log_replay_options = LogReplayOptions { + include_stats_json: pruning_expr + .as_ref() + .is_some_and(|expr| predicate_requires_stats(expr, &table_partition_cols)), + ..Default::default() + }; + + let meta_scan: Arc = + crate::physical_plan::planner::utils::build_log_replay_pipeline_with_options( + &planner_ctx, + snapshot, + log_replay_options, + ) + .await + .map_err(|e| { + datafusion::common::DataFusionError::Plan(format!( + "failed to build log replay pipeline: {e}" + )) + })?; + let meta_scan: Arc = if let Some(predicate) = pruning_expr { + build_metadata_filter(session, meta_scan, snapshot, predicate)? + } else { + meta_scan + }; + // TODO(metadata-as-data-aqe): This path intentionally prioritizes metadata scalability and + // low TTFB over perfect static CBO. Add a runtime re-optimization hook after replay/discovery + // so downstream repartitioning and join strategy can react to exact file cardinality/bytes. + + let find_files: Arc = Arc::new(DeltaDiscoveryExec::with_input( + meta_scan, + table_url.clone(), + None, + None, + snapshot.version(), + table_partition_cols.clone(), + false, + )?); + + // TODO(adaptive-partitioning): Replace fixed fan-out with adaptive planning. + // Plan: (1) pick partition count from discovered `size_bytes` (clamped to [1, target_partitions] + // and gated by `optimizer.repartition_file_min_size`), then (2) replace round-robin with + // size-aware distribution to reduce small-task overhead and skew. 
+ let target_partitions = session.config().target_partitions().max(1); + let find_files: Arc = Arc::new(RepartitionExec::try_new( + find_files, + Partitioning::RoundRobinBatch(target_partitions), + )?); + + let mut scan_exec: Arc = Arc::new( + DeltaScanByAddsExec::new( + find_files, + table_url, + snapshot.version(), + table_schema, + logical_schema.clone(), + config.clone(), + scan_projection.clone(), limit, pushdown_filter, - sort_order: None, - }, - session, - file_schema, - )?; + ) + .with_table_statistics(snapshot.datafusion_table_statistics(None)), + ); - let scan_exec = DataSourceExec::from_data_source(file_scan_config); + // NOTE: Keep filtering inside DeltaScanByAddsExec pushdown path for now. + // Wrapping an additional FilterExec here can trigger DataFusion interval + // inference assertion failures on some nullable predicates in metadata-as-data + // scans (tracked separately). - // Rename columns from physical back to logical names expected by `schema` - let logical_names = full_logical_schema - .fields() - .iter() - .map(|f| f.name().clone()) - .collect::>(); - let renamed = rename_projected_physical_plan(scan_exec, &logical_names, projection)?; - Ok(renamed) + if let Some(prefix_len) = projection_prefix_len { + let mut proj_exprs = Vec::with_capacity(prefix_len); + for idx in 0..prefix_len { + let field = logical_schema.field(idx); + let expr = Arc::new(Column::new(field.name(), idx)) as Arc; + proj_exprs.push((expr, field.name().clone())); + } + scan_exec = Arc::new(ProjectionExec::try_new(proj_exprs, scan_exec)?); + } + + Ok(scan_exec) } diff --git a/crates/sail-delta-lake/src/physical/table_scan_planner.rs b/crates/sail-delta-lake/src/physical/table_scan_planner.rs index e5a421f534..7e5a0ac2a9 100644 --- a/crates/sail-delta-lake/src/physical/table_scan_planner.rs +++ b/crates/sail-delta-lake/src/physical/table_scan_planner.rs @@ -31,12 +31,18 @@ impl ExtensionPlanner for DeltaTablePhysicalPlanner { let handle = node.handle().inner(); let filters = 
unnormalize_cols(node.filters().clone()); let projection = node.projection().map(|p| p.to_vec()); + let files = + if !handle.snapshot.load_config().require_files || handle.snapshot.adds().is_empty() { + None + } else { + Some(Arc::new(handle.snapshot.adds().to_vec())) + }; let plan = plan_delta_scan( session_state, &handle.snapshot, &handle.log_store, &handle.config, - None, + files, projection.as_ref(), &filters, node.fetch(), diff --git a/crates/sail-delta-lake/src/physical_plan/action_schema.rs b/crates/sail-delta-lake/src/physical_plan/action_schema.rs index c39ca4ce20..6e5100ae90 100644 --- a/crates/sail-delta-lake/src/physical_plan/action_schema.rs +++ b/crates/sail-delta-lake/src/physical_plan/action_schema.rs @@ -1,129 +1,193 @@ -use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, LazyLock}; -use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::arrow::datatypes::{DataType as ArrowDataType, Field, Schema, SchemaRef}; use datafusion::arrow::record_batch::RecordBatch; use datafusion_common::{DataFusionError, Result}; use serde::{Deserialize, Serialize}; -use serde_json::Value; -use crate::kernel::models::{Action, Add, Metadata, Protocol, Remove}; +use crate::kernel::transaction::OperationMetrics; use crate::kernel::DeltaOperation; +use crate::spec::{ + add_struct_type, metadata_struct_type, protocol_struct_type, remove_struct_type, Action, Add, + Metadata, Protocol, Remove, +}; pub const COL_ACTION: &str = "action"; -const COL_PARTITION_VALUES: &str = "partition_values"; - -static ACTION_FIELDS: LazyLock> = LazyLock::new(|| { - #[expect( - clippy::unwrap_used, - reason = "ACTION_FIELDS is a process-global constant." 
- )] - let fields = delta_action_fields_build() - .map_err(|msg| format!("delta action fields initialization failed: {msg}")) - .unwrap(); - fields -}); + +static ACTION_FIELDS: LazyLock> = + LazyLock::new(delta_action_fields_build); static ACTION_SCHEMA: LazyLock = LazyLock::new(|| Arc::new(Schema::new((*ACTION_FIELDS).clone()))); #[derive(Debug, Clone, Default)] -pub struct CommitMeta { +pub struct ExecCommitMeta { pub row_count: u64, pub operation: Option, - pub operation_metrics: HashMap, + pub operation_metrics: OperationMetrics, } -fn partition_values_type() -> DataType { - // Arrow Map is represented as `Map>`. - let entries_struct = DataType::Struct( - vec![ - Arc::new(Field::new("keys", DataType::Utf8, false)), - Arc::new(Field::new("values", DataType::Utf8, true)), - ] - .into(), - ); - let entries_field = Arc::new(Field::new("entries", entries_struct, false)); - DataType::Map(entries_field, false) +fn action_field(name: &str, data_type: ArrowDataType, nullable: bool) -> Arc { + Arc::new(Field::new(name, data_type, nullable)) } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct AddAction { - path: String, - partition_values: BTreeMap>, - size: i64, - modification_time: i64, - data_change: bool, - stats_json: Option, +fn action_struct_field(name: &str, schema: crate::spec::StructType, nullable: bool) -> Arc { + #[expect(clippy::expect_used)] + let data_type = ArrowDataType::try_from(&crate::spec::DataType::from(schema)) + .expect("action payload schema should convert to Arrow"); + action_field(name, data_type, nullable) } -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct RemoveAction { - path: String, - data_change: bool, - deletion_timestamp: Option, - extended_file_metadata: Option, - // Keep non-null in Arrow: Remove.partition_values=None is encoded as empty map (not NULL) - partition_values: BTreeMap>, - size: Option, +fn action_union_type() -> ArrowDataType { + ArrowDataType::Union( + vec![ + (0, action_struct_field("add", 
add_struct_type(), false)), + ( + 1, + action_struct_field("remove", remove_struct_type(), false), + ), + ( + 2, + action_struct_field("protocol", protocol_struct_type(), false), + ), + ( + 3, + action_struct_field("metadata", metadata_struct_type(), false), + ), + ( + 4, + action_field("commit_meta", ExecCommitMetaTransport::data_type(), false), + ), + ] + .into_iter() + .collect(), + datafusion::arrow::datatypes::UnionMode::Dense, + ) } #[derive(Debug, Clone, Serialize, Deserialize)] -pub struct CommitMetaAction { +struct ExecCommitMetaTransport { commit_row_count: u64, operation_json: Option, - operation_metrics_json: String, + num_files: Option, + num_output_rows: Option, + num_output_bytes: Option, + execution_time_ms: Option, + num_removed_files: Option, + num_added_files: Option, + num_output_files: Option, + num_added_bytes: Option, + num_removed_bytes: Option, + write_time_ms: Option, + operation_metrics_extra_json: Option, +} + +impl ExecCommitMetaTransport { + fn data_type() -> ArrowDataType { + ArrowDataType::Struct( + vec![ + action_field("commit_row_count", ArrowDataType::UInt64, false), + action_field("operation_json", ArrowDataType::Utf8, true), + action_field("num_files", ArrowDataType::UInt64, true), + action_field("num_output_rows", ArrowDataType::UInt64, true), + action_field("num_output_bytes", ArrowDataType::UInt64, true), + action_field("execution_time_ms", ArrowDataType::UInt64, true), + action_field("num_removed_files", ArrowDataType::UInt64, true), + action_field("num_added_files", ArrowDataType::UInt64, true), + action_field("num_output_files", ArrowDataType::UInt64, true), + action_field("num_added_bytes", ArrowDataType::UInt64, true), + action_field("num_removed_bytes", ArrowDataType::UInt64, true), + action_field("write_time_ms", ArrowDataType::UInt64, true), + action_field("operation_metrics_extra_json", ArrowDataType::Utf8, true), + ] + .into(), + ) + } + + fn from_exec_meta(meta: ExecCommitMeta) -> Result { + let operation_json = 
meta + .operation + .as_ref() + .map(serde_json::to_string) + .transpose() + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let operation_metrics_extra_json = (!meta.operation_metrics.extra.is_empty()) + .then(|| serde_json::to_string(&meta.operation_metrics.extra)) + .transpose() + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + Ok(Self { + commit_row_count: meta.row_count, + operation_json, + num_files: meta.operation_metrics.num_files, + num_output_rows: meta.operation_metrics.num_output_rows, + num_output_bytes: meta.operation_metrics.num_output_bytes, + execution_time_ms: meta.operation_metrics.execution_time_ms, + num_removed_files: meta.operation_metrics.num_removed_files, + num_added_files: meta.operation_metrics.num_added_files, + num_output_files: meta.operation_metrics.num_output_files, + num_added_bytes: meta.operation_metrics.num_added_bytes, + num_removed_bytes: meta.operation_metrics.num_removed_bytes, + write_time_ms: meta.operation_metrics.write_time_ms, + operation_metrics_extra_json, + }) + } + + fn into_exec_meta(self) -> Result { + let operation: Option = self + .operation_json + .as_deref() + .map(serde_json::from_str::) + .transpose() + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let extra = self + .operation_metrics_extra_json + .as_deref() + .map(serde_json::from_str) + .transpose() + .map_err(|e| DataFusionError::External(Box::new(e)))? 
+ .unwrap_or_default(); + + Ok(ExecCommitMeta { + row_count: self.commit_row_count, + operation, + operation_metrics: OperationMetrics { + num_files: self.num_files, + num_output_rows: self.num_output_rows, + num_output_bytes: self.num_output_bytes, + execution_time_ms: self.execution_time_ms, + num_removed_files: self.num_removed_files, + num_added_files: self.num_added_files, + num_output_files: self.num_output_files, + num_added_bytes: self.num_added_bytes, + num_removed_bytes: self.num_removed_bytes, + write_time_ms: self.write_time_ms, + extra, + }, + }) + } } #[derive(Debug, Clone, Serialize, Deserialize)] -pub enum ExecAction { +enum PhysicalExecAction { #[serde(rename = "add")] - Add(AddAction), + Add(Add), #[serde(rename = "remove")] - Remove(RemoveAction), - // Protocol / Metadata are relatively rare; we keep them as JSON strings for now. + Remove(Remove), #[serde(rename = "protocol")] - Protocol(String), + Protocol(Protocol), #[serde(rename = "metadata")] - Metadata(String), + Metadata(Metadata), #[serde(rename = "commit_meta")] - CommitMeta(CommitMetaAction), + CommitMeta(ExecCommitMetaTransport), } #[derive(Debug, Clone, Serialize, Deserialize)] struct ActionRow { - action: ExecAction, -} - -fn delta_action_tracing_options() -> std::result::Result -{ - use serde_arrow::schema::TracingOptions; - - TracingOptions::default() - .map_as_struct(false) - .strings_as_large_utf8(false) - .sequence_as_large_list(false) - // Force MapArray strategy + stable entry field names ("keys"/"values") inside the union variants. 
- .overwrite( - "action.add.partition_values", - Field::new(COL_PARTITION_VALUES, partition_values_type(), false), - ) - .and_then(|opts| { - opts.overwrite( - "action.remove.partition_values", - Field::new(COL_PARTITION_VALUES, partition_values_type(), false), - ) - }) - .map_err(|e| format!("failed to overwrite partition_values field: {e}")) + action: PhysicalExecAction, } -fn delta_action_fields_build( -) -> std::result::Result, String> { - use serde_arrow::schema::SchemaLike; - - Vec::::from_type::( - delta_action_tracing_options()?, - ) - .map_err(|e| format!("ActionRow schema tracing failed: {e}")) +fn delta_action_fields_build() -> Vec { + vec![action_field(COL_ACTION, action_union_type(), false)] } fn delta_action_fields() -> Result<&'static Vec> { @@ -134,7 +198,7 @@ pub fn delta_action_schema() -> Result { Ok(Arc::clone(&*ACTION_SCHEMA)) } -pub fn encode_actions(actions: Vec) -> Result { +fn encode_transport_actions(actions: Vec) -> Result { if actions.is_empty() { return Ok(RecordBatch::new_empty(delta_action_schema()?)); } @@ -148,81 +212,38 @@ pub fn encode_actions(actions: Vec) -> Result { .map_err(|e| DataFusionError::External(Box::new(e))) } -pub fn encode_add_actions(adds: Vec) -> Result { - let actions: Vec = adds.into_iter().map(ExecAction::from).collect(); - encode_actions(actions) -} - -impl From for ExecAction { - fn from(add: Add) -> Self { - ExecAction::Add(AddAction { - path: add.path, - partition_values: add.partition_values.into_iter().collect(), - size: add.size, - modification_time: add.modification_time, - data_change: add.data_change, - stats_json: add.stats, - }) - } -} - -impl From for ExecAction { - fn from(remove: Remove) -> Self { - ExecAction::Remove(RemoveAction { - path: remove.path, - data_change: remove.data_change, - deletion_timestamp: remove.deletion_timestamp, - extended_file_metadata: remove.extended_file_metadata, - partition_values: remove - .partition_values - .unwrap_or_default() - .into_iter() - .collect(), - size: 
remove.size, - }) - } -} - -impl TryFrom for ExecAction { +impl TryFrom for PhysicalExecAction { type Error = DataFusionError; - fn try_from(protocol: Protocol) -> Result { - let protocol_json = - serde_json::to_string(&protocol).map_err(|e| DataFusionError::External(Box::new(e)))?; - Ok(ExecAction::Protocol(protocol_json)) + fn try_from(action: Action) -> Result { + match action { + Action::Add(add) => Ok(Self::Add(add)), + Action::Remove(remove) => Ok(Self::Remove(remove)), + Action::Protocol(protocol) => Ok(Self::Protocol(protocol)), + Action::Metadata(metadata) => Ok(Self::Metadata(metadata)), + unsupported => Err(DataFusionError::Plan(format!( + "unsupported physical action transport variant: {unsupported:?}" + ))), + } } } -impl TryFrom for ExecAction { - type Error = DataFusionError; +pub fn encode_actions( + actions: Vec, + exec_meta: Option, +) -> Result { + let mut transport_actions = actions + .into_iter() + .map(PhysicalExecAction::try_from) + .collect::>>()?; - fn try_from(metadata: Metadata) -> Result { - let metadata_json = - serde_json::to_string(&metadata).map_err(|e| DataFusionError::External(Box::new(e)))?; - Ok(ExecAction::Metadata(metadata_json)) + if let Some(exec_meta) = exec_meta { + transport_actions.push(PhysicalExecAction::CommitMeta( + ExecCommitMetaTransport::from_exec_meta(exec_meta)?, + )); } -} - -impl TryFrom for ExecAction { - type Error = DataFusionError; - fn try_from(meta: CommitMeta) -> Result { - let operation_json = meta - .operation - .as_ref() - .map(serde_json::to_string) - .transpose() - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - let operation_metrics_json = serde_json::to_string(&meta.operation_metrics) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - Ok(ExecAction::CommitMeta(CommitMetaAction { - commit_row_count: meta.row_count, - operation_json, - operation_metrics_json, - })) - } + encode_transport_actions(transport_actions) } pub fn decode_adds_from_batch(batch: &RecordBatch) -> Result> { 
@@ -238,71 +259,21 @@ pub fn decode_adds_from_batch(batch: &RecordBatch) -> Result> { pub fn decode_actions_and_meta_from_batch( batch: &RecordBatch, -) -> Result<(Vec, Option)> { +) -> Result<(Vec, Option)> { let mut out_actions: Vec = Vec::new(); - let mut out_meta: Option = None; + let mut out_meta: Option = None; let rows: Vec = serde_arrow::from_record_batch(batch) .map_err(|e| DataFusionError::External(Box::new(e)))?; for row in rows { match row.action { - ExecAction::Add(a) => { - out_actions.push(Action::Add(Add { - path: a.path, - partition_values: a.partition_values.into_iter().collect(), - size: a.size, - modification_time: a.modification_time, - data_change: a.data_change, - stats: a.stats_json, - tags: None, - deletion_vector: None, - base_row_id: None, - default_row_commit_version: None, - clustering_provider: None, - commit_version: None, - commit_timestamp: None, - })); - } - ExecAction::Remove(r) => { - out_actions.push(Action::Remove(Remove { - path: r.path, - data_change: r.data_change, - deletion_timestamp: r.deletion_timestamp, - extended_file_metadata: r.extended_file_metadata, - partition_values: Some(r.partition_values.into_iter().collect()), - size: r.size, - tags: None, - deletion_vector: None, - base_row_id: None, - default_row_commit_version: None, - })); - } - ExecAction::Protocol(s) => { - let p: Protocol = - serde_json::from_str(&s).map_err(|e| DataFusionError::External(Box::new(e)))?; - out_actions.push(Action::Protocol(p)); - } - ExecAction::Metadata(s) => { - let m: Metadata = - serde_json::from_str(&s).map_err(|e| DataFusionError::External(Box::new(e)))?; - out_actions.push(Action::Metadata(m)); - } - ExecAction::CommitMeta(cm) => { - let operation: Option = cm - .operation_json - .as_deref() - .map(serde_json::from_str::) - .transpose() - .map_err(|e| DataFusionError::External(Box::new(e)))?; - let operation_metrics: HashMap = - serde_json::from_str::>(&cm.operation_metrics_json) - .map_err(|e| 
DataFusionError::External(Box::new(e)))?; - out_meta = Some(CommitMeta { - row_count: cm.commit_row_count, - operation, - operation_metrics, - }); + PhysicalExecAction::Add(add) => out_actions.push(Action::Add(add)), + PhysicalExecAction::Remove(remove) => out_actions.push(Action::Remove(remove)), + PhysicalExecAction::Protocol(protocol) => out_actions.push(Action::Protocol(protocol)), + PhysicalExecAction::Metadata(metadata) => out_actions.push(Action::Metadata(metadata)), + PhysicalExecAction::CommitMeta(cm) => { + out_meta = Some(cm.into_exec_meta()?); } } } @@ -312,7 +283,13 @@ pub fn decode_actions_and_meta_from_batch( #[cfg(test)] mod tests { + use std::collections::HashMap; + + use datafusion::arrow::datatypes::DataType as ArrowDataType; + use super::*; + use crate::kernel::transaction::OperationMetrics; + use crate::spec::{DeletionVectorDescriptor, StorageType, StructType}; #[test] fn encode_actions_produces_action_column() -> Result<()> { @@ -332,8 +309,7 @@ mod tests { commit_timestamp: None, }]; - let exec_actions: Vec = adds.into_iter().map(|add| add.into()).collect(); - let rb = encode_actions(exec_actions)?; + let rb = encode_actions(adds.into_iter().map(Action::Add).collect(), None)?; assert_eq!(rb.schema(), delta_action_schema()?); assert_eq!(rb.num_rows(), 1); assert!(rb.column_by_name(COL_ACTION).is_some()); @@ -364,27 +340,24 @@ mod tests { extended_file_metadata: Some(true), partition_values: None, size: Some(1), + stats: None, tags: None, deletion_vector: None, base_row_id: None, default_row_commit_version: None, }]; - let meta = CommitMeta { + let meta = ExecCommitMeta { row_count: 10, operation: None, - operation_metrics: HashMap::new(), + operation_metrics: OperationMetrics::default(), }; - - let mut exec_actions: Vec = Vec::new(); - for add in adds { - exec_actions.push(add.into()); - } - for remove in removes { - exec_actions.push(remove.into()); - } - exec_actions.push(meta.try_into()?); - - let batch = encode_actions(exec_actions)?; + 
let batch = encode_actions( + adds.into_iter() + .map(Action::Add) + .chain(removes.into_iter().map(Action::Remove)) + .collect(), + Some(meta), + )?; let (actions, decoded_meta) = decode_actions_and_meta_from_batch(&batch)?; assert_eq!(actions.len(), 2); @@ -396,4 +369,118 @@ mod tests { assert_eq!(decoded_meta.row_count, 10); Ok(()) } + + #[test] + fn protocol_and_metadata_roundtrip_as_typed_actions() -> Result<()> { + let protocol = Protocol::new(3, 7, None, None); + let metadata = Metadata::try_new( + Some("tbl".to_string()), + Some("desc".to_string()), + StructType::try_new([]).map_err(|e| DataFusionError::External(Box::new(e)))?, + vec!["p".to_string()], + 0, + HashMap::from([("k".to_string(), "v".to_string())]), + ) + .map_err(|e| DataFusionError::External(Box::new(e)))? + .with_table_id("table-id".to_string()); + + let batch = encode_actions( + vec![ + Action::Protocol(protocol.clone()), + Action::Metadata(metadata.clone()), + ], + None, + )?; + let (actions, decoded_meta) = decode_actions_and_meta_from_batch(&batch)?; + + assert!(decoded_meta.is_none()); + assert_eq!(actions.len(), 2); + assert!(matches!(&actions[0], Action::Protocol(value) if value == &protocol)); + assert!(matches!(&actions[1], Action::Metadata(value) if value == &metadata)); + Ok(()) + } + + #[test] + fn add_and_remove_roundtrip_preserve_extended_fields() -> Result<()> { + let add = Add { + path: "part-000.parquet".to_string(), + partition_values: HashMap::from([("p".to_string(), Some("1".to_string()))]), + size: 10, + modification_time: 20, + data_change: true, + stats: Some("{\"numRecords\":1}".to_string()), + tags: Some(HashMap::from([("k".to_string(), Some("v".to_string()))])), + deletion_vector: Some(DeletionVectorDescriptor { + storage_type: StorageType::Inline, + path_or_inline_dv: "encoded-dv".to_string(), + offset: Some(12), + size_in_bytes: 34, + cardinality: 56, + }), + base_row_id: Some(1), + default_row_commit_version: Some(2), + clustering_provider: 
Some("liquid".to_string()), + commit_version: None, + commit_timestamp: None, + }; + let remove = Remove { + path: "part-000.parquet".to_string(), + data_change: true, + deletion_timestamp: Some(30), + extended_file_metadata: Some(true), + partition_values: Some(HashMap::from([("p".to_string(), Some("1".to_string()))])), + size: Some(10), + stats: Some("{\"numRecords\":1}".to_string()), + tags: Some(HashMap::from([("k".to_string(), Some("v".to_string()))])), + deletion_vector: add.deletion_vector.clone(), + base_row_id: Some(1), + default_row_commit_version: Some(2), + }; + + let batch = encode_actions( + vec![Action::Add(add.clone()), Action::Remove(remove.clone())], + None, + )?; + let (actions, decoded_meta) = decode_actions_and_meta_from_batch(&batch)?; + + assert!(decoded_meta.is_none()); + assert_eq!(actions, vec![Action::Add(add), Action::Remove(remove)]); + Ok(()) + } + + #[test] + fn action_union_reuses_shared_payload_types() -> Result<()> { + let schema = delta_action_schema()?; + let action_field = schema + .field_with_name(COL_ACTION) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let ArrowDataType::Union(fields, _) = action_field.data_type() else { + return Err(DataFusionError::Internal( + "action column should be a dense union".into(), + )); + }; + + let add_type = fields + .iter() + .find(|(_, field)| field.name() == "add") + .map(|(_, field)| field.data_type().clone()) + .ok_or_else(|| DataFusionError::Internal("missing add union member".into()))?; + let remove_type = fields + .iter() + .find(|(_, field)| field.name() == "remove") + .map(|(_, field)| field.data_type().clone()) + .ok_or_else(|| DataFusionError::Internal("missing remove union member".into()))?; + + let expected_add = + ArrowDataType::try_from(&crate::spec::DataType::from(crate::spec::add_struct_type())) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let expected_remove = ArrowDataType::try_from(&crate::spec::DataType::from( + crate::spec::remove_struct_type(), 
+ )) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + assert_eq!(add_type, expected_add); + assert_eq!(remove_type, expected_remove); + Ok(()) + } } diff --git a/crates/sail-delta-lake/src/physical_plan/commit_exec.rs b/crates/sail-delta-lake/src/physical_plan/commit_exec.rs index f51b9acf64..058280d379 100644 --- a/crates/sail-delta-lake/src/physical_plan/commit_exec.rs +++ b/crates/sail-delta-lake/src/physical_plan/commit_exec.rs @@ -30,18 +30,18 @@ use datafusion::physical_plan::{ }; use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_physical_expr::{Distribution, EquivalenceProperties}; -use delta_kernel::engine::arrow_conversion::TryIntoKernel; -use delta_kernel::schema::StructType; use futures::stream::{self, StreamExt}; use sail_common_datafusion::datasource::PhysicalSinkMode; use url::Url; -use crate::kernel::models::{Action, Metadata, Protocol}; -use crate::kernel::transaction::{CommitBuilder, CommitProperties, TableReference}; +use crate::kernel::transaction::{CommitBuilder, CommitProperties, OperationMetrics}; use crate::kernel::{DeltaOperation, SaveMode}; -use crate::physical_plan::action_schema::CommitMeta; +use crate::physical_plan::action_schema::ExecCommitMeta; use crate::physical_plan::{decode_actions_and_meta_from_batch, COL_ACTION}; -use crate::schema::normalize_delta_schema; +use crate::schema::{ + metadata_for_create_with_struct_type, normalize_delta_schema, protocol_for_create, +}; +use crate::spec::{Action, StructType}; use crate::storage::{get_object_store_from_context, StorageConfig}; use crate::table::{create_delta_table_with_object_store, open_table_with_object_store}; @@ -60,7 +60,7 @@ pub struct DeltaCommitExec { sink_schema: SchemaRef, sink_mode: PhysicalSinkMode, metrics: ExecutionPlanMetricsSet, - cache: PlanProperties, + cache: Arc, } impl DeltaCommitExec { @@ -90,13 +90,13 @@ impl DeltaCommitExec { } } - fn compute_properties(schema: SchemaRef) -> PlanProperties { - PlanProperties::new( + fn 
compute_properties(schema: SchemaRef) -> Arc { + Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(1), EmissionType::Final, Boundedness::Bounded, - ) + )) } pub fn table_url(&self) -> &Url { @@ -134,7 +134,7 @@ impl ExecutionPlan for DeltaCommitExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } @@ -230,7 +230,7 @@ impl ExecutionPlan for DeltaCommitExec { let mut actions: Vec = Vec::new(); let mut initial_actions: Vec = Vec::new(); let mut operation: Option = None; - let mut operation_metrics: HashMap = HashMap::new(); + let mut operation_metrics = OperationMetrics::default(); let mut data = input_stream; while let Some(batch_result) = data.next().await { @@ -246,7 +246,7 @@ impl ExecutionPlan for DeltaCommitExec { _ => actions.push(a), } } - if let Some(CommitMeta { + if let Some(ExecCommitMeta { row_count, operation: op, operation_metrics: metrics, @@ -256,7 +256,7 @@ impl ExecutionPlan for DeltaCommitExec { if operation.is_none() { operation = op; } - merge_operation_metrics(&mut operation_metrics, metrics); + operation_metrics.merge(metrics); } has_data = has_data || batch.num_rows() > 0; } else { @@ -323,34 +323,23 @@ impl ExecutionPlan for DeltaCommitExec { DeltaOperation::Create { mode: SaveMode::ErrorIfExists, location: table_url.to_string(), - protocol, - metadata, + protocol: Box::new(protocol), + metadata: Box::new(metadata), }, final_actions, ) } else { // Construct minimal protocol/metadata and insert them let normalized_sink = normalize_delta_schema(&sink_schema); - let delta_schema: StructType = normalized_sink - .as_ref() - .try_into_kernel() - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - let protocol_json = serde_json::json!({ - "minReaderVersion": 1, - "minWriterVersion": 2, - }); - let protocol: Protocol = serde_json::from_value(protocol_json) + let protocol = protocol_for_create(false, false, false) .map_err(|e| 
DataFusionError::External(Box::new(e)))?; - let configuration: HashMap = HashMap::new(); - let metadata = Metadata::try_new( - None, - None, - delta_schema.clone(), + let metadata = metadata_for_create_with_struct_type( + StructType::try_from(normalized_sink.as_ref()) + .map_err(|e| DataFusionError::External(Box::new(e)))?, partition_columns.to_vec(), Utc::now().timestamp_millis(), - configuration, + HashMap::new(), ) .map_err(|e| DataFusionError::External(Box::new(e)))?; @@ -363,8 +352,8 @@ impl ExecutionPlan for DeltaCommitExec { DeltaOperation::Create { mode: SaveMode::ErrorIfExists, location: table_url.to_string(), - protocol, - metadata, + protocol: Box::new(protocol), + metadata: Box::new(metadata), }, updated_actions, ) @@ -393,7 +382,7 @@ impl ExecutionPlan for DeltaCommitExec { } else { None }; - let reference = snapshot.as_ref().map(|s| *s as &dyn TableReference); + let reference = snapshot.cloned(); let finalized_commit = CommitBuilder::from( CommitProperties::default().with_operation_metrics(operation_metrics), @@ -437,48 +426,6 @@ impl ExecutionPlan for DeltaCommitExec { } } -fn merge_operation_metrics( - target: &mut HashMap, - source: HashMap, -) { - for (k, v) in source { - match (target.get(&k), &v) { - (Some(serde_json::Value::Number(a)), serde_json::Value::Number(b)) => { - let sum_i64 = a - .as_i64() - .and_then(|ai| b.as_i64().map(|bi| ai.saturating_add(bi))); - let sum_u64 = a - .as_u64() - .and_then(|au| b.as_u64().map(|bu| au.saturating_add(bu))); - - if let Some(sum) = sum_u64 { - target.insert(k, serde_json::Value::from(sum)); - } else if let Some(sum) = sum_i64 { - target.insert(k, serde_json::Value::from(sum)); - } else if let (Some(af), Some(bf)) = (a.as_f64(), b.as_f64()) { - let sum = af + bf; - target.insert( - k, - serde_json::Value::Number( - serde_json::Number::from_f64(sum) - .unwrap_or_else(|| serde_json::Number::from(0)), - ), - ); - } else { - target.insert(k, v); - } - } - (None, _) => { - target.insert(k, v); - } - _ => { - 
// Different shapes; prefer the latest value. - target.insert(k, v); - } - } - } -} - impl DisplayAs for DeltaCommitExec { fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { match t { diff --git a/crates/sail-delta-lake/src/physical_plan/discovery_exec.rs b/crates/sail-delta-lake/src/physical_plan/discovery_exec.rs index 0396c8ef33..c7df268479 100644 --- a/crates/sail-delta-lake/src/physical_plan/discovery_exec.rs +++ b/crates/sail-delta-lake/src/physical_plan/discovery_exec.rs @@ -4,7 +4,6 @@ use std::sync::Arc; use async_trait::async_trait; use datafusion::arrow::array::BooleanArray; -use datafusion::arrow::compute::filter_record_batch; use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::arrow::record_batch::RecordBatch; use datafusion::execution::context::TaskContext; @@ -19,8 +18,6 @@ use datafusion_physical_expr::{Distribution, EquivalenceProperties, PhysicalExpr use futures::TryStreamExt; use url::Url; -use crate::physical_plan::meta_adds; - #[derive(Debug)] pub struct DeltaDiscoveryExec { table_url: Url, @@ -30,7 +27,7 @@ pub struct DeltaDiscoveryExec { input: Arc, input_partition_columns: Vec, input_partition_scan: bool, - cache: PlanProperties, + cache: Arc, } impl DeltaDiscoveryExec { @@ -102,13 +99,13 @@ impl DeltaDiscoveryExec { ) } - fn compute_properties(schema: SchemaRef, output_partitions: usize) -> PlanProperties { - PlanProperties::new( + fn compute_properties(schema: SchemaRef, output_partitions: usize) -> Arc { + Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(output_partitions.max(1)), EmissionType::Final, Boundedness::Bounded, - ) + )) } /// Get the table URL @@ -145,23 +142,6 @@ impl DeltaDiscoveryExec { pub fn input_partition_scan(&self) -> bool { self.input_partition_scan } - - fn prune_mask_for_meta_batch( - batch: &RecordBatch, - predicate: &Arc, - table_schema: &SchemaRef, - partition_columns: &[String], - ) -> Result> { - 
let adds = meta_adds::decode_adds_from_meta_batch(batch, Some(partition_columns))?; - if adds.is_empty() { - return Ok(vec![]); - } - crate::datasource::pruning::prune_adds_by_physical_predicate( - adds, - table_schema.clone(), - Arc::clone(predicate), - ) - } } #[async_trait] @@ -174,7 +154,7 @@ impl ExecutionPlan for DeltaDiscoveryExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } @@ -208,45 +188,18 @@ impl ExecutionPlan for DeltaDiscoveryExec { let schema = self.schema(); let input_stream = self.input.execute(partition, context)?; let schema_for_stream = schema.clone(); - let predicate_for_stream = self.predicate.clone(); - let table_schema_for_stream = self.table_schema.clone(); - let partition_columns_for_stream = self.input_partition_columns.clone(); let partition_scan = self.input_partition_scan; let s = input_stream.try_filter_map(move |batch| { let schema = schema_for_stream.clone(); - let predicate = predicate_for_stream.clone(); - let table_schema = table_schema_for_stream.clone(); - let partition_columns = partition_columns_for_stream.clone(); async move { if batch.num_rows() == 0 { return Ok(None); } - let filtered = match (&predicate, &table_schema) { - (Some(pred), Some(ts)) => { - let mask = DeltaDiscoveryExec::prune_mask_for_meta_batch( - &batch, - pred, - ts, - &partition_columns, - )?; - if mask.is_empty() { - batch.clone() - } else { - let b = BooleanArray::from(mask); - filter_record_batch(&batch, &b) - .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))? 
- } - } - _ => batch.clone(), - }; - - let scan_array = Arc::new(BooleanArray::from(vec![ - partition_scan; - filtered.num_rows() - ])); - let mut cols = filtered.columns().to_vec(); + let scan_array = + Arc::new(BooleanArray::from(vec![partition_scan; batch.num_rows()])); + let mut cols = batch.columns().to_vec(); cols.push(scan_array); let out = RecordBatch::try_new(schema, cols) .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; diff --git a/crates/sail-delta-lake/src/physical_plan/expr_adapter.rs b/crates/sail-delta-lake/src/physical_plan/expr_adapter.rs index 24a21e53f6..445378dc0c 100644 --- a/crates/sail-delta-lake/src/physical_plan/expr_adapter.rs +++ b/crates/sail-delta-lake/src/physical_plan/expr_adapter.rs @@ -38,16 +38,16 @@ impl PhysicalExprAdapterFactory for DeltaPhysicalExprAdapterFactory { &self, logical_file_schema: SchemaRef, physical_file_schema: SchemaRef, - ) -> Arc { + ) -> Result> { let (column_mapping, default_values) = Self::create_column_mapping(&logical_file_schema, &physical_file_schema); - Arc::new(DeltaPhysicalExprAdapter { + Ok(Arc::new(DeltaPhysicalExprAdapter { logical_file_schema, physical_file_schema, column_mapping, default_values, - }) + })) } } diff --git a/crates/sail-delta-lake/src/physical_plan/log_replay_exec.rs b/crates/sail-delta-lake/src/physical_plan/log_replay_exec.rs index ffb325aae8..ccdbfa6a3e 100644 --- a/crates/sail-delta-lake/src/physical_plan/log_replay_exec.rs +++ b/crates/sail-delta-lake/src/physical_plan/log_replay_exec.rs @@ -1,4 +1,5 @@ use std::any::Any; +use std::collections::HashMap; use std::fmt; use std::sync::Arc; @@ -24,6 +25,20 @@ use url::Url; use crate::physical_plan::{COL_LOG_IS_REMOVE, COL_LOG_VERSION, COL_REPLAY_PATH}; const OUTPUT_BATCH_ROWS: usize = 8192; +const MAX_COMMIT_REPLAY_ENTRIES: usize = 5_000_000; + +#[derive(Debug, Clone)] +enum ReplayMode { + /// Sort-based replay (spill-friendly). Requires local ordering on + /// (replay_path ASC, log_version DESC, is_remove ASC). 
+ Sort { input: Arc }, + /// Hash-based replay to avoid checkpoint-side SortExec pipeline breakers: + /// build a small map from commits, stream checkpoint rows, then emit commit-only adds. + Hash { + checkpoint: Arc, + commits: Arc, + }, +} /// A unary node that filters Delta log rows into the active set (tombstone replay). /// @@ -40,16 +55,19 @@ const OUTPUT_BATCH_ROWS: usize = 8192; /// - This exec is designed to be **spill-friendly** by requiring the input to be hash-partitioned /// and sorted by `__sail_delta_replay_path`, enabling streaming replay without materializing the /// full active set in memory. +/// +/// TODO(aqe-after-log-replay): Emit replay output cardinality/bytes as first-class runtime +/// feedback and allow adaptive re-planning (e.g., add shuffle, switch HashJoin to BroadcastJoin). #[derive(Debug, Clone)] pub struct DeltaLogReplayExec { - input: Arc, + mode: ReplayMode, table_url: Url, version: i64, partition_columns: Vec, // purely for observability (EXPLAIN); populated by the planner when available checkpoint_files: Vec, commit_files: Vec, - cache: PlanProperties, + cache: Arc, } impl DeltaLogReplayExec { @@ -63,14 +81,49 @@ impl DeltaLogReplayExec { ) -> Self { let schema = Self::output_schema(&input.schema()); let output_partitions = input.output_partitioning().partition_count().max(1); - let cache = PlanProperties::new( + let cache = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(output_partitions), EmissionType::Final, Boundedness::Bounded, - ); + )); Self { - input, + mode: ReplayMode::Sort { input }, + table_url, + version, + partition_columns, + checkpoint_files, + commit_files, + cache, + } + } + + pub fn new_hash( + checkpoint: Arc, + commits: Arc, + table_url: Url, + version: i64, + partition_columns: Vec, + checkpoint_files: Vec, + commit_files: Vec, + ) -> Self { + let schema = Self::output_schema(&checkpoint.schema()); + let output_partitions = checkpoint + 
.output_partitioning() + .partition_count() + .max(commits.output_partitioning().partition_count()) + .max(1); + let cache = Arc::new(PlanProperties::new( + EquivalenceProperties::new(schema), + Partitioning::UnknownPartitioning(output_partitions), + EmissionType::Final, + Boundedness::Bounded, + )); + Self { + mode: ReplayMode::Hash { + checkpoint, + commits, + }, table_url, version, partition_columns, @@ -115,6 +168,64 @@ impl DeltaLogReplayExec { } } +fn required_replay_columns( + batch: &RecordBatch, +) -> Result<(Arc, Arc, Arc)> { + let replay_path = batch.column_by_name(COL_REPLAY_PATH).ok_or_else(|| { + DataFusionError::Plan(format!( + "DeltaLogReplayExec input must have Utf8 column '{COL_REPLAY_PATH}'" + )) + })?; + let replay_path = cast(replay_path.as_ref(), &DataType::Utf8) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + let replay_path = replay_path + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Plan(format!( + "DeltaLogReplayExec '{COL_REPLAY_PATH}' must be Utf8" + )) + })?; + + let is_remove = batch.column_by_name(COL_LOG_IS_REMOVE).ok_or_else(|| { + DataFusionError::Plan(format!( + "DeltaLogReplayExec input must have Boolean column '{COL_LOG_IS_REMOVE}'" + )) + })?; + let is_remove = cast(is_remove.as_ref(), &DataType::Boolean) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + let is_remove = is_remove + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Plan(format!( + "DeltaLogReplayExec '{COL_LOG_IS_REMOVE}' must be Boolean" + )) + })?; + + let log_version = batch.column_by_name(COL_LOG_VERSION).ok_or_else(|| { + DataFusionError::Plan(format!( + "DeltaLogReplayExec input must have Int64 column '{COL_LOG_VERSION}'" + )) + })?; + let log_version = cast(log_version.as_ref(), &DataType::Int64) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + let log_version = log_version + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Plan(format!( + 
"DeltaLogReplayExec '{COL_LOG_VERSION}' must be Int64" + )) + })?; + + Ok(( + Arc::new(replay_path.clone()), + Arc::new(is_remove.clone()), + Arc::new(log_version.clone()), + )) +} + struct ReplayState { input: SendableRecordBatchStream, output_schema: SchemaRef, @@ -203,54 +314,7 @@ impl ReplayState { // The planner is expected to materialize `COL_REPLAY_PATH`, `COL_LOG_IS_REMOVE`, and // `COL_LOG_VERSION` (via a projection) for distribution/sorting and tombstone logic. - let replay_path = batch.column_by_name(COL_REPLAY_PATH).ok_or_else(|| { - DataFusionError::Plan(format!( - "DeltaLogReplayExec input must have Utf8 column '{COL_REPLAY_PATH}'" - )) - })?; - let replay_path = cast(replay_path.as_ref(), &DataType::Utf8) - .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; - let replay_path = replay_path - .as_any() - .downcast_ref::() - .ok_or_else(|| { - DataFusionError::Plan(format!( - "DeltaLogReplayExec '{COL_REPLAY_PATH}' must be Utf8" - )) - })?; - - let is_remove = batch.column_by_name(COL_LOG_IS_REMOVE).ok_or_else(|| { - DataFusionError::Plan(format!( - "DeltaLogReplayExec input must have Boolean column '{COL_LOG_IS_REMOVE}'" - )) - })?; - let is_remove = cast(is_remove.as_ref(), &DataType::Boolean) - .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; - let is_remove = is_remove - .as_any() - .downcast_ref::() - .ok_or_else(|| { - DataFusionError::Plan(format!( - "DeltaLogReplayExec '{COL_LOG_IS_REMOVE}' must be Boolean" - )) - })?; - - // Require a log version column so the planner can enforce newest-first replay semantics. 
- let log_version = batch.column_by_name(COL_LOG_VERSION).ok_or_else(|| { - DataFusionError::Plan(format!( - "DeltaLogReplayExec input must have Int64 column '{COL_LOG_VERSION}'" - )) - })?; - let log_version = cast(log_version.as_ref(), &DataType::Int64) - .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; - let _log_version = log_version - .as_any() - .downcast_ref::() - .ok_or_else(|| { - DataFusionError::Plan(format!( - "DeltaLogReplayExec '{COL_LOG_VERSION}' must be Int64" - )) - })?; + let (replay_path, is_remove, _log_version) = required_replay_columns(batch)?; for row in 0..batch.num_rows() { if replay_path.is_null(row) { @@ -293,6 +357,205 @@ impl ReplayState { } } +#[derive(Debug, Clone)] +struct ReplayEntry { + log_version: i64, + is_remove: bool, + payload: Option>, +} + +enum HashReplayStage { + Build, + Probe, + Emit, + Done, +} + +struct HashReplayState { + commits: SendableRecordBatchStream, + checkpoint: SendableRecordBatchStream, + output_schema: SchemaRef, + output_col_indices: Vec, + + map: HashMap, + + // output builders for the next RecordBatch + out_col_slices: Vec>, + out_rows: usize, + + stage: HashReplayStage, + // materialized after probe finishes + emit_rows: Option>>, +} + +impl HashReplayState { + fn new( + commits: SendableRecordBatchStream, + checkpoint: SendableRecordBatchStream, + output_schema: SchemaRef, + ) -> Self { + let input_schema = checkpoint.schema(); + let mut output_col_indices = Vec::with_capacity(input_schema.fields().len()); + for (i, f) in input_schema.fields().iter().enumerate() { + if f.name() == COL_REPLAY_PATH + || f.name() == COL_LOG_IS_REMOVE + || f.name() == COL_LOG_VERSION + { + continue; + } + output_col_indices.push(i); + } + let out_cols = output_schema.fields().len(); + Self { + commits, + checkpoint, + out_col_slices: vec![Vec::new(); out_cols], + output_schema, + output_col_indices, + map: HashMap::new(), + out_rows: 0, + stage: HashReplayStage::Build, + emit_rows: None, + } + } + + fn 
take_output_batch(&mut self) -> Result { + if self.out_rows == 0 { + return internal_err!("DeltaLogReplayExec produced an empty output batch"); + } + let mut cols = Vec::with_capacity(self.out_col_slices.len()); + for slices in &mut self.out_col_slices { + if slices.is_empty() { + return internal_err!( + "DeltaLogReplayExec produced an incomplete output batch (missing column slices)" + ); + } + let parts: Vec<&dyn Array> = slices.iter().map(|a| a.as_ref()).collect(); + let col = concat(&parts).map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + slices.clear(); + cols.push(col); + } + self.out_rows = 0; + RecordBatch::try_new(Arc::clone(&self.output_schema), cols) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } + + fn push_payload_row(&mut self, row: Vec) { + for (i, col) in row.into_iter().enumerate() { + // Safety: `row` is constructed with exactly `out_col_slices.len()` columns. + if let Some(dst) = self.out_col_slices.get_mut(i) { + dst.push(col); + } + } + self.out_rows += 1; + } + + fn process_commits_batch(&mut self, batch: &RecordBatch) -> Result<()> { + if batch.num_rows() == 0 { + return Ok(()); + } + + let (replay_path, is_remove, log_version) = required_replay_columns(batch)?; + for row in 0..batch.num_rows() { + if replay_path.is_null(row) { + continue; + } + let key = replay_path.value(row).to_string(); + let removed = !is_remove.is_null(row) && is_remove.value(row); + let version = if log_version.is_null(row) { + i64::MIN + } else { + log_version.value(row) + }; + + let should_replace = match self.map.get(&key) { + None => true, + Some(existing) => { + // Compare-and-swap winner rule: + // - Higher version wins + // - At equal version, Add beats Remove (DV update pattern) + version > existing.log_version + || (version == existing.log_version && existing.is_remove && !removed) + } + }; + if !should_replace { + continue; + } + + let payload = if removed { + None + } else { + let mut out = 
Vec::with_capacity(self.output_col_indices.len()); + for idx in &self.output_col_indices { + out.push(batch.column(*idx).slice(row, 1)); + } + Some(out) + }; + + self.map.insert( + key, + ReplayEntry { + log_version: version, + is_remove: removed, + payload, + }, + ); + if self.map.len() > MAX_COMMIT_REPLAY_ENTRIES { + return Err(DataFusionError::Execution(format!( + "DeltaLogReplayExec hash replay exceeded MAX_COMMIT_REPLAY_ENTRIES={MAX_COMMIT_REPLAY_ENTRIES}" + ))); + } + } + Ok(()) + } + + fn process_checkpoint_batch(&mut self, batch: &RecordBatch) -> Result<()> { + if batch.num_rows() == 0 { + return Ok(()); + } + + let (replay_path, is_remove, _log_version) = required_replay_columns(batch)?; + for row in 0..batch.num_rows() { + if replay_path.is_null(row) { + continue; + } + if !is_remove.is_null(row) && is_remove.value(row) { + // Checkpoints can include tombstones; they are not part of the active add set. + continue; + } + + let key = replay_path.value(row).to_string(); + if self.map.contains_key(&key) { + continue; + } + + let mut out = Vec::with_capacity(self.output_col_indices.len()); + for idx in &self.output_col_indices { + out.push(batch.column(*idx).slice(row, 1)); + } + self.push_payload_row(out); + } + + Ok(()) + } + + fn finalize_emit_rows(&mut self) { + if self.emit_rows.is_some() { + return; + } + let mut rows: Vec> = Vec::new(); + for (_k, entry) in self.map.drain() { + if entry.is_remove { + continue; + } + if let Some(payload) = entry.payload { + rows.push(payload); + } + } + self.emit_rows = Some(rows.into_iter()); + } +} + #[async_trait] impl ExecutionPlan for DeltaLogReplayExec { fn name(&self) -> &'static str { @@ -303,83 +566,126 @@ impl ExecutionPlan for DeltaLogReplayExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } fn required_input_distribution(&self) -> Vec { - // Log replay is only correct if all actions for the same `path` are co-located in the - // same partition. 
We express this as a required hash distribution over the derived - // `COL_REPLAY_PATH` column, which is expected to be produced by the planner. - // - // If the column isn't present (e.g. an unexpected upstream), fall back to single - // partition to preserve correctness. - let idx = match self.input.schema().index_of(COL_REPLAY_PATH) { - Ok(i) => i, - Err(_) => return vec![Distribution::SinglePartition], + let dist_for = |plan: &Arc| -> Distribution { + let idx = match plan.schema().index_of(COL_REPLAY_PATH) { + Ok(i) => i, + Err(_) => return Distribution::SinglePartition, + }; + let expr: Arc = Arc::new( + datafusion_physical_expr::expressions::Column::new(COL_REPLAY_PATH, idx), + ); + Distribution::HashPartitioned(vec![expr]) }; - let expr: Arc = Arc::new( - datafusion_physical_expr::expressions::Column::new(COL_REPLAY_PATH, idx), - ); - vec![Distribution::HashPartitioned(vec![expr])] + + match &self.mode { + ReplayMode::Sort { input } => vec![dist_for(input)], + ReplayMode::Hash { + checkpoint, + commits, + } => { + vec![dist_for(checkpoint), dist_for(commits)] + } + } } fn required_input_ordering(&self) -> Vec> { - // The streaming replay logic relies on all rows for the same `COL_REPLAY_PATH` - // being adjacent within each partition, so we require a local ordering by - // (COL_REPLAY_PATH ASC, COL_LOG_VERSION DESC). - // TODO: Add COL_LOG_IS_REMOVE ASC as a tie-breaker so Add beats Remove within - // the same path/version (needed for DV updates: Remove(old dv) + Add(new dv)). 
- let replay_idx = match self.input.schema().index_of(COL_REPLAY_PATH) { - Ok(i) => i, - Err(_) => return vec![None], - }; - let version_idx = match self.input.schema().index_of(COL_LOG_VERSION) { - Ok(i) => i, - Err(_) => return vec![None], - }; - - let Some(ordering) = LexOrdering::new(vec![ - PhysicalSortExpr { - expr: Arc::new(Column::new(COL_REPLAY_PATH, replay_idx)), - options: SortOptions { - descending: false, - nulls_first: false, - }, - }, - PhysicalSortExpr { - expr: Arc::new(Column::new(COL_LOG_VERSION, version_idx)), - options: SortOptions { - descending: true, - nulls_first: false, - }, - }, - ]) else { - return vec![None]; - }; - - vec![Some(OrderingRequirements::from(ordering))] + match &self.mode { + ReplayMode::Hash { .. } => vec![None, None], + ReplayMode::Sort { input } => { + // The streaming replay logic relies on all rows for the same `COL_REPLAY_PATH` + // being adjacent within each partition, so we require a local ordering by: + // (COL_REPLAY_PATH ASC, COL_LOG_VERSION DESC, COL_LOG_IS_REMOVE ASC). + // + // The extra `COL_LOG_IS_REMOVE` tie-break makes Add beat Remove for the same + // path/version (DV update pattern: Remove(old dv) + Add(new dv) in one commit). 
+ let replay_idx = match input.schema().index_of(COL_REPLAY_PATH) { + Ok(i) => i, + Err(_) => return vec![None], + }; + let version_idx = match input.schema().index_of(COL_LOG_VERSION) { + Ok(i) => i, + Err(_) => return vec![None], + }; + let is_remove_idx = match input.schema().index_of(COL_LOG_IS_REMOVE) { + Ok(i) => i, + Err(_) => return vec![None], + }; + + let Some(ordering) = LexOrdering::new(vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new(COL_REPLAY_PATH, replay_idx)), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: Arc::new(Column::new(COL_LOG_VERSION, version_idx)), + options: SortOptions { + descending: true, + nulls_first: false, + }, + }, + PhysicalSortExpr { + expr: Arc::new(Column::new(COL_LOG_IS_REMOVE, is_remove_idx)), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ]) else { + return vec![None]; + }; + + vec![Some(OrderingRequirements::from(ordering))] + } + } } fn children(&self) -> Vec<&Arc> { - vec![&self.input] + match &self.mode { + ReplayMode::Sort { input } => vec![input], + ReplayMode::Hash { + checkpoint, + commits, + } => vec![checkpoint, commits], + } } fn with_new_children( self: Arc, children: Vec>, ) -> Result> { - if children.len() != 1 { - return internal_err!("DeltaLogReplayExec expects exactly one child"); + match (&self.mode, children.len()) { + (ReplayMode::Sort { .. }, 1) => Ok(Arc::new(Self::new( + Arc::clone(&children[0]), + self.table_url.clone(), + self.version, + self.partition_columns.clone(), + self.checkpoint_files.clone(), + self.commit_files.clone(), + ))), + (ReplayMode::Hash { .. }, 2) => Ok(Arc::new(Self::new_hash( + Arc::clone(&children[0]), + Arc::clone(&children[1]), + self.table_url.clone(), + self.version, + self.partition_columns.clone(), + self.checkpoint_files.clone(), + self.commit_files.clone(), + ))), + (ReplayMode::Sort { .. 
}, _) => { + internal_err!("DeltaLogReplayExec (sort) expects exactly one child") + } + (ReplayMode::Hash { .. }, _) => { + internal_err!("DeltaLogReplayExec (hash) expects exactly two children") + } } - Ok(Arc::new(Self::new( - Arc::clone(&children[0]), - self.table_url.clone(), - self.version, - self.partition_columns.clone(), - self.checkpoint_files.clone(), - self.commit_files.clone(), - ))) } fn execute( @@ -387,43 +693,131 @@ impl ExecutionPlan for DeltaLogReplayExec { partition: usize, context: Arc, ) -> Result { - let input_stream = self.input.execute(partition, context)?; let output_schema = self.schema(); - let partition_columns = self.partition_columns.clone(); - let state = ReplayState::new(input_stream, Arc::clone(&output_schema), partition_columns); + match &self.mode { + ReplayMode::Sort { input } => { + let input_stream = input.execute(partition, context)?; + let partition_columns = self.partition_columns.clone(); + let state = + ReplayState::new(input_stream, Arc::clone(&output_schema), partition_columns); + + let s = stream::try_unfold(state, |mut st| async move { + loop { + if st.out_rows >= OUTPUT_BATCH_ROWS { + let out = st.take_output_batch()?; + return Ok(Some((out, st))); + } + + if st.finished { + // Final flush. + st.flush_current_group(); + if st.out_rows > 0 { + let out = st.take_output_batch()?; + return Ok(Some((out, st))); + } + return Ok(None); + } + + match st.input.try_next().await? 
{ + Some(batch) => { + st.process_batch(&batch)?; + continue; + } + None => { + st.finished = true; + continue; + } + } + } + }); - let s = stream::try_unfold(state, |mut st| async move { - loop { - if st.out_rows >= OUTPUT_BATCH_ROWS { - let out = st.take_output_batch()?; - return Ok(Some((out, st))); - } + Ok(Box::pin(RecordBatchStreamAdapter::new(output_schema, s))) + } + ReplayMode::Hash { + checkpoint, + commits, + } => { + let commits_stream = commits.execute(partition, Arc::clone(&context))?; + let checkpoint_stream = checkpoint.execute(partition, context)?; + + let state = HashReplayState::new( + commits_stream, + checkpoint_stream, + Arc::clone(&output_schema), + ); - if st.finished { - // Final flush. - st.flush_current_group(); - if st.out_rows > 0 { - let out = st.take_output_batch()?; - return Ok(Some((out, st))); + let s = stream::try_unfold(state, |mut st| async move { + loop { + if st.out_rows >= OUTPUT_BATCH_ROWS { + let out = st.take_output_batch()?; + return Ok(Some((out, st))); + } + + match st.stage { + HashReplayStage::Build => match st.commits.try_next().await? { + Some(batch) => { + st.process_commits_batch(&batch)?; + continue; + } + None => { + st.stage = HashReplayStage::Probe; + continue; + } + }, + HashReplayStage::Probe => match st.checkpoint.try_next().await? 
{ + Some(batch) => { + st.process_checkpoint_batch(&batch)?; + continue; + } + None => { + st.stage = HashReplayStage::Emit; + continue; + } + }, + HashReplayStage::Emit => { + st.finalize_emit_rows(); + let mut iter = match st.emit_rows.take() { + Some(it) => it, + None => { + st.stage = HashReplayStage::Done; + continue; + } + }; + + while st.out_rows < OUTPUT_BATCH_ROWS { + match iter.next() { + Some(row) => { + st.push_payload_row(row); + continue; + } + None => break, + } + } + + let is_exhausted = iter.as_slice().is_empty(); + st.emit_rows = Some(iter); + if is_exhausted { + st.stage = HashReplayStage::Done; + continue; + } + // If we produced rows, let the outer loop flush as needed. + continue; + } + HashReplayStage::Done => { + if st.out_rows > 0 { + let out = st.take_output_batch()?; + return Ok(Some((out, st))); + } + return Ok(None); + } + } } - return Ok(None); - } + }); - match st.input.try_next().await? { - Some(batch) => { - st.process_batch(&batch)?; - continue; - } - None => { - st.finished = true; - continue; - } - } + Ok(Box::pin(RecordBatchStreamAdapter::new(output_schema, s))) } - }); - - Ok(Box::pin(RecordBatchStreamAdapter::new(output_schema, s))) + } } } @@ -441,6 +835,10 @@ impl DisplayAs for DeltaLogReplayExec { writeln!(f, "format: delta")?; writeln!(f, "table_path={}", self.table_url)?; writeln!(f, "version={}", self.version)?; + match &self.mode { + ReplayMode::Sort { .. } => writeln!(f, "mode=sort")?, + ReplayMode::Hash { .. 
} => writeln!(f, "mode=hash")?, + } if !self.checkpoint_files.is_empty() { writeln!(f, "checkpoint_files=[{}]", self.checkpoint_files.join(", "))?; } @@ -460,22 +858,23 @@ mod tests { use futures::TryStreamExt; use super::*; + use crate::spec::fields::{FIELD_NAME_MODIFICATION_TIME, FIELD_NAME_PATH, FIELD_NAME_SIZE}; #[derive(Debug)] struct OneBatchExec { batch: RecordBatch, - cache: PlanProperties, + cache: Arc, } impl OneBatchExec { fn new(batch: RecordBatch) -> Self { let schema = batch.schema(); - let cache = PlanProperties::new( + let cache = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(1), EmissionType::Final, Boundedness::Bounded, - ); + )); Self { batch, cache } } } @@ -501,7 +900,7 @@ mod tests { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } @@ -555,9 +954,13 @@ mod tests { // Row2: add b let add_fields: Fields = vec![ - Arc::new(Field::new("path", DataType::Utf8, true)), - Arc::new(Field::new("size", DataType::Int64, true)), - Arc::new(Field::new("modificationTime", DataType::Int64, true)), + Arc::new(Field::new(FIELD_NAME_PATH, DataType::Utf8, true)), + Arc::new(Field::new(FIELD_NAME_SIZE, DataType::Int64, true)), + Arc::new(Field::new( + FIELD_NAME_MODIFICATION_TIME, + DataType::Int64, + true, + )), ] .into(); @@ -614,7 +1017,7 @@ mod tests { .unwrap(); #[expect(clippy::unwrap_used)] let path_col = add - .column_by_name("path") + .column_by_name(FIELD_NAME_PATH) .unwrap() .as_any() .downcast_ref::() @@ -631,9 +1034,13 @@ mod tests { // Row1: remove a (v1) -> must be ignored (older) let add_fields: Fields = vec![ - Arc::new(Field::new("path", DataType::Utf8, true)), - Arc::new(Field::new("size", DataType::Int64, true)), - Arc::new(Field::new("modificationTime", DataType::Int64, true)), + Arc::new(Field::new(FIELD_NAME_PATH, DataType::Utf8, true)), + Arc::new(Field::new(FIELD_NAME_SIZE, DataType::Int64, true)), + Arc::new(Field::new( + 
FIELD_NAME_MODIFICATION_TIME, + DataType::Int64, + true, + )), ] .into(); @@ -689,7 +1096,7 @@ mod tests { .unwrap(); #[expect(clippy::unwrap_used)] let path_col = add - .column_by_name("path") + .column_by_name(FIELD_NAME_PATH) .unwrap() .as_any() .downcast_ref::() @@ -698,4 +1105,523 @@ mod tests { assert_eq!(path_col.value(0), "a"); Ok(()) } + + #[tokio::test] + async fn hash_replay_checkpoint_hit_is_dropped() -> Result<()> { + // Checkpoint: add a, add b (v0) + // Commits: remove a (v1) -> should hide checkpoint a + + let add_fields: Fields = vec![ + Arc::new(Field::new(FIELD_NAME_PATH, DataType::Utf8, true)), + Arc::new(Field::new(FIELD_NAME_SIZE, DataType::Int64, true)), + ] + .into(); + + let cp_add_path = Arc::new(StringArray::from(vec![Some("a"), Some("b")])) as ArrayRef; + let cp_add_size = Arc::new(Int64Array::from(vec![Some(1), Some(2)])) as ArrayRef; + let cp_add_struct = struct_array_with_validity( + add_fields.clone(), + vec![cp_add_path, cp_add_size], + vec![true, true], + ); + + let schema = Arc::new(Schema::new(vec![ + Field::new("add", cp_add_struct.data_type().clone(), true), + Field::new(COL_REPLAY_PATH, DataType::Utf8, false), + Field::new(COL_LOG_IS_REMOVE, DataType::Boolean, true), + Field::new(COL_LOG_VERSION, DataType::Int64, false), + ])); + + let checkpoint_batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(cp_add_struct) as ArrayRef, + Arc::new(StringArray::from(vec![Some("a"), Some("b")])) as ArrayRef, + Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef, + Arc::new(Int64Array::from(vec![0, 0])) as ArrayRef, + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + let commit_add_struct = struct_array_with_validity( + add_fields, + vec![ + Arc::new(StringArray::from(vec![Option::<&str>::None])) as ArrayRef, + Arc::new(Int64Array::from(vec![None])) as ArrayRef, + ], + vec![false], + ); + let commits_batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(commit_add_struct) as ArrayRef, + 
Arc::new(StringArray::from(vec![Some("a")])) as ArrayRef, + Arc::new(BooleanArray::from(vec![true])) as ArrayRef, + Arc::new(Int64Array::from(vec![1])) as ArrayRef, + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + let checkpoint_plan: Arc = Arc::new(OneBatchExec::new(checkpoint_batch)); + let commits_plan: Arc = Arc::new(OneBatchExec::new(commits_batch)); + + let exec = Arc::new(DeltaLogReplayExec::new_hash( + checkpoint_plan, + commits_plan, + #[expect(clippy::unwrap_used)] + Url::parse("file:///tmp/delta").unwrap(), + 0, + vec![], + vec![], + vec![], + )); + + let ctx = Arc::new(TaskContext::default()); + let mut stream = exec.execute(0, ctx)?; + #[expect(clippy::unwrap_used)] + let out = stream.try_next().await?.unwrap(); + #[expect(clippy::unwrap_used)] + let add = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + #[expect(clippy::unwrap_used)] + let path_col = add + .column_by_name(FIELD_NAME_PATH) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(out.num_rows(), 1); + assert_eq!(path_col.value(0), "b"); + Ok(()) + } + + #[tokio::test] + async fn hash_replay_checkpoint_miss_is_passed_through() -> Result<()> { + // Checkpoint: add a + // Commits: empty + + let add_fields: Fields = + vec![Arc::new(Field::new(FIELD_NAME_PATH, DataType::Utf8, true))].into(); + let add_struct = struct_array_with_validity( + add_fields, + vec![Arc::new(StringArray::from(vec![Some("a")])) as ArrayRef], + vec![true], + ); + + let schema = Arc::new(Schema::new(vec![ + Field::new("add", add_struct.data_type().clone(), true), + Field::new(COL_REPLAY_PATH, DataType::Utf8, false), + Field::new(COL_LOG_IS_REMOVE, DataType::Boolean, true), + Field::new(COL_LOG_VERSION, DataType::Int64, false), + ])); + + let checkpoint_batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(add_struct) as ArrayRef, + Arc::new(StringArray::from(vec![Some("a")])) as ArrayRef, + Arc::new(BooleanArray::from(vec![false])) as ArrayRef, + 
Arc::new(Int64Array::from(vec![0])) as ArrayRef, + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + let empty_commits_batch = RecordBatch::new_empty(schema); + let checkpoint_plan: Arc = Arc::new(OneBatchExec::new(checkpoint_batch)); + let commits_plan: Arc = Arc::new(OneBatchExec::new(empty_commits_batch)); + + let exec = Arc::new(DeltaLogReplayExec::new_hash( + checkpoint_plan, + commits_plan, + #[expect(clippy::unwrap_used)] + Url::parse("file:///tmp/delta").unwrap(), + 0, + vec![], + vec![], + vec![], + )); + + let ctx = Arc::new(TaskContext::default()); + let mut stream = exec.execute(0, ctx)?; + #[expect(clippy::unwrap_used)] + let out = stream.try_next().await?.unwrap(); + #[expect(clippy::unwrap_used)] + let add = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + #[expect(clippy::unwrap_used)] + let path_col = add + .column_by_name(FIELD_NAME_PATH) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(out.num_rows(), 1); + assert_eq!(path_col.value(0), "a"); + Ok(()) + } + + #[tokio::test] + async fn hash_replay_emits_commit_only_adds_after_checkpoint() -> Result<()> { + // Checkpoint: add a + // Commits: add c (newer) + + let add_fields: Fields = + vec![Arc::new(Field::new(FIELD_NAME_PATH, DataType::Utf8, true))].into(); + let schema = Arc::new(Schema::new(vec![ + Field::new("add", DataType::Struct(add_fields.clone()), true), + Field::new(COL_REPLAY_PATH, DataType::Utf8, false), + Field::new(COL_LOG_IS_REMOVE, DataType::Boolean, true), + Field::new(COL_LOG_VERSION, DataType::Int64, false), + ])); + + let cp_add_struct = struct_array_with_validity( + add_fields.clone(), + vec![Arc::new(StringArray::from(vec![Some("a")])) as ArrayRef], + vec![true], + ); + let checkpoint_batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(cp_add_struct) as ArrayRef, + Arc::new(StringArray::from(vec![Some("a")])) as ArrayRef, + Arc::new(BooleanArray::from(vec![false])) as ArrayRef, + 
Arc::new(Int64Array::from(vec![0])) as ArrayRef, + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + let commit_add_struct = struct_array_with_validity( + add_fields, + vec![Arc::new(StringArray::from(vec![Some("c")])) as ArrayRef], + vec![true], + ); + let commits_batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(commit_add_struct) as ArrayRef, + Arc::new(StringArray::from(vec![Some("c")])) as ArrayRef, + Arc::new(BooleanArray::from(vec![false])) as ArrayRef, + Arc::new(Int64Array::from(vec![1])) as ArrayRef, + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + let checkpoint_plan: Arc = Arc::new(OneBatchExec::new(checkpoint_batch)); + let commits_plan: Arc = Arc::new(OneBatchExec::new(commits_batch)); + + let exec = Arc::new(DeltaLogReplayExec::new_hash( + checkpoint_plan, + commits_plan, + #[expect(clippy::unwrap_used)] + Url::parse("file:///tmp/delta").unwrap(), + 0, + vec![], + vec![], + vec![], + )); + + let ctx = Arc::new(TaskContext::default()); + let mut stream = exec.execute(0, ctx)?; + #[expect(clippy::unwrap_used)] + let out = stream.try_next().await?.unwrap(); + assert_eq!(out.num_rows(), 2); + + #[expect(clippy::unwrap_used)] + let add = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + #[expect(clippy::unwrap_used)] + let path_col = add + .column_by_name(FIELD_NAME_PATH) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let mut got = vec![path_col.value(0).to_string(), path_col.value(1).to_string()]; + got.sort(); + assert_eq!(got, vec!["a".to_string(), "c".to_string()]); + Ok(()) + } + + #[tokio::test] + async fn hash_replay_tie_break_add_beats_remove_same_version() -> Result<()> { + // Commits (same path/version), intentionally unordered: + // - remove a (v1) + // - add a (v1) + // Add should still win even without commit-side ordering. 
+ + let add_fields: Fields = + vec![Arc::new(Field::new(FIELD_NAME_PATH, DataType::Utf8, true))].into(); + let schema = Arc::new(Schema::new(vec![ + Field::new("add", DataType::Struct(add_fields.clone()), true), + Field::new(COL_REPLAY_PATH, DataType::Utf8, false), + Field::new(COL_LOG_IS_REMOVE, DataType::Boolean, true), + Field::new(COL_LOG_VERSION, DataType::Int64, false), + ])); + + let checkpoint_batch = RecordBatch::new_empty(schema.clone()); + + let add_struct = struct_array_with_validity( + add_fields.clone(), + vec![Arc::new(StringArray::from(vec![None, Some("a")])) as ArrayRef], + vec![false, true], + ); + let commits_batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(add_struct) as ArrayRef, + Arc::new(StringArray::from(vec![Some("a"), Some("a")])) as ArrayRef, + Arc::new(BooleanArray::from(vec![true, false])) as ArrayRef, + Arc::new(Int64Array::from(vec![1, 1])) as ArrayRef, + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + let checkpoint_plan: Arc = Arc::new(OneBatchExec::new(checkpoint_batch)); + let commits_plan: Arc = Arc::new(OneBatchExec::new(commits_batch)); + + let exec = Arc::new(DeltaLogReplayExec::new_hash( + checkpoint_plan, + commits_plan, + #[expect(clippy::unwrap_used)] + Url::parse("file:///tmp/delta").unwrap(), + 0, + vec![], + vec![], + vec![], + )); + + let ctx = Arc::new(TaskContext::default()); + let mut stream = exec.execute(0, ctx)?; + #[expect(clippy::unwrap_used)] + let out = stream.try_next().await?.unwrap(); + assert_eq!(out.num_rows(), 1); + #[expect(clippy::unwrap_used)] + let add = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + #[expect(clippy::unwrap_used)] + let path_col = add + .column_by_name(FIELD_NAME_PATH) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(path_col.value(0), "a"); + Ok(()) + } + + #[tokio::test] + #[expect(clippy::unwrap_used)] + async fn hash_replay_unordered_same_version_remove_then_add_add_wins() -> Result<()> { + let 
add_fields: Fields = + vec![Arc::new(Field::new(FIELD_NAME_PATH, DataType::Utf8, true))].into(); + let schema = Arc::new(Schema::new(vec![ + Field::new("add", DataType::Struct(add_fields.clone()), true), + Field::new(COL_REPLAY_PATH, DataType::Utf8, false), + Field::new(COL_LOG_IS_REMOVE, DataType::Boolean, true), + Field::new(COL_LOG_VERSION, DataType::Int64, false), + ])); + + let checkpoint_batch = RecordBatch::new_empty(schema.clone()); + let commits_add = struct_array_with_validity( + add_fields, + vec![Arc::new(StringArray::from(vec![None, Some("a")])) as ArrayRef], + vec![false, true], + ); + let commits_batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(commits_add) as ArrayRef, + Arc::new(StringArray::from(vec![Some("a"), Some("a")])) as ArrayRef, + Arc::new(BooleanArray::from(vec![true, false])) as ArrayRef, + Arc::new(Int64Array::from(vec![1, 1])) as ArrayRef, + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + let checkpoint_plan: Arc = Arc::new(OneBatchExec::new(checkpoint_batch)); + let commits_plan: Arc = Arc::new(OneBatchExec::new(commits_batch)); + + let exec = Arc::new(DeltaLogReplayExec::new_hash( + checkpoint_plan, + commits_plan, + Url::parse("file:///tmp/delta").unwrap(), + 0, + vec![], + vec![], + vec![], + )); + + let ctx = Arc::new(TaskContext::default()); + let mut stream = exec.execute(0, ctx)?; + let out = stream.try_next().await?.unwrap(); + assert_eq!(out.num_rows(), 1); + let add = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let path_col = add + .column_by_name(FIELD_NAME_PATH) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(path_col.value(0), "a"); + Ok(()) + } + + #[tokio::test] + #[expect(clippy::unwrap_used)] + async fn hash_replay_unordered_higher_version_overrides_lower() -> Result<()> { + let add_fields: Fields = + vec![Arc::new(Field::new(FIELD_NAME_PATH, DataType::Utf8, true))].into(); + let schema = Arc::new(Schema::new(vec![ + Field::new("add", 
DataType::Struct(add_fields.clone()), true), + Field::new(COL_REPLAY_PATH, DataType::Utf8, false), + Field::new(COL_LOG_IS_REMOVE, DataType::Boolean, true), + Field::new(COL_LOG_VERSION, DataType::Int64, false), + ])); + + let checkpoint_batch = RecordBatch::new_empty(schema.clone()); + let commits_add = struct_array_with_validity( + add_fields, + vec![Arc::new(StringArray::from(vec![Some("a"), None])) as ArrayRef], + vec![true, false], + ); + let commits_batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(commits_add) as ArrayRef, + Arc::new(StringArray::from(vec![Some("a"), Some("a")])) as ArrayRef, + Arc::new(BooleanArray::from(vec![false, true])) as ArrayRef, + Arc::new(Int64Array::from(vec![1, 2])) as ArrayRef, + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + let checkpoint_plan: Arc = Arc::new(OneBatchExec::new(checkpoint_batch)); + let commits_plan: Arc = Arc::new(OneBatchExec::new(commits_batch)); + + let exec = Arc::new(DeltaLogReplayExec::new_hash( + checkpoint_plan, + commits_plan, + Url::parse("file:///tmp/delta").unwrap(), + 0, + vec![], + vec![], + vec![], + )); + + let ctx = Arc::new(TaskContext::default()); + let mut stream = exec.execute(0, ctx)?; + assert!(stream.try_next().await?.is_none()); + Ok(()) + } + + #[tokio::test] + #[expect(clippy::unwrap_used)] + async fn hash_replay_unordered_lower_version_cannot_override() -> Result<()> { + let add_fields: Fields = + vec![Arc::new(Field::new(FIELD_NAME_PATH, DataType::Utf8, true))].into(); + let schema = Arc::new(Schema::new(vec![ + Field::new("add", DataType::Struct(add_fields.clone()), true), + Field::new(COL_REPLAY_PATH, DataType::Utf8, false), + Field::new(COL_LOG_IS_REMOVE, DataType::Boolean, true), + Field::new(COL_LOG_VERSION, DataType::Int64, false), + ])); + + let checkpoint_batch = RecordBatch::new_empty(schema.clone()); + let commits_add = struct_array_with_validity( + add_fields, + vec![Arc::new(StringArray::from(vec![Some("a"), None])) as ArrayRef], + 
vec![true, false], + ); + let commits_batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(commits_add) as ArrayRef, + Arc::new(StringArray::from(vec![Some("a"), Some("a")])) as ArrayRef, + Arc::new(BooleanArray::from(vec![false, true])) as ArrayRef, + Arc::new(Int64Array::from(vec![2, 1])) as ArrayRef, + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + let checkpoint_plan: Arc = Arc::new(OneBatchExec::new(checkpoint_batch)); + let commits_plan: Arc = Arc::new(OneBatchExec::new(commits_batch)); + + let exec = Arc::new(DeltaLogReplayExec::new_hash( + checkpoint_plan, + commits_plan, + Url::parse("file:///tmp/delta").unwrap(), + 0, + vec![], + vec![], + vec![], + )); + + let ctx = Arc::new(TaskContext::default()); + let mut stream = exec.execute(0, ctx)?; + let out = stream.try_next().await?.unwrap(); + assert_eq!(out.num_rows(), 1); + let add = out + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let path_col = add + .column_by_name(FIELD_NAME_PATH) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(path_col.value(0), "a"); + Ok(()) + } + + #[tokio::test] + #[expect(clippy::unwrap_used)] + async fn hash_replay_required_input_ordering_is_none_for_hash_mode() -> Result<()> { + let add_fields: Fields = + vec![Arc::new(Field::new(FIELD_NAME_PATH, DataType::Utf8, true))].into(); + let schema = Arc::new(Schema::new(vec![ + Field::new("add", DataType::Struct(add_fields), true), + Field::new(COL_REPLAY_PATH, DataType::Utf8, false), + Field::new(COL_LOG_IS_REMOVE, DataType::Boolean, true), + Field::new(COL_LOG_VERSION, DataType::Int64, false), + ])); + + let checkpoint_plan: Arc = + Arc::new(OneBatchExec::new(RecordBatch::new_empty(schema.clone()))); + let commits_plan: Arc = + Arc::new(OneBatchExec::new(RecordBatch::new_empty(schema))); + + let exec = Arc::new(DeltaLogReplayExec::new_hash( + checkpoint_plan, + commits_plan, + Url::parse("file:///tmp/delta").unwrap(), + 0, + vec![], + vec![], + vec![], + )); + + 
let ordering = exec.required_input_ordering(); + assert_eq!(ordering.len(), 2); + assert!(ordering[0].is_none()); + assert!(ordering[1].is_none()); + Ok(()) + } } diff --git a/crates/sail-delta-lake/src/physical_plan/meta_adds.rs b/crates/sail-delta-lake/src/physical_plan/meta_adds.rs index 74212455df..54a9eb5fdd 100644 --- a/crates/sail-delta-lake/src/physical_plan/meta_adds.rs +++ b/crates/sail-delta-lake/src/physical_plan/meta_adds.rs @@ -5,20 +5,23 @@ use datafusion::arrow::array::{Array, ArrayRef, Int64Array, RecordBatch, StringA use datafusion::arrow::compute::cast; use datafusion::arrow::datatypes::{DataType, SchemaRef}; use datafusion_common::{DataFusionError, Result}; +use percent_encoding::percent_decode_str; use crate::datasource::{COMMIT_TIMESTAMP_COLUMN, COMMIT_VERSION_COLUMN, PATH_COLUMN}; -use crate::kernel::models::Add; +use crate::spec::fields::FIELD_NAME_STATS_PARSED; +use crate::spec::Add; const COL_SIZE_BYTES: &str = "size_bytes"; const COL_MODIFICATION_TIME: &str = "modification_time"; const COL_STATS_JSON: &str = "stats_json"; const COL_PARTITION_SCAN: &str = "partition_scan"; -const RESERVED_META_COLUMNS: [&str; 7] = [ +const RESERVED_META_COLUMNS: [&str; 8] = [ PATH_COLUMN, COL_SIZE_BYTES, COL_MODIFICATION_TIME, COL_STATS_JSON, + FIELD_NAME_STATS_PARSED, COL_PARTITION_SCAN, COMMIT_VERSION_COLUMN, COMMIT_TIMESTAMP_COLUMN, @@ -99,7 +102,11 @@ pub fn decode_adds_from_meta_batch( ))); } - let path = path_arr.value(row).to_string(); + let raw_path = path_arr.value(row); + let path = percent_decode_str(raw_path) + .decode_utf8() + .map_err(|e| DataFusionError::External(Box::new(e)))? 
+ .to_string(); let size = size_arr .map(|a| if a.is_null(row) { 0 } else { a.value(row) }) diff --git a/crates/sail-delta-lake/src/physical_plan/metadata_stats_exec.rs b/crates/sail-delta-lake/src/physical_plan/metadata_stats_exec.rs new file mode 100644 index 0000000000..80e57327cf --- /dev/null +++ b/crates/sail-delta-lake/src/physical_plan/metadata_stats_exec.rs @@ -0,0 +1,357 @@ +use std::any::Any; +use std::fmt; +use std::io::Cursor; +use std::sync::Arc; + +use async_trait::async_trait; +use datafusion::arrow::array::{new_null_array, ArrayRef, StringArray, StructArray}; +use datafusion::arrow::compute::cast; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use datafusion::arrow::json::ReaderBuilder as JsonReaderBuilder; +use datafusion::arrow::record_batch::RecordBatch; +use datafusion::execution::context::TaskContext; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; +use datafusion_common::{internal_err, DataFusionError, Result}; +use datafusion_physical_expr::{Distribution, EquivalenceProperties}; +use futures::TryStreamExt; + +use crate::spec::fields::{FIELD_NAME_STATS_PARSED, STATS_FIELD_MIN_VALUES}; + +/// The column name used by the replay pipeline for the raw JSON stats string. 
+const REPLAY_STATS_JSON_COLUMN: &str = "stats_json"; + +#[derive(Debug)] +pub struct DeltaMetadataStatsExec { + input: Arc, + stats_schema: SchemaRef, + output_schema: SchemaRef, + cache: Arc, +} + +impl DeltaMetadataStatsExec { + pub fn new(input: Arc, stats_schema: SchemaRef) -> Self { + let mut fields = input.schema().fields().to_vec(); + fields.push(Arc::new(Field::new( + FIELD_NAME_STATS_PARSED, + DataType::Struct(stats_schema.fields().clone()), + true, + ))); + let output_schema = Arc::new(Schema::new(fields)); + let cache = Arc::new(PlanProperties::new( + EquivalenceProperties::new(output_schema.clone()), + input.output_partitioning().clone(), + EmissionType::Final, + Boundedness::Bounded, + )); + Self { + input, + stats_schema, + output_schema, + cache, + } + } + + pub fn input(&self) -> &Arc { + &self.input + } + + pub fn stats_schema(&self) -> &SchemaRef { + &self.stats_schema + } + + /// Returns the data column names tracked in the stats schema. + /// These are extracted from the `minValues` sub-struct (if present), + /// which lists all non-partition columns for which stats are collected. + fn tracked_column_names(&self) -> Vec<&str> { + self.stats_schema + .field_with_name(STATS_FIELD_MIN_VALUES) + .ok() + .and_then(|f| { + if let DataType::Struct(fields) = f.data_type() { + Some(fields.iter().map(|f| f.name().as_str()).collect()) + } else { + None + } + }) + .unwrap_or_default() + } + + fn parse_stats_array(&self, batch: &RecordBatch) -> Result { + // Priority 1: if the batch already has a typed `stats_parsed` struct column + // (e.g. read from a checkpoint that persists stats in struct form), use it directly. + if let Some(existing) = batch.column_by_name(FIELD_NAME_STATS_PARSED) { + if matches!(existing.data_type(), DataType::Struct(_)) { + return Ok(Arc::clone(existing)); + } + } + + // Priority 2: parse from the replay pipeline's `stats_json` column. 
+ let Some(stats_json_col) = batch.column_by_name(REPLAY_STATS_JSON_COLUMN) else { + return Ok(new_null_array( + &DataType::Struct(self.stats_schema.fields().clone()), + batch.num_rows(), + )); + }; + + let stats_json_col = cast(stats_json_col, &DataType::Utf8) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + let stats_json = stats_json_col + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal( + "metadata stats exec expects Utf8-compatible stats_json column".to_string(), + ) + })?; + + let estimated_json_bytes = stats_json + .iter() + .map(|value| value.map_or(2, str::len) + 1) + .sum(); + let mut json_lines = String::with_capacity(estimated_json_bytes); + for value in stats_json.iter() { + if let Some(value) = value { + json_lines.push_str(value); + } else { + json_lines.push_str("{}"); + } + json_lines.push('\n'); + } + + let mut reader = JsonReaderBuilder::new(Arc::clone(&self.stats_schema)) + .with_batch_size(batch.num_rows().max(1)) + .build(Cursor::new(json_lines)) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let parsed_batch = match reader.next() { + Some(batch) => batch.map_err(|e| DataFusionError::External(Box::new(e)))?, + None => RecordBatch::new_empty(Arc::clone(&self.stats_schema)), + }; + let stats_array: Arc = Arc::new(parsed_batch.into()); + Ok(stats_array) + } +} + +#[async_trait] +impl ExecutionPlan for DeltaMetadataStatsExec { + fn name(&self) -> &'static str { + "DeltaMetadataStatsExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn properties(&self) -> &Arc { + &self.cache + } + + fn required_input_distribution(&self) -> Vec { + vec![Distribution::UnspecifiedDistribution] + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if children.len() != 1 { + return internal_err!( + "DeltaMetadataStatsExec requires exactly one child when used as a unary node" + ); + } + Ok(Arc::new(Self::new( + 
Arc::clone(&children[0]), + Arc::clone(&self.stats_schema), + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let schema = Arc::clone(&self.output_schema); + let input_stream = self.input.execute(partition, context)?; + let exec = self.clone(); + let stream_schema = Arc::clone(&schema); + + let stream = input_stream.try_filter_map(move |batch| { + let exec = exec.clone(); + let stream_schema = Arc::clone(&stream_schema); + async move { + if batch.num_rows() == 0 { + return Ok(None); + } + + let stats_array = exec.parse_stats_array(&batch)?; + let mut columns = batch.columns().to_vec(); + columns.push(stats_array); + let output = RecordBatch::try_new(Arc::clone(&stream_schema), columns) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + Ok(Some(output)) + } + }); + + Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) + } +} + +impl DisplayAs for DeltaMetadataStatsExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + let columns = self.tracked_column_names().join(", "); + match t { + DisplayFormatType::Default | DisplayFormatType::Verbose => { + write!( + f, + "DeltaMetadataStatsExec(output={FIELD_NAME_STATS_PARSED}, columns=[{columns}])" + ) + } + DisplayFormatType::TreeRender => { + write!(f, "output={FIELD_NAME_STATS_PARSED}, columns=[{columns}]") + } + } + } +} + +impl Clone for DeltaMetadataStatsExec { + fn clone(&self) -> Self { + Self { + input: Arc::clone(&self.input), + stats_schema: Arc::clone(&self.stats_schema), + output_schema: Arc::clone(&self.output_schema), + cache: self.cache.clone(), + } + } +} + +#[cfg(test)] +mod tests { + use datafusion::arrow::array::{Array, Int32Array, Int64Array}; + use datafusion::physical_plan::empty::EmptyExec; + + use super::*; + use crate::datasource::PATH_COLUMN; + use crate::spec::fields::{ + STATS_FIELD_MAX_VALUES, STATS_FIELD_MIN_VALUES, STATS_FIELD_NUM_RECORDS, + }; + + fn stats_schema() -> SchemaRef { + 
Arc::new(Schema::new(vec![ + Field::new(STATS_FIELD_NUM_RECORDS, DataType::Int64, true), + Field::new( + STATS_FIELD_MIN_VALUES, + DataType::Struct(vec![Arc::new(Field::new("value", DataType::Int32, true))].into()), + true, + ), + Field::new( + STATS_FIELD_MAX_VALUES, + DataType::Struct(vec![Arc::new(Field::new("value", DataType::Int32, true))].into()), + true, + ), + ])) + } + + #[test] + fn parses_stats_json_into_typed_struct_column() -> Result<()> { + let input_schema = Arc::new(Schema::new(vec![ + Field::new(PATH_COLUMN, DataType::Utf8, false), + Field::new(REPLAY_STATS_JSON_COLUMN, DataType::Utf8, true), + ])); + let batch = RecordBatch::try_new( + Arc::clone(&input_schema), + vec![ + Arc::new(StringArray::from(vec![Some("file.parquet")])), + Arc::new(StringArray::from(vec![Some( + r#"{"numRecords":3,"minValues":{"value":1},"maxValues":{"value":7}}"#, + )])), + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + let exec = DeltaMetadataStatsExec::new( + Arc::new(EmptyExec::new(Arc::clone(&input_schema))), + stats_schema(), + ); + let parsed = exec.parse_stats_array(&batch)?; + let stats = parsed + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected parsed stats struct column".to_string()) + })?; + let num_records = stats + .column_by_name(STATS_FIELD_NUM_RECORDS) + .and_then(|col| col.as_any().downcast_ref::()) + .ok_or_else(|| { + DataFusionError::Internal("expected numRecords Int64 array".to_string()) + })?; + let min_values = stats + .column_by_name(STATS_FIELD_MIN_VALUES) + .and_then(|col| col.as_any().downcast_ref::()) + .ok_or_else(|| { + DataFusionError::Internal("expected minValues struct array".to_string()) + })?; + let min_value = min_values + .column_by_name("value") + .and_then(|col| col.as_any().downcast_ref::()) + .ok_or_else(|| { + DataFusionError::Internal("expected minValues.value Int32 array".to_string()) + })?; + + assert_eq!(num_records.value(0), 3); + assert_eq!(min_value.value(0), 
1); + Ok(()) + } + + #[test] + fn reuses_existing_stats_parsed_struct_without_json_parse() -> Result<()> { + let typed_stats = StructArray::from(vec![( + Arc::new(Field::new(STATS_FIELD_NUM_RECORDS, DataType::Int64, true)), + Arc::new(Int64Array::from(vec![Some(42)])) as Arc<_>, + )]); + + let input_schema = Arc::new(Schema::new(vec![ + Field::new(PATH_COLUMN, DataType::Utf8, false), + Field::new( + FIELD_NAME_STATS_PARSED, + typed_stats.data_type().clone(), + true, + ), + ])); + let batch = RecordBatch::try_new( + Arc::clone(&input_schema), + vec![ + Arc::new(StringArray::from(vec![Some("file.parquet")])), + Arc::new(typed_stats), + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + + // The existing `stats_parsed` column only has numRecords; no stats_json column exists. + let exec = DeltaMetadataStatsExec::new( + Arc::new(EmptyExec::new(Arc::clone(&input_schema))), + stats_schema(), + ); + let parsed = exec.parse_stats_array(&batch)?; + let stats = parsed + .as_any() + .downcast_ref::() + .ok_or_else(|| DataFusionError::Internal("expected struct array".to_string()))?; + let num_records = stats + .column_by_name(STATS_FIELD_NUM_RECORDS) + .and_then(|col| col.as_any().downcast_ref::()) + .ok_or_else(|| { + DataFusionError::Internal("expected numRecords Int64 array".to_string()) + })?; + assert_eq!(num_records.value(0), 42); + Ok(()) + } +} diff --git a/crates/sail-delta-lake/src/physical_plan/mod.rs b/crates/sail-delta-lake/src/physical_plan/mod.rs index 33d56f34b1..1e84a8eba4 100644 --- a/crates/sail-delta-lake/src/physical_plan/mod.rs +++ b/crates/sail-delta-lake/src/physical_plan/mod.rs @@ -32,18 +32,20 @@ pub mod discovery_exec; mod expr_adapter; mod log_replay_exec; mod meta_adds; +mod metadata_stats_exec; mod remove_actions_exec; mod scan_by_adds_exec; mod writer_exec; pub use action_schema::{ decode_actions_and_meta_from_batch, decode_adds_from_batch, delta_action_schema, - encode_actions, encode_add_actions, CommitMeta, ExecAction, COL_ACTION, +
encode_actions, ExecCommitMeta, COL_ACTION, }; pub use commit_exec::DeltaCommitExec; pub use discovery_exec::DeltaDiscoveryExec; pub use expr_adapter::{DeltaCastColumnExpr, DeltaPhysicalExprAdapterFactory}; pub use log_replay_exec::DeltaLogReplayExec; +pub use metadata_stats_exec::DeltaMetadataStatsExec; pub mod planner; pub use planner::{ plan_delete, plan_merge, plan_update, DeltaPhysicalPlanner, DeltaTableConfig, PlannerContext, diff --git a/crates/sail-delta-lake/src/physical_plan/planner/context.rs b/crates/sail-delta-lake/src/physical_plan/planner/context.rs index d7c2737eb5..bf27c4a81d 100644 --- a/crates/sail-delta-lake/src/physical_plan/planner/context.rs +++ b/crates/sail-delta-lake/src/physical_plan/planner/context.rs @@ -10,7 +10,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -use std::sync::Arc; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; use datafusion::arrow::datatypes::SchemaRef; use datafusion::catalog::Session; @@ -18,6 +19,7 @@ use datafusion::common::{DataFusionError, Result}; use object_store::ObjectStore; use url::Url; +use super::log_segment::LogSegmentFiles; use crate::kernel::DeltaTableConfig as KernelDeltaTableConfig; use crate::options::TableDeltaOptions; use crate::storage::{default_logstore, LogStoreRef, StorageConfig}; @@ -28,6 +30,7 @@ use crate::table::{open_table_with_object_store_and_table_config, DeltaTable}; pub struct DeltaTableConfig { pub table_url: Url, pub options: TableDeltaOptions, + pub metadata_configuration: HashMap, pub partition_columns: Vec, pub table_schema_for_cond: Option, pub table_exists: bool, @@ -37,6 +40,7 @@ impl DeltaTableConfig { pub fn new( table_url: Url, options: TableDeltaOptions, + metadata_configuration: HashMap, partition_columns: Vec, table_schema_for_cond: Option, table_exists: bool, @@ -44,6 +48,7 @@ impl DeltaTableConfig { Self { table_url, options, + metadata_configuration, partition_columns, table_schema_for_cond, 
table_exists, @@ -55,11 +60,18 @@ impl DeltaTableConfig { pub struct PlannerContext<'a> { session: &'a dyn Session, config: DeltaTableConfig, + // Planner-local memoization cache used to avoid repeated `_delta_log` listings when + // one planning request builds multiple log-replay branches (e.g. overwrite-if old/new). + log_segment_files_cache: Arc>>, } impl<'a> PlannerContext<'a> { pub fn new(session: &'a dyn Session, config: DeltaTableConfig) -> Self { - Self { session, config } + Self { + session, + config, + log_segment_files_cache: Arc::new(Mutex::new(HashMap::new())), + } } pub fn session(&self) -> &'a dyn Session { @@ -82,6 +94,10 @@ impl<'a> PlannerContext<'a> { &self.config.partition_columns } + pub fn metadata_configuration(&self) -> &HashMap { + &self.config.metadata_configuration + } + pub fn table_schema_for_cond(&self) -> Option { self.config.table_schema_for_cond.clone() } @@ -94,6 +110,19 @@ impl<'a> PlannerContext<'a> { self.config } + pub(crate) fn get_cached_log_segment_files(&self, version: i64) -> Option { + self.log_segment_files_cache + .lock() + .ok() + .and_then(|cache| cache.get(&version).cloned()) + } + + pub(crate) fn set_cached_log_segment_files(&self, version: i64, files: LogSegmentFiles) { + if let Ok(mut cache) = self.log_segment_files_cache.lock() { + cache.insert(version, files); + } + } + pub fn object_store(&self) -> Result> { self.session .runtime_env() @@ -105,8 +134,11 @@ impl<'a> PlannerContext<'a> { pub fn log_store(&self) -> Result { let storage_config = StorageConfig; let object_store = self.object_store()?; + let prefixed_store = storage_config + .decorate_store(Arc::clone(&object_store), &self.config.table_url) + .map_err(|e| DataFusionError::External(Box::new(e)))?; Ok(default_logstore( - Arc::clone(&object_store), + prefixed_store, object_store, &self.config.table_url, &storage_config, diff --git a/crates/sail-delta-lake/src/physical_plan/planner/log_scan.rs 
b/crates/sail-delta-lake/src/physical_plan/planner/log_scan.rs index a67e28147f..f7222c9861 100644 --- a/crates/sail-delta-lake/src/physical_plan/planner/log_scan.rs +++ b/crates/sail-delta-lake/src/physical_plan/planner/log_scan.rs @@ -1,8 +1,7 @@ -use std::sync::Arc; +use std::sync::{Arc, LazyLock}; -use datafusion::arrow::datatypes::{DataType, Field, Schema}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::common::{DataFusionError, Result, ScalarValue}; -use datafusion::datasource::file_format::json::JsonFormat; use datafusion::datasource::file_format::parquet::ParquetFormat; use datafusion::datasource::file_format::FileFormat; use datafusion::datasource::listing::PartitionedFile; @@ -14,13 +13,35 @@ use datafusion::physical_plan::union::UnionExec; use datafusion::physical_plan::ExecutionPlan; use futures::{stream, StreamExt, TryStreamExt}; use object_store::path::{Path, DELIMITER}; -use object_store::{ObjectMeta, ObjectStore}; +use object_store::{ObjectMeta, ObjectStore, ObjectStoreExt}; use super::context::PlannerContext; use crate::datasource::create_object_store_url; use crate::physical_plan::COL_LOG_VERSION; +use crate::spec::{ + add_struct_type, delta_log_file_path, metadata_struct_type, parse_version_prefix, + protocol_struct_type, remove_struct_type, transaction_struct_type, +}; -const DELTA_LOG_DIR: &str = "_delta_log"; +/// The canonical Delta log file schema with proper Map types for fields like `partitionValues`. +/// +/// JSON schema inference gives `partitionValues` as a Struct, which breaks `map_extract`. +/// By using this fixed schema for JSON-only log reads (when no parquet checkpoint exists), +/// we ensure consistent Map types regardless of whether a checkpoint is present. 
+static DELTA_LOG_FILE_SCHEMA: LazyLock = LazyLock::new(|| { + fn to_arrow(st: crate::spec::StructType) -> DataType { + #[expect(clippy::expect_used)] + DataType::try_from(&crate::spec::DataType::from(st)) + .expect("spec struct type should convert to Arrow DataType") + } + Arc::new(Schema::new(vec![ + Field::new("add", to_arrow(add_struct_type()), true), + Field::new("remove", to_arrow(remove_struct_type()), true), + Field::new("metaData", to_arrow(metadata_struct_type()), true), + Field::new("protocol", to_arrow(protocol_struct_type()), true), + Field::new("txn", to_arrow(transaction_struct_type()), true), + ])) +}); #[derive(Debug, Clone, Default)] pub struct LogScanOptions { @@ -28,32 +49,18 @@ pub struct LogScanOptions { /// /// When set, the scan will only read these columns plus any required partition columns. pub projection: Option>, - /// Optional inclusive log version range for commit JSON files. - pub commit_version_range: Option<(i64, i64)>, /// Optional pushdown predicate for checkpoint parquet scans. pub parquet_predicate: Option>, } fn parse_log_version_prefix(filename: &str) -> Option { - // Delta log files are typically named with a 20-digit version prefix: - // - commits: 00000000000000000010.json - // - checkpoints: 00000000000000000010.checkpoint.parquet - // - // For multipart checkpoints, we still take the leading version prefix. - let prefix = filename.get(0..20)?; - if !prefix.as_bytes().iter().all(|b| b.is_ascii_digit()) { - return None; - } - prefix.parse::().ok() + parse_version_prefix(filename)?.try_into().ok() } fn log_file_path(table_root_path: &str, filename: &str) -> Path { // Object store paths are absolute for local filesystem stores in our setup (DataFusion uses // `ObjectStoreUrl::local_filesystem()`). 
- Path::from(format!( - "{}{}{}{}{}", - table_root_path, DELIMITER, DELTA_LOG_DIR, DELIMITER, filename - )) + delta_log_file_path(table_root_path, filename) } async fn head_many( @@ -103,6 +110,7 @@ fn to_partitioned_files(metas: Vec) -> Result> partition_values: vec![ScalarValue::Int64(Some(ver))], range: None, statistics: None, + ordering: None, extensions: None, metadata_size_hint: None, }) @@ -139,58 +147,34 @@ fn to_file_groups(metas: Vec, target_partitions: usize) -> Result, checkpoint_files: Vec, commit_files: Vec, options: LogScanOptions, -) -> Result<(Arc, Vec, Vec)> { +) -> Result<( + Option>, + Option>, + Vec, + Vec, +)> { let store = ctx.object_store()?; let log_store = ctx.log_store()?; let object_store_url = create_object_store_url(&log_store.config().location).map_err(|e| { DataFusionError::External(Box::::from(e)) })?; - // Avoid double-counting actions that are already materialized into the checkpoint: - // only scan commit JSONs strictly newer than the latest checkpoint version. - let latest_checkpoint_version = checkpoint_files - .iter() - .filter_map(|f| parse_log_version_prefix(f)) - .max(); - let commit_files = if let Some(cp_ver) = latest_checkpoint_version { - commit_files - .into_iter() - .filter(|f| { - parse_log_version_prefix(f) - .map(|v| v > cp_ver) - .unwrap_or(true) - }) - .collect::>() - } else { - commit_files - }; - let commit_files = if let Some((start, end)) = options.commit_version_range { - commit_files - .into_iter() - .filter(|f| { - parse_log_version_prefix(f).map(|v| { - let v = i64::try_from(v).unwrap_or(i64::MAX); - v >= start && v <= end - }) == Some(true) - }) - .collect::>() - } else { - commit_files - }; - + // Commit/checkpoint lists are expected to be selected by the planner log-segment resolver. + // This builder only materializes datasource scans from those resolved filenames. 
let table_root_path = log_store.config().location.path(); let (checkpoint_metas, commit_metas) = tokio::try_join!( head_many(&store, table_root_path, &checkpoint_files), head_many(&store, table_root_path, &commit_files) )?; - // Infer schemas (best-effort). If there are no files for either side, we still build an empty - // scan of the other side. + // Infer schemas for parquet checkpoint files only. JSON commit files use the canonical + // Delta log file schema (see `DELTA_LOG_FILE_SCHEMA`) to avoid type mismatches for + // map-like fields (e.g. `add.partitionValues`). let parquet_schema = if checkpoint_metas.is_empty() { None } else { @@ -200,29 +184,22 @@ pub async fn build_delta_log_datasource_union_with_options( .await?, ) }; - let json_schema = if commit_metas.is_empty() { - None - } else { - Some( - JsonFormat::default() - .infer_schema(ctx.session(), &store, &commit_metas) - .await?, - ) - }; + let has_commit_files = !commit_metas.is_empty(); - let merged = match (parquet_schema, json_schema) { - (Some(p), Some(j)) => { - // The inferred JSON schema may disagree with the checkpoint parquet schema for - // map-like fields (e.g. `add.partitionValues`). Prefer a stable schema to avoid - // planning failures during EXPLAIN. - match Schema::try_merge(vec![p.as_ref().clone(), j.as_ref().clone()]) { - Ok(merged) => Arc::new(merged), - Err(_) => p, - } + let merged = match (parquet_schema, has_commit_files) { + (Some(p), _) => { + // When a checkpoint (parquet) file is present, prefer its schema. The parquet + // checkpoint schema has proper Map types for fields like `add.partitionValues`, + // while JSON inference yields Struct types which are incompatible with `map_extract`. + p + } + (None, true) => { + // No parquet checkpoint exists (e.g. before the first checkpoint interval fires). + // Use the canonical Delta log file schema so that `partitionValues` and other + // map-like fields have the correct Map Arrow type instead of an inferred Struct. 
+ Arc::clone(&*DELTA_LOG_FILE_SCHEMA) } - (Some(p), None) => p, - (None, Some(j)) => j, - (None, None) => { + (None, false) => { return Err(DataFusionError::Plan( "no _delta_log files found to build log scan".to_string(), )) @@ -241,19 +218,22 @@ pub async fn build_delta_log_datasource_union_with_options( indices.push(file_schema_len); continue; } - let idx = merged.index_of(col).map_err(|_| { - DataFusionError::Plan(format!( - "log scan projection column '{col}' not found in merged schema" - )) - })?; - indices.push(idx); + + match merged.index_of(col) { + Ok(idx) => indices.push(idx), + Err(_) => { + // Some Delta writers/checkpoint formats may omit an action column + // (for example, no `remove` records in a newly created table). + // Skip missing projected columns and let downstream replay logic + // treat them as absent. + } + } } Some(indices) } else { None }; - let mut inputs: Vec> = Vec::new(); let target_partitions = ctx.session().config().target_partitions(); let table_schema = TableSchema::new( Arc::clone(&merged), @@ -264,7 +244,9 @@ pub async fn build_delta_log_datasource_union_with_options( ))], ); - if !checkpoint_metas.is_empty() { + let checkpoint_scan: Option> = if checkpoint_metas.is_empty() { + None + } else { let mut source = datafusion::datasource::physical_plan::ParquetSource::new(table_schema.clone()); if let Some(predicate) = &options.parquet_predicate { @@ -276,10 +258,12 @@ pub async fn build_delta_log_datasource_union_with_options( .with_file_groups(groups) .with_projection_indices(projection_indices.clone())? 
.build(); - inputs.push(DataSourceExec::from_data_source(conf)); - } + Some(DataSourceExec::from_data_source(conf)) + }; - if !commit_metas.is_empty() { + let commit_scan: Option> = if commit_metas.is_empty() { + None + } else { let source: Arc = Arc::new( datafusion::datasource::physical_plan::JsonSource::new(table_schema), ); @@ -288,7 +272,29 @@ pub async fn build_delta_log_datasource_union_with_options( .with_file_groups(groups) .with_projection_indices(projection_indices)? .build(); - inputs.push(DataSourceExec::from_data_source(conf)); + Some(DataSourceExec::from_data_source(conf)) + }; + + Ok((checkpoint_scan, commit_scan, checkpoint_files, commit_files)) +} + +#[expect(dead_code)] +pub async fn build_delta_log_datasource_union_with_options( + ctx: &PlannerContext<'_>, + checkpoint_files: Vec, + commit_files: Vec, + options: LogScanOptions, +) -> Result<(Arc, Vec, Vec)> { + let (checkpoint_scan, commit_scan, checkpoint_files, commit_files) = + build_delta_log_datasource_scans_with_options(ctx, checkpoint_files, commit_files, options) + .await?; + + let mut inputs: Vec> = Vec::new(); + if let Some(cp) = checkpoint_scan { + inputs.push(cp); + } + if let Some(c) = commit_scan { + inputs.push(c); } Ok((UnionExec::try_new(inputs)?, checkpoint_files, commit_files)) diff --git a/crates/sail-delta-lake/src/physical_plan/planner/log_segment.rs b/crates/sail-delta-lake/src/physical_plan/planner/log_segment.rs new file mode 100644 index 0000000000..4d0a0523e1 --- /dev/null +++ b/crates/sail-delta-lake/src/physical_plan/planner/log_segment.rs @@ -0,0 +1,58 @@ +use datafusion::common::Result; + +use super::context::PlannerContext; +pub use crate::kernel::log_segment::{ + list_log_segment_files as kernel_list_log_segment_files, LogSegmentFiles, + LogSegmentResolveOptions, +}; +use crate::spec::{parse_commit_version, parse_version_prefix}; + +/// List Delta log files up to `max_version`, using the planner-local cache when available. 
+pub async fn list_log_segment_files( + ctx: &PlannerContext<'_>, + max_version: i64, +) -> Result { + if let Some(files) = ctx.get_cached_log_segment_files(max_version) { + return Ok(files); + } + let log_store = ctx.log_store()?; + let files = kernel_list_log_segment_files(&log_store, max_version) + .await + .map_err(|e| datafusion::common::DataFusionError::External(Box::new(e)))?; + ctx.set_cached_log_segment_files(max_version, files.clone()); + Ok(files) +} + +/// Resolve the minimal set of Delta log files needed to replay state up to `max_version`, +/// using the planner-local cache for the initial listing and applying `options` on top. +pub async fn resolve_log_segment_files( + ctx: &PlannerContext<'_>, + max_version: i64, + options: LogSegmentResolveOptions, +) -> Result { + // Obtain the full listing (possibly from cache), then apply resolve options. + let mut files = list_log_segment_files(ctx, max_version).await?; + + // Avoid double-counting actions already materialized into the latest checkpoint: + // only replay commit JSONs strictly newer than that checkpoint version. 
+ let latest_checkpoint_version = files + .checkpoint_files + .iter() + .filter_map(|f| parse_version_prefix(f)) + .max(); + if let Some(cp_ver) = latest_checkpoint_version { + files + .commit_files + .retain(|f| parse_commit_version(f).map(|v| v > cp_ver).unwrap_or(true)); + } + + if let Some((start, end)) = options.commit_version_range { + files.commit_files.retain(|f| { + parse_commit_version(f) + .map(|v| v >= start && v <= end) + .unwrap_or(false) + }); + } + + Ok(files) +} diff --git a/crates/sail-delta-lake/src/physical_plan/planner/metadata_predicate.rs b/crates/sail-delta-lake/src/physical_plan/planner/metadata_predicate.rs new file mode 100644 index 0000000000..d6160212a2 --- /dev/null +++ b/crates/sail-delta-lake/src/physical_plan/planner/metadata_predicate.rs @@ -0,0 +1,548 @@ +use std::collections::HashSet; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType as ArrowDataType, Schema as ArrowSchema, SchemaRef}; +use datafusion::catalog::Session; +use datafusion::common::{ + Column as LogicalColumn, DataFusionError, Result, ScalarValue, ToDFSchema, +}; +use datafusion::logical_expr::expr::{Between, BinaryExpr, Cast, InList}; +use datafusion::logical_expr::utils::{conjunction, disjunction}; +use datafusion::logical_expr::{Expr, Operator}; +use datafusion::physical_plan::filter::FilterExec; +use datafusion::physical_plan::ExecutionPlan; + +use crate::datasource::simplify_expr; +use crate::physical_plan::DeltaMetadataStatsExec; +use crate::schema::make_physical_arrow_schema; +use crate::spec::fields::{ + FIELD_NAME_STATS_PARSED, STATS_FIELD_MAX_VALUES, STATS_FIELD_MIN_VALUES, + STATS_FIELD_NULL_COUNT, STATS_FIELD_NUM_RECORDS, +}; +use crate::spec::{stats_schema, StructType}; +use crate::table::DeltaSnapshot; + +pub(crate) fn predicate_requires_stats(expr: &Expr, partition_columns: &[String]) -> bool { + let partition_columns: HashSet<&str> = partition_columns.iter().map(String::as_str).collect(); + expr.column_refs() + .iter() + .any(|col| 
!partition_columns.contains(col.name.as_str())) +} + +pub(crate) fn build_metadata_filter( + session: &dyn Session, + input: Arc, + snapshot: &DeltaSnapshot, + predicate: Expr, +) -> Result> { + let partition_columns = snapshot.metadata().partition_columns().clone(); + let needs_stats = predicate_requires_stats(&predicate, &partition_columns); + let rewritten = rewrite_predicate_for_metadata(predicate, &partition_columns); + if !needs_stats { + let df_schema = input.schema().to_dfschema()?; + let physical_expr = simplify_expr(session, &df_schema, rewritten)?; + return Ok(Arc::new(FilterExec::try_new(physical_expr, input)?)); + } + + let input: Arc = Arc::new(DeltaMetadataStatsExec::new( + input, + build_metadata_stats_schema(snapshot)?, + )); + let df_schema = input.schema().to_dfschema()?; + let physical_expr = simplify_expr(session, &df_schema, rewritten)?; + Ok(Arc::new(FilterExec::try_new(physical_expr, input)?)) +} + +pub(crate) fn build_metadata_stats_schema(snapshot: &DeltaSnapshot) -> Result { + let partition_columns = snapshot.metadata().partition_columns(); + let mode = snapshot.effective_column_mapping_mode(); + let non_partition_fields = snapshot + .schema() + .fields() + .iter() + .filter(|field| !partition_columns.contains(field.name())) + .map(|field| field.as_ref().clone()) + .collect::>(); + let logical_non_partition = ArrowSchema::new(non_partition_fields); + let physical_arrow = make_physical_arrow_schema(&logical_non_partition, mode); + let physical_kernel = StructType::try_from(&physical_arrow) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let stats_schema = stats_schema(&physical_kernel, snapshot.table_properties()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Ok(Arc::new( + ArrowSchema::try_from(&stats_schema).map_err(|e| DataFusionError::External(Box::new(e)))?, + )) +} + +fn rewrite_predicate_for_metadata(expr: Expr, partition_columns: &[String]) -> Expr { + let partition_columns = 
partition_columns.iter().cloned().collect::>(); + MetadataPredicateRewriter { partition_columns }.rewrite(expr) +} + +struct MetadataPredicateRewriter { + partition_columns: HashSet, +} + +#[derive(Clone)] +enum ExprTemplate { + Raw(LogicalColumn), + Cast { + column: LogicalColumn, + data_type: ArrowDataType, + }, +} + +impl MetadataPredicateRewriter { + fn rewrite(&self, expr: Expr) -> Expr { + match expr { + Expr::BinaryExpr(BinaryExpr { left, op, right }) => match op { + Operator::And | Operator::Or => Expr::BinaryExpr(BinaryExpr::new( + Box::new(self.rewrite(*left)), + op, + Box::new(self.rewrite(*right)), + )), + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq => self + .rewrite_comparison(*left, op, *right) + .unwrap_or_else(literal_true), + _ => literal_true(), + }, + Expr::Between(Between { + expr, + negated, + low, + high, + }) => self + .rewrite_between(*expr, negated, *low, *high) + .unwrap_or_else(literal_true), + Expr::InList(InList { + expr, + list, + negated, + }) => self + .rewrite_in_list(*expr, list, negated) + .unwrap_or_else(literal_true), + Expr::IsNull(expr) => self + .rewrite_null_check(*expr, false) + .unwrap_or_else(literal_true), + Expr::IsNotNull(expr) => self + .rewrite_null_check(*expr, true) + .unwrap_or_else(literal_true), + Expr::Not(expr) => match *expr { + Expr::IsNull(expr) => self + .rewrite_null_check(*expr, true) + .unwrap_or_else(literal_true), + Expr::IsNotNull(expr) => self + .rewrite_null_check(*expr, false) + .unwrap_or_else(literal_true), + // Any other NOT is not safe to negate over rewritten stats bounds – + // fall back to keeping all files (literal_true = no pruning). + _ => literal_true(), + }, + Expr::Alias(alias) => self.rewrite(*alias.expr), + Expr::Literal(..) 
=> expr, + _ => literal_true(), + } + } + + fn rewrite_comparison(&self, left: Expr, op: Operator, right: Expr) -> Option { + if left.column_refs().is_empty() && right.column_refs().is_empty() { + return Some(binary(left, op, right)); + } + + if let Some(template) = Self::extract_template(&left) { + if !right.column_refs().is_empty() { + return None; + } + return Some(self.rewrite_template_comparison(template, op, right)); + } + + if let Some(template) = Self::extract_template(&right) { + if !left.column_refs().is_empty() { + return None; + } + return Some(self.rewrite_template_comparison(template, reverse_comparison(op)?, left)); + } + + None + } + + fn rewrite_between(&self, expr: Expr, negated: bool, low: Expr, high: Expr) -> Option { + if !low.column_refs().is_empty() || !high.column_refs().is_empty() { + return None; + } + let template = Self::extract_template(&expr)?; + if self.is_partition_column(template.column_name()) { + return Some(Expr::Between(Between::new( + Box::new(expr), + negated, + Box::new(low), + Box::new(high), + ))); + } + + let min_expr = template.apply(self.stats_bound_expr(template.column_name(), true)); + let max_expr = template.apply(self.stats_bound_expr(template.column_name(), false)); + let missing = any_null([min_expr.clone(), max_expr.clone()]); + let actual = if negated { + disjunction(vec![ + binary(min_expr, Operator::Lt, low), + binary(max_expr, Operator::Gt, high), + ]) + .unwrap_or_else(literal_true) + } else { + conjunction(vec![ + binary(max_expr, Operator::GtEq, low), + binary(min_expr, Operator::LtEq, high), + ]) + .unwrap_or_else(literal_true) + }; + Some(or(missing, actual)) + } + + fn rewrite_in_list(&self, expr: Expr, list: Vec, negated: bool) -> Option { + let template = Self::extract_template(&expr)?; + if list.iter().any(|expr| !expr.column_refs().is_empty()) { + return None; + } + if self.is_partition_column(template.column_name()) { + return Some(Expr::InList(InList::new(Box::new(expr), list, negated))); + } + let 
mut rewritten = Vec::with_capacity(list.len()); + for value in list { + let op = if negated { + Operator::NotEq + } else { + Operator::Eq + }; + rewritten.push(self.rewrite_template_comparison(template.clone(), op, value)); + } + if negated { + Some(conjunction(rewritten).unwrap_or_else(literal_true)) + } else { + Some(disjunction(rewritten).unwrap_or_else(literal_true)) + } + } + + fn rewrite_null_check(&self, expr: Expr, is_not_null: bool) -> Option { + let template = Self::extract_template(&expr)?; + if self.is_partition_column(template.column_name()) { + return Some(if is_not_null { + Expr::IsNotNull(Box::new(expr)) + } else { + Expr::IsNull(Box::new(expr)) + }); + } + + let null_count = template.apply(self.stats_null_count_expr(template.column_name())); + if is_not_null { + let num_records = self.stats_num_records_expr(); + Some( + disjunction(vec![ + null_count.clone().is_null(), + num_records.clone().is_null(), + binary(null_count, Operator::Lt, num_records), + ]) + .unwrap_or_else(literal_true), + ) + } else { + Some( + disjunction(vec![ + null_count.clone().is_null(), + binary(null_count, Operator::Gt, literal_i64(0)), + ]) + .unwrap_or_else(literal_true), + ) + } + } + + fn rewrite_template_comparison( + &self, + template: ExprTemplate, + op: Operator, + value: Expr, + ) -> Expr { + if self.is_partition_column(template.column_name()) { + return binary( + template.apply(column_expr(template.column_name())), + op, + value, + ); + } + + let min_expr = template.apply(self.stats_bound_expr(template.column_name(), true)); + let max_expr = template.apply(self.stats_bound_expr(template.column_name(), false)); + match op { + Operator::Eq => or( + any_null([min_expr.clone(), max_expr.clone()]), + and( + binary(min_expr, Operator::LtEq, value.clone()), + binary(max_expr, Operator::GtEq, value), + ), + ), + Operator::NotEq => or( + any_null([min_expr.clone(), max_expr.clone()]), + disjunction(vec![ + binary(min_expr, Operator::Lt, value.clone()), + binary(max_expr, 
Operator::Gt, value), + ]) + .unwrap_or_else(literal_true), + ), + Operator::Lt => or( + min_expr.clone().is_null(), + binary(min_expr, Operator::Lt, value), + ), + Operator::LtEq => or( + min_expr.clone().is_null(), + binary(min_expr, Operator::LtEq, value), + ), + Operator::Gt => or( + max_expr.clone().is_null(), + binary(max_expr, Operator::Gt, value), + ), + Operator::GtEq => or( + max_expr.clone().is_null(), + binary(max_expr, Operator::GtEq, value), + ), + _ => literal_true(), + } + } + + fn is_partition_column(&self, name: &str) -> bool { + self.partition_columns.contains(name) + } + + fn stats_num_records_expr(&self) -> Expr { + get_field( + column_expr(FIELD_NAME_STATS_PARSED), + STATS_FIELD_NUM_RECORDS, + ) + } + + fn stats_null_count_expr(&self, name: &str) -> Expr { + self.stats_nested_expr(STATS_FIELD_NULL_COUNT, name) + } + + fn stats_bound_expr(&self, name: &str, is_min: bool) -> Expr { + self.stats_nested_expr( + if is_min { + STATS_FIELD_MIN_VALUES + } else { + STATS_FIELD_MAX_VALUES + }, + name, + ) + } + + fn stats_nested_expr(&self, root: &str, name: &str) -> Expr { + let mut expr = get_field(column_expr(FIELD_NAME_STATS_PARSED), root); + for segment in name.split('.') { + expr = get_field(expr, segment); + } + expr + } + + fn extract_template(expr: &Expr) -> Option { + match expr { + Expr::Column(column) => Some(ExprTemplate::Raw(column.clone())), + Expr::Cast(Cast { expr, data_type }) => match expr.as_ref() { + Expr::Column(column) => Some(ExprTemplate::Cast { + column: column.clone(), + data_type: data_type.clone(), + }), + _ => None, + }, + _ => None, + } + } +} + +impl ExprTemplate { + fn column_name(&self) -> &str { + match self { + ExprTemplate::Raw(column) | ExprTemplate::Cast { column, .. } => column.name.as_str(), + } + } + + fn apply(&self, expr: Expr) -> Expr { + match self { + ExprTemplate::Raw(_) => expr, + ExprTemplate::Cast { data_type, .. 
} => { + Expr::Cast(Cast::new(Box::new(expr), data_type.clone())) + } + } + } +} + +fn reverse_comparison(op: Operator) -> Option { + Some(match op { + Operator::Eq => Operator::Eq, + Operator::NotEq => Operator::NotEq, + Operator::Lt => Operator::Gt, + Operator::LtEq => Operator::GtEq, + Operator::Gt => Operator::Lt, + Operator::GtEq => Operator::LtEq, + _ => return None, + }) +} + +fn column_expr(name: &str) -> Expr { + Expr::Column(LogicalColumn::new_unqualified(name)) +} + +fn get_field(struct_expr: Expr, field_name: &str) -> Expr { + Expr::ScalarFunction(datafusion::logical_expr::expr::ScalarFunction::new_udf( + datafusion::functions::core::get_field(), + vec![ + struct_expr, + Expr::Literal(ScalarValue::Utf8(Some(field_name.to_string())), None), + ], + )) +} + +fn literal_true() -> Expr { + Expr::Literal(ScalarValue::Boolean(Some(true)), None) +} + +fn literal_i64(value: i64) -> Expr { + Expr::Literal(ScalarValue::Int64(Some(value)), None) +} + +fn binary(left: Expr, op: Operator, right: Expr) -> Expr { + Expr::BinaryExpr(BinaryExpr::new(Box::new(left), op, Box::new(right))) +} + +fn and(left: Expr, right: Expr) -> Expr { + binary(left, Operator::And, right) +} + +fn or(left: Expr, right: Expr) -> Expr { + binary(left, Operator::Or, right) +} + +fn any_null(exprs: impl IntoIterator) -> Expr { + let checks = exprs + .into_iter() + .map(|expr| expr.is_null()) + .collect::>(); + disjunction(checks).unwrap_or_else(literal_true) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::array::{Array, Int32Array, Int64Array, StringArray, StructArray}; + use datafusion::arrow::datatypes::Field; + use datafusion::arrow::record_batch::RecordBatch; + use datafusion::common::ToDFSchema; + use datafusion::logical_expr::Expr; + use datafusion::prelude::SessionContext; + + use super::*; + + fn metadata_batch() -> Result<(SchemaRef, RecordBatch)> { + let stats = StructArray::from(vec![ + ( + Arc::new(Field::new( + STATS_FIELD_NUM_RECORDS, + 
ArrowDataType::Int64, + true, + )), + Arc::new(Int64Array::from(vec![Some(3), Some(1), Some(1)])) as Arc<_>, + ), + ( + Arc::new(Field::new( + STATS_FIELD_MIN_VALUES, + ArrowDataType::Struct( + vec![Arc::new(Field::new("value", ArrowDataType::Int32, true))].into(), + ), + true, + )), + Arc::new(StructArray::from(vec![( + Arc::new(Field::new("value", ArrowDataType::Int32, true)), + Arc::new(Int32Array::from(vec![Some(1), Some(4), None])) as Arc<_>, + )])) as Arc<_>, + ), + ( + Arc::new(Field::new( + STATS_FIELD_MAX_VALUES, + ArrowDataType::Struct( + vec![Arc::new(Field::new("value", ArrowDataType::Int32, true))].into(), + ), + true, + )), + Arc::new(StructArray::from(vec![( + Arc::new(Field::new("value", ArrowDataType::Int32, true)), + Arc::new(Int32Array::from(vec![Some(3), Some(4), None])) as Arc<_>, + )])) as Arc<_>, + ), + ( + Arc::new(Field::new( + STATS_FIELD_NULL_COUNT, + ArrowDataType::Struct( + vec![Arc::new(Field::new("value", ArrowDataType::Int64, true))].into(), + ), + true, + )), + Arc::new(StructArray::from(vec![( + Arc::new(Field::new("value", ArrowDataType::Int64, true)), + Arc::new(Int64Array::from(vec![Some(0), Some(0), Some(1)])) as Arc<_>, + )])) as Arc<_>, + ), + ]); + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("p", ArrowDataType::Utf8, true), + Field::new(FIELD_NAME_STATS_PARSED, stats.data_type().clone(), true), + ])); + let batch = RecordBatch::try_new( + Arc::clone(&schema), + vec![ + Arc::new(StringArray::from(vec![Some("a"), Some("b"), Some("c")])), + Arc::new(stats), + ], + ) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None))?; + Ok((schema, batch)) + } + + #[test] + fn rewrites_partition_predicates_to_metadata_columns() { + let expr = binary( + Expr::Column(LogicalColumn::new_unqualified("p")), + Operator::Eq, + Expr::Literal(ScalarValue::Utf8(Some("b".to_string())), None), + ); + let rewritten = rewrite_predicate_for_metadata(expr.clone(), &["p".to_string()]); + assert_eq!(rewritten, expr); + } + + #[test] + 
fn rewritten_stats_predicate_filters_against_bounds() -> Result<()> { + let (schema, batch) = metadata_batch()?; + let expr = binary( + Expr::Column(LogicalColumn::new_unqualified("value")), + Operator::Gt, + literal_i64(3), + ); + let rewritten = rewrite_predicate_for_metadata(expr, &["p".to_string()]); + let ctx = SessionContext::new(); + let physical = simplify_expr(&ctx.state(), &schema.to_dfschema()?, rewritten)?; + let values = physical.evaluate(&batch)?.into_array(batch.num_rows())?; + let values = values + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal("expected boolean predicate output".to_string()) + })?; + + assert!(!values.value(0)); + assert!(values.value(1)); + assert!(values.value(2)); + Ok(()) + } +} diff --git a/crates/sail-delta-lake/src/physical_plan/planner/mod.rs b/crates/sail-delta-lake/src/physical_plan/planner/mod.rs index b7576b8b1d..31e08adf22 100644 --- a/crates/sail-delta-lake/src/physical_plan/planner/mod.rs +++ b/crates/sail-delta-lake/src/physical_plan/planner/mod.rs @@ -15,11 +15,12 @@ use std::sync::Arc; use datafusion::common::Result; use datafusion::physical_expr::LexRequirement; use datafusion::physical_plan::ExecutionPlan; -use sail_common_datafusion::datasource::{MergeInfo, PhysicalSinkMode}; -use sail_common_datafusion::logical_expr::ExprWithSource; +use sail_common_datafusion::datasource::PhysicalSinkMode; pub mod context; mod log_scan; +mod log_segment; +pub(crate) mod metadata_predicate; pub mod utils; mod op_delete; @@ -28,6 +29,9 @@ mod op_update; mod op_write; pub use context::{DeltaTableConfig, PlannerContext}; +pub use op_delete::build_delete_plan as plan_delete; +pub use op_merge::build_merge_plan as plan_merge; +pub use op_update::build_update_plan as plan_update; pub struct DeltaPhysicalPlanner<'a> { ctx: PlannerContext<'a>, @@ -47,24 +51,3 @@ impl<'a> DeltaPhysicalPlanner<'a> { op_write::build_write_plan(&self.ctx, input, sink_mode, sort_order).await } } - -pub async fn plan_delete( - 
ctx: &PlannerContext<'_>, - condition: ExprWithSource, -) -> Result> { - op_delete::build_delete_plan(ctx, condition).await -} - -pub async fn plan_merge( - ctx: &PlannerContext<'_>, - merge_info: MergeInfo, -) -> Result> { - op_merge::build_merge_plan(ctx, merge_info).await -} - -pub async fn plan_update( - ctx: &PlannerContext<'_>, - input: Arc, -) -> Result> { - op_update::build_update_plan(ctx, input).await -} diff --git a/crates/sail-delta-lake/src/physical_plan/planner/op_delete.rs b/crates/sail-delta-lake/src/physical_plan/planner/op_delete.rs index 3179c0b1af..0fba40106c 100644 --- a/crates/sail-delta-lake/src/physical_plan/planner/op_delete.rs +++ b/crates/sail-delta-lake/src/physical_plan/planner/op_delete.rs @@ -24,9 +24,8 @@ use sail_common_datafusion::datasource::PhysicalSinkMode; use sail_common_datafusion::logical_expr::ExprWithSource; use super::context::PlannerContext; -use super::utils::{build_log_replay_pipeline_with_options, LogReplayFilter, LogReplayOptions}; -use crate::datasource::schema::DataFusionMixins; -use crate::datasource::PredicateProperties; +use super::metadata_predicate::{build_metadata_filter, predicate_requires_stats}; +use super::utils::{build_log_replay_pipeline_with_options, LogReplayOptions}; use crate::kernel::DeltaOperation; use crate::physical_plan::{ DeltaCommitExec, DeltaDiscoveryExec, DeltaRemoveActionsExec, DeltaScanByAddsExec, @@ -44,70 +43,46 @@ pub async fn build_delete_plan( let version = snapshot_state.version(); let table_schema = snapshot_state - .snapshot() - .arrow_schema() + .input_schema() .map_err(|e| DataFusionError::External(Box::new(e)))?; let partition_columns = snapshot_state.metadata().partition_columns().clone(); let table_df_schema = table_schema .clone() .to_dfschema() .map_err(|e| DataFusionError::External(Box::new(e)))?; + let condition_expr = condition.expr.clone(); let physical_condition = ctx .session() - .create_physical_expr(condition.expr, &table_df_schema)?; + 
.create_physical_expr(condition_expr.clone(), &table_df_schema)?; // Partition-only predicates can delete entire files without scanning data. In that case, // build a visible metadata pipeline over a log-derived meta table. - let mut expr_props = PredicateProperties::new(partition_columns.clone()); - expr_props - .analyze_predicate(&physical_condition) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - let kernel_snapshot = snapshot_state.snapshot().snapshot().inner.clone(); - let log_segment = kernel_snapshot.log_segment(); - let checkpoint_files = log_segment - .checkpoint_parts - .iter() - .map(|p| p.filename.clone()) - .collect::>(); - let commit_files = log_segment - .ascending_commit_files - .iter() - .map(|p| p.filename.clone()) - .collect::>(); + let partition_only = !predicate_requires_stats(&condition_expr, &partition_columns); + let log_replay_options = LogReplayOptions { + include_stats_json: !partition_only, + ..Default::default() + }; - // Build a visible metadata pipeline over the Delta log. - let mut log_replay_options = LogReplayOptions::default(); - if expr_props.partition_only { - log_replay_options.log_filter = Some(LogReplayFilter { - predicate: physical_condition.clone(), - table_schema: table_schema.clone(), - }); - } - - let meta_scan: Arc = build_log_replay_pipeline_with_options( - ctx, - ctx.table_url().clone(), - version, - partition_columns.clone(), - checkpoint_files, - commit_files, - log_replay_options, - ) - .await?; + let meta_scan: Arc = + build_log_replay_pipeline_with_options(ctx, snapshot_state, log_replay_options).await?; + let meta_scan: Arc = + build_metadata_filter(ctx.session(), meta_scan, snapshot_state, condition_expr)?; // Always wrap with DeltaDiscoveryExec so EXPLAIN shows the metadata pipeline. 
let find_files_exec: Arc = Arc::new(DeltaDiscoveryExec::with_input( meta_scan, ctx.table_url().clone(), - Some(physical_condition.clone()), - Some(table_schema.clone()), + None, + None, version, partition_columns.clone(), - expr_props.partition_only, + partition_only, )?); // Spread Add actions across partitions so `DeltaScanByAddsExec` can scan files in parallel. + // TODO(adaptive-partitioning): Keep this aligned with `scan_planner.rs`. + // Plan: switch from fixed `target_partitions` + round-robin to size-driven partition count + // first, then size-aware distribution to avoid oversharding and worker skew. let target_partitions = ctx.session().config().target_partitions().max(1); let find_files_exec: Arc = Arc::new(RepartitionExec::try_new( find_files_exec, @@ -117,13 +92,21 @@ pub async fn build_delete_plan( let scan_exec = Arc::new(DeltaScanByAddsExec::new( Arc::clone(&find_files_exec), ctx.table_url().clone(), + version, table_schema.clone(), + table_schema.clone(), + crate::datasource::DeltaScanConfig::default(), + None, + None, + None, )); // Adapt the predicate to the scan schema. PhysicalExpr Column indices are schema-dependent, // and DeltaScanByAddsExec may reorder/augment the schema compared to the original table schema. 
let adapter_factory = Arc::new(crate::physical_plan::DeltaPhysicalExprAdapterFactory {}); - let adapter = adapter_factory.create(table_schema.clone(), scan_exec.schema()); + let adapter = adapter_factory + .create(table_schema.clone(), scan_exec.schema()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; let adapted_condition = adapter .rewrite(physical_condition.clone()) .map_err(|e| DataFusionError::External(Box::new(e)))?; @@ -138,6 +121,7 @@ pub async fn build_delete_plan( filter_exec, ctx.table_url().clone(), ctx.options().clone(), + ctx.metadata_configuration().clone(), partition_columns.clone(), PhysicalSinkMode::Append, ctx.table_exists(), diff --git a/crates/sail-delta-lake/src/physical_plan/planner/op_merge.rs b/crates/sail-delta-lake/src/physical_plan/planner/op_merge.rs index cfb70715fa..9c820ec1fe 100644 --- a/crates/sail-delta-lake/src/physical_plan/planner/op_merge.rs +++ b/crates/sail-delta-lake/src/physical_plan/planner/op_merge.rs @@ -14,6 +14,7 @@ use std::sync::Arc; use datafusion::common::{internal_err, DataFusionError, Result}; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; +use datafusion::physical_plan::execution_plan::reset_plan_states; use datafusion::physical_plan::filter::FilterExec; use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode}; use datafusion::physical_plan::projection::ProjectionExec; @@ -27,13 +28,14 @@ use sail_common_datafusion::datasource::{ use url::Url; use super::context::PlannerContext; -use super::utils::build_log_replay_pipeline; -use crate::datasource::{DataFusionMixins, PATH_COLUMN}; +use super::utils::{build_log_replay_pipeline_with_options, LogReplayOptions}; +use crate::datasource::PATH_COLUMN; use crate::kernel::{DeltaOperation, MergePredicate}; use crate::options::TableDeltaOptions; use crate::physical_plan::{ DeltaCommitExec, DeltaDiscoveryExec, DeltaRemoveActionsExec, DeltaWriterExec, }; +use crate::table::DeltaSnapshot; /// Entry point for MERGE execution. 
Expects the logical MERGE to be fully /// expanded (handled by ExpandMergeRule) and passed down as pre-expanded plans. @@ -54,24 +56,10 @@ pub async fn build_merge_plan( .clone(); let version = snapshot_state.version(); let table_schema = snapshot_state - .snapshot() - .arrow_schema() + .input_schema() .map_err(|e| DataFusionError::External(Box::new(e)))?; let partition_columns = snapshot_state.metadata().partition_columns().clone(); - let kernel_snapshot = snapshot_state.snapshot().snapshot().inner.clone(); - let log_segment = kernel_snapshot.log_segment(); - let checkpoint_files = log_segment - .checkpoint_parts - .iter() - .map(|p| p.filename.clone()) - .collect::>(); - let commit_files = log_segment - .ascending_commit_files - .iter() - .map(|p| p.filename.clone()) - .collect::>(); - let mut options = ctx.options().clone(); if merge_info.with_schema_evolution { options.merge_schema = true; @@ -111,14 +99,13 @@ pub async fn build_merge_plan( finalize_merge( ctx, expanded, + &snapshot_state, ctx.table_url().clone(), version, options, partition_columns, table_schema, merge_info.touched_file_plan.clone(), - checkpoint_files, - commit_files, merge_operation, ) .await @@ -128,14 +115,13 @@ pub async fn build_merge_plan( async fn finalize_merge( ctx: &PlannerContext<'_>, projected: Arc, + snapshot: &DeltaSnapshot, table_url: Url, version: i64, options: TableDeltaOptions, partition_columns: Vec, table_schema: datafusion::arrow::datatypes::SchemaRef, touched_file_plan: Option>, - checkpoint_files: Vec, - commit_files: Vec, operation_override: Option, ) -> Result> { let touched_plan_opt = touched_file_plan; @@ -146,6 +132,10 @@ async fn finalize_merge( // // Untouched files remain as-is (not removed, not rewritten). let writer_input: Arc = if let Some(touched_plan) = &touched_plan_opt { + // Physical plans can hold runtime state after execution. MERGE branches this subtree, + // so each consumer needs its own reset copy rather than sharing a multi-parent DAG. 
+ let projected_for_touched = reset_plan_states(Arc::clone(&projected))?; + let touched_plan_for_writer = reset_plan_states(Arc::clone(touched_plan))?; let projected_schema = projected.schema(); if projected_schema.column_with_name(PATH_COLUMN).is_none() { return internal_err!( @@ -179,8 +169,8 @@ async fn finalize_merge( .map_err(|e| DataFusionError::Plan(format!("{e}")))?; let join = Arc::new(HashJoinExec::try_new( - Arc::clone(touched_plan), - Arc::clone(&projected), + touched_plan_for_writer, + projected_for_touched, vec![( Arc::new(Column::new(PATH_COLUMN, touched_idx)), Arc::new(Column::new(PATH_COLUMN, path_idx)), @@ -190,6 +180,7 @@ async fn finalize_merge( None, PartitionMode::CollectLeft, NullEquality::NullEqualsNothing, + false, )?); // Keep only the right side columns (original writer input schema) after join. @@ -244,6 +235,7 @@ async fn finalize_merge( writer_input, table_url.clone(), options, + ctx.metadata_configuration().clone(), partition_columns.clone(), PhysicalSinkMode::Append, true, @@ -254,20 +246,15 @@ async fn finalize_merge( let mut action_inputs: Vec> = vec![writer.clone()]; if let Some(touched_plan) = &touched_plan_opt { + let touched_plan_for_remove = reset_plan_states(Arc::clone(touched_plan))?; // Build a log-side stream of active Add rows using a visible log replay pipeline: // Union(DataSourceExec parquet/json) -> DeltaLogReplayExec -> ... -> DeltaDiscoveryExec. - let meta_scan: Arc = build_log_replay_pipeline( - ctx, - table_url.clone(), - version, - partition_columns.clone(), - checkpoint_files, - commit_files, - ) - .await?; + let meta_scan: Arc = + build_log_replay_pipeline_with_options(ctx, snapshot, LogReplayOptions::default()) + .await?; // Restrict to touched file paths by joining touched_paths with the metadata stream. 
- let touched_schema = touched_plan.schema(); + let touched_schema = touched_plan_for_remove.schema(); let touched_idx = touched_schema .index_of(PATH_COLUMN) .map_err(|e| DataFusionError::Plan(format!("{e}")))?; @@ -277,7 +264,7 @@ async fn finalize_merge( .map_err(|e| DataFusionError::Plan(format!("{e}")))?; let join = Arc::new(HashJoinExec::try_new( - Arc::clone(touched_plan), + touched_plan_for_remove, meta_scan, vec![( Arc::new(Column::new(PATH_COLUMN, touched_idx)), @@ -288,6 +275,7 @@ async fn finalize_merge( None, PartitionMode::CollectLeft, NullEquality::NullEqualsNothing, + false, )?); // Keep only the right side columns (metadata stream schema). diff --git a/crates/sail-delta-lake/src/physical_plan/planner/op_write.rs b/crates/sail-delta-lake/src/physical_plan/planner/op_write.rs index 2d479c71c1..5b77b9c963 100644 --- a/crates/sail-delta-lake/src/physical_plan/planner/op_write.rs +++ b/crates/sail-delta-lake/src/physical_plan/planner/op_write.rs @@ -14,6 +14,7 @@ use std::sync::Arc; use datafusion::arrow::datatypes::SchemaRef; use datafusion::common::{DataFusionError, Result, ToDFSchema}; +use datafusion::logical_expr::Expr; use datafusion::physical_expr::expressions::NotExpr; use datafusion::physical_expr::{LexRequirement, PhysicalExpr}; use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; @@ -25,17 +26,17 @@ use sail_common_datafusion::datasource::PhysicalSinkMode; use sail_common_datafusion::logical_expr::ExprWithSource; use super::context::PlannerContext; +use super::metadata_predicate::{build_metadata_filter, predicate_requires_stats}; use super::utils::{ - align_schemas_for_union, build_log_replay_pipeline, build_log_replay_pipeline_with_options, - build_standard_write_layers, LogReplayFilter, LogReplayOptions, + align_schemas_for_union, build_log_replay_pipeline_with_options, build_standard_write_layers, + LogReplayOptions, }; -use crate::datasource::schema::DataFusionMixins; -use crate::datasource::PredicateProperties; use 
crate::kernel::{DeltaOperation, SaveMode}; use crate::physical_plan::{ create_projection, create_repartition, create_sort, DeltaCommitExec, DeltaDiscoveryExec, DeltaRemoveActionsExec, DeltaScanByAddsExec, DeltaWriterExec, }; +use crate::table::DeltaSnapshot; pub async fn build_write_plan( ctx: &PlannerContext<'_>, @@ -88,6 +89,7 @@ async fn build_full_overwrite_plan( plan, ctx.table_url().clone(), ctx.options().clone(), + ctx.metadata_configuration().clone(), ctx.partition_columns().to_vec(), PhysicalSinkMode::Overwrite, ctx.table_exists(), @@ -106,26 +108,10 @@ async fn build_full_overwrite_plan( let version = snapshot_state.version(); let partition_columns = snapshot_state.metadata().partition_columns().clone(); - let kernel_snapshot = snapshot_state.snapshot().snapshot().inner.clone(); - let log_segment = kernel_snapshot.log_segment(); - let checkpoint_files = log_segment - .checkpoint_parts - .iter() - .map(|p| p.filename.clone()) - .collect::>(); - let commit_files = log_segment - .ascending_commit_files - .iter() - .map(|p| p.filename.clone()) - .collect::>(); - - let meta_scan: Arc = build_log_replay_pipeline( + let meta_scan: Arc = build_log_replay_pipeline_with_options( ctx, - ctx.table_url().clone(), - version, - partition_columns.clone(), - checkpoint_files, - commit_files, + &snapshot_state, + LogReplayOptions::default(), ) .await?; @@ -167,23 +153,24 @@ async fn build_overwrite_if_plan( .clone(); let version = snapshot_state.version(); let table_schema = snapshot_state - .snapshot() - .arrow_schema() + .input_schema() .map_err(|e| DataFusionError::External(Box::new(e)))?; let partition_columns = snapshot_state.metadata().partition_columns().clone(); let table_df_schema = table_schema .clone() .to_dfschema() .map_err(|e| DataFusionError::External(Box::new(e)))?; + let condition_expr = condition.expr.clone(); let physical_condition = ctx .session() - .create_physical_expr(condition.expr, &table_df_schema)?; + .create_physical_expr(condition_expr.clone(), 
&table_df_schema)?; let predicate_source = source.or(condition.source); let old_data_plan = build_old_data_plan( ctx, + condition_expr.clone(), physical_condition.clone(), - version, + &snapshot_state, table_schema.clone(), ) .await?; @@ -212,6 +199,7 @@ async fn build_overwrite_if_plan( Arc::clone(&union_plan), ctx.table_url().clone(), ctx.options().clone(), + ctx.metadata_configuration().clone(), ctx.partition_columns().to_vec(), PhysicalSinkMode::OverwriteIf { condition: None, @@ -222,50 +210,28 @@ async fn build_overwrite_if_plan( operation_override, )?); - let mut expr_props = PredicateProperties::new(partition_columns.clone()); - expr_props - .analyze_predicate(&physical_condition) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - let kernel_snapshot = snapshot_state.snapshot().snapshot().inner.clone(); - let log_segment = kernel_snapshot.log_segment(); - let checkpoint_files = log_segment - .checkpoint_parts - .iter() - .map(|p| p.filename.clone()) - .collect::>(); - let commit_files = log_segment - .ascending_commit_files - .iter() - .map(|p| p.filename.clone()) - .collect::>(); - - let mut log_replay_options = LogReplayOptions::default(); - if expr_props.partition_only { - log_replay_options.log_filter = Some(LogReplayFilter { - predicate: physical_condition.clone(), - table_schema: table_schema.clone(), - }); - } - let meta_scan: Arc = build_log_replay_pipeline_with_options( - ctx, - ctx.table_url().clone(), - version, - partition_columns.clone(), - checkpoint_files, - commit_files, - log_replay_options, - ) - .await?; + let partition_only = !predicate_requires_stats(&condition_expr, &partition_columns); + let log_replay_options = LogReplayOptions { + include_stats_json: !partition_only, + ..Default::default() + }; + let meta_scan: Arc = + build_log_replay_pipeline_with_options(ctx, &snapshot_state, log_replay_options).await?; + let meta_scan: Arc = build_metadata_filter( + ctx.session(), + meta_scan, + &snapshot_state, + 
condition_expr.clone(), + )?; let find_files_plan: Arc = Arc::new(DeltaDiscoveryExec::with_input( meta_scan, ctx.table_url().clone(), - Some(physical_condition.clone()), - Some(table_schema.clone()), + None, + None, version, partition_columns.clone(), - expr_props.partition_only, + partition_only, )?); let remove_plan = Arc::new(DeltaRemoveActionsExec::new(find_files_plan)?); @@ -286,64 +252,40 @@ async fn build_overwrite_if_plan( async fn build_old_data_plan( ctx: &PlannerContext<'_>, + condition_expr: Expr, condition: Arc, - version: i64, + snapshot_state: &DeltaSnapshot, table_schema: SchemaRef, ) -> Result> { - // For partition-only predicates, the scan-by-adds stage will be a no-op (partition_scan=true), - // so build the same log-derived metadata path as the main find-files plan. - let mut expr_props = PredicateProperties::new(ctx.partition_columns().to_vec()); - expr_props - .analyze_predicate(&condition) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - let table = ctx.open_table().await?; - let snapshot_state = table - .snapshot() - .map_err(|e| DataFusionError::External(Box::new(e)))? 
- .clone(); - let kernel_snapshot = snapshot_state.snapshot().snapshot().inner.clone(); - let log_segment = kernel_snapshot.log_segment(); - let checkpoint_files = log_segment - .checkpoint_parts - .iter() - .map(|p| p.filename.clone()) - .collect::>(); - let commit_files = log_segment - .ascending_commit_files - .iter() - .map(|p| p.filename.clone()) - .collect::>(); - - let mut log_replay_options = LogReplayOptions::default(); - if expr_props.partition_only { - log_replay_options.log_filter = Some(LogReplayFilter { - predicate: condition.clone(), - table_schema: table_schema.clone(), - }); - } - let meta_scan: Arc = build_log_replay_pipeline_with_options( - ctx, - ctx.table_url().clone(), - version, - ctx.partition_columns().to_vec(), - checkpoint_files, - commit_files, - log_replay_options, - ) - .await?; + let version = snapshot_state.version(); + let partition_only = !predicate_requires_stats(&condition_expr, ctx.partition_columns()); + let log_replay_options = LogReplayOptions { + include_stats_json: !partition_only, + ..Default::default() + }; + let meta_scan: Arc = + build_log_replay_pipeline_with_options(ctx, snapshot_state, log_replay_options).await?; + let meta_scan: Arc = build_metadata_filter( + ctx.session(), + meta_scan, + snapshot_state, + condition_expr.clone(), + )?; let find_files_exec: Arc = Arc::new(DeltaDiscoveryExec::with_input( meta_scan, ctx.table_url().clone(), - Some(condition.clone()), - Some(table_schema.clone()), + None, + None, version, ctx.partition_columns().to_vec(), - expr_props.partition_only, + partition_only, )?); // Spread Add actions across partitions so `DeltaScanByAddsExec` can scan files in parallel. + // TODO(adaptive-partitioning): Keep this aligned with `scan_planner.rs`. + // Plan: switch from fixed `target_partitions` + round-robin to size-driven partition count + // first, then size-aware distribution to avoid oversharding and worker skew. 
let target_partitions = ctx.session().config().target_partitions().max(1); let find_files_exec: Arc = Arc::new(RepartitionExec::try_new( find_files_exec, @@ -353,7 +295,13 @@ async fn build_old_data_plan( let scan_exec = Arc::new(DeltaScanByAddsExec::new( Arc::clone(&find_files_exec), ctx.table_url().clone(), + version, + table_schema.clone(), table_schema, + crate::datasource::DeltaScanConfig::default(), + None, + None, + None, )); let negated_condition = Arc::new(NotExpr::new(condition)); diff --git a/crates/sail-delta-lake/src/physical_plan/planner/utils.rs b/crates/sail-delta-lake/src/physical_plan/planner/utils.rs index 605400d1f0..e58aec83db 100644 --- a/crates/sail-delta-lake/src/physical_plan/planner/utils.rs +++ b/crates/sail-delta-lake/src/physical_plan/planner/utils.rs @@ -13,7 +13,7 @@ use std::sync::Arc; use datafusion::arrow::compute::SortOptions; -use datafusion::arrow::datatypes::{DataType, SchemaRef}; +use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef}; use datafusion::common::{ Column as LogicalColumn, DataFusionError, Result, ScalarValue, ToDFSchema, }; @@ -35,15 +35,21 @@ use sail_common_datafusion::datasource::PhysicalSinkMode; use url::Url; use super::context::PlannerContext; -use super::log_scan::{build_delta_log_datasource_union_with_options, LogScanOptions}; +use super::log_scan::{build_delta_log_datasource_scans_with_options, LogScanOptions}; +use super::log_segment::{resolve_log_segment_files, LogSegmentResolveOptions}; use crate::datasource::{ simplify_expr, COMMIT_TIMESTAMP_COLUMN, COMMIT_VERSION_COLUMN, PATH_COLUMN, }; +use crate::options::DeltaLogReplayStrategyOption; use crate::physical_plan::{ create_projection, create_repartition, create_sort, DeltaCommitExec, DeltaLogReplayExec, DeltaPhysicalExprAdapterFactory, DeltaWriterExec, COL_LOG_IS_REMOVE, COL_LOG_VERSION, COL_REPLAY_PATH, }; +use crate::spec::fields::{ + FIELD_NAME_MODIFICATION_TIME, FIELD_NAME_PATH, FIELD_NAME_SIZE, FIELD_NAME_STATS, +}; +use 
crate::table::DeltaSnapshot; /// Options that control what the log replay pipeline materializes as payload columns. /// @@ -79,6 +85,26 @@ impl Default for LogReplayOptions { } } +fn replay_output_schema( + partition_columns: &[(String, String)], + include_stats_json: bool, +) -> SchemaRef { + let mut fields = vec![ + Field::new(PATH_COLUMN, DataType::Utf8, true), + Field::new("size_bytes", DataType::Int64, true), + Field::new("modification_time", DataType::Int64, true), + Field::new(COMMIT_VERSION_COLUMN, DataType::Int64, true), + Field::new(COMMIT_TIMESTAMP_COLUMN, DataType::Int64, true), + ]; + for (logical, _) in partition_columns { + fields.push(Field::new(logical, DataType::Utf8, true)); + } + if include_stats_json { + fields.push(Field::new("stats_json", DataType::Utf8, true)); + } + Arc::new(Schema::new(fields)) +} + pub fn build_standard_write_layers( ctx: &PlannerContext<'_>, input: Arc, @@ -96,6 +122,7 @@ pub fn build_standard_write_layers( plan, ctx.table_url().clone(), ctx.options().clone(), + ctx.metadata_configuration().clone(), ctx.partition_columns().to_vec(), sink_mode.clone(), ctx.table_exists(), @@ -165,49 +192,63 @@ pub fn align_schemas_for_union( /// Build the standard log replay pipeline: /// `Union(DataSourceExec)` -> `Projection(payload + replay_keys)` -> `Repartition(Hash replay_path)` -/// -> `Sort(replay_path, log_version desc, preserve_partitioning)` -> `DeltaLogReplayExec`. +/// -> `[optional Sort(replay_path, log_version desc, preserve_partitioning)]` +/// -> `DeltaLogReplayExec`. pub async fn build_log_replay_pipeline( ctx: &PlannerContext<'_>, - table_url: Url, - version: i64, - partition_columns: Vec, - checkpoint_files: Vec, - commit_files: Vec, + snapshot: &DeltaSnapshot, +) -> Result> { + build_log_replay_pipeline_with_options(ctx, snapshot, LogReplayOptions::default()).await +} + +/// Same as [`build_log_replay_pipeline`], but allows controlling projected payload columns. 
+pub async fn build_log_replay_pipeline_with_options( + ctx: &PlannerContext<'_>, + snapshot: &DeltaSnapshot, + options: LogReplayOptions, ) -> Result> { - build_log_replay_pipeline_with_options( + let version = snapshot.version(); + let log_segment_files = resolve_log_segment_files( ctx, - table_url, version, - partition_columns, - checkpoint_files, - commit_files, - LogReplayOptions::default(), + LogSegmentResolveOptions { + commit_version_range: options.commit_version_range, + }, + ) + .await?; + build_log_replay_pipeline_with_files( + ctx, + ctx.table_url().clone(), + version, + snapshot.physical_partition_columns(), + log_segment_files.checkpoint_files, + log_segment_files.commit_files, + options, ) .await } -/// Same as [`build_log_replay_pipeline`], but allows controlling projected payload columns. -pub async fn build_log_replay_pipeline_with_options( +async fn build_log_replay_pipeline_with_files( ctx: &PlannerContext<'_>, table_url: Url, version: i64, - partition_columns: Vec, + partition_columns: Vec<(String, String)>, checkpoint_files: Vec, commit_files: Vec, options: LogReplayOptions, ) -> Result> { let log_scan_options = LogScanOptions { projection: Some(vec!["add".to_string(), "remove".to_string()]), - commit_version_range: options.commit_version_range, parquet_predicate: options.parquet_predicate, }; - let (raw_scan, checkpoint_files, commit_files) = build_delta_log_datasource_union_with_options( - ctx, - checkpoint_files, - commit_files, - log_scan_options, - ) - .await?; + let (checkpoint_scan_opt, commit_scan_opt, checkpoint_files, commit_files) = + build_delta_log_datasource_scans_with_options( + ctx, + checkpoint_files, + commit_files, + log_scan_options, + ) + .await?; // Projection#1: build a compact log scan schema for streaming replay. 
// @@ -215,14 +256,48 @@ pub async fn build_log_replay_pipeline_with_options( // - is_remove = remove_struct IS NOT NULL // - __sail_delta_log_version is passed through from the scan as a partition column // - payload columns are extracted up-front so the sort/replay does not carry wide structs - let input_schema = raw_scan.schema(); + let input_schema = checkpoint_scan_opt + .as_ref() + .map(|p| p.schema()) + .or_else(|| commit_scan_opt.as_ref().map(|p| p.schema())) + .ok_or_else(|| { + DataFusionError::Plan( + "no _delta_log scans available to build replay pipeline".to_string(), + ) + })?; let log_version_idx = input_schema.index_of(COL_LOG_VERSION)?; let df_schema = input_schema.clone().to_dfschema()?; let simplify = |expr: Expr| simplify_expr(ctx.session(), &df_schema, expr); + if input_schema.field_with_name("add").is_err() { + // Some tables/log ranges contain only metadata/protocol/remove actions. + // Without any `add` payload there are no data files to replay. + let replay: Arc = + Arc::new(datafusion::physical_plan::empty::EmptyExec::new( + replay_output_schema(&partition_columns, options.include_stats_json), + )); + + let replay: Arc = if let Some(filter) = options.log_filter { + let adapter_factory = Arc::new(DeltaPhysicalExprAdapterFactory {}); + let adapter = adapter_factory + .create(filter.table_schema, replay.schema()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let adapted = adapter + .rewrite(filter.predicate) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Arc::new(FilterExec::try_new(adapted, replay)?) 
+ } else { + replay + }; + + return Ok(replay); + } + let col_expr = |name: &str| Expr::Column(LogicalColumn::new_unqualified(name)); let lit_str = |s: &str| Expr::Literal(ScalarValue::Utf8(Some(s.to_string())), None); let lit_i64 = |v: i64| Expr::Literal(ScalarValue::Int64(Some(v)), None); + let lit_bool = |v: bool| Expr::Literal(ScalarValue::Boolean(Some(v)), None); + let lit_utf8_null = || Expr::Literal(ScalarValue::Utf8(None), None); let get_field_expr = |struct_expr: Expr, field_name: &str| { Expr::ScalarFunction(ScalarFunction::new_udf( datafusion::functions::core::get_field(), @@ -237,22 +312,33 @@ pub async fn build_log_replay_pipeline_with_options( )) }; + // `add` is required for replay payload extraction. let add_col_expr = col_expr("add"); - let remove_col_expr = col_expr("remove"); + let has_remove_column = input_schema.field_with_name("remove").is_ok(); + let add_is_not_null = add_col_expr.clone().is_not_null(); - let remove_is_not_null = remove_col_expr.clone().is_not_null(); + let remove_col_expr = has_remove_column.then(|| col_expr("remove")); + let remove_is_not_null = remove_col_expr + .as_ref() + .map(|e| e.clone().is_not_null()) + .unwrap_or_else(|| lit_bool(false)); // NOTE: `get_field(struct, 'child')` does not apply the parent struct's // null buffer to the returned child array. We must guard child extraction with the // struct's validity to avoid spurious values. 
let add_path = guard_with( add_is_not_null.clone(), - get_field_expr(add_col_expr.clone(), "path"), - ); - let remove_path = guard_with( - remove_is_not_null.clone(), - get_field_expr(remove_col_expr.clone(), "path"), + get_field_expr(add_col_expr.clone(), FIELD_NAME_PATH), ); + let remove_path = remove_col_expr + .as_ref() + .map(|e| { + guard_with( + remove_is_not_null.clone(), + get_field_expr(e.clone(), FIELD_NAME_PATH), + ) + }) + .unwrap_or_else(lit_utf8_null); let replay_path = simplify(Expr::ScalarFunction(ScalarFunction::new_udf( datafusion::functions::core::coalesce(), @@ -274,8 +360,8 @@ pub async fn build_log_replay_pipeline_with_options( } }; let has_add_field = |name: &str| add_struct_fields.iter().any(|f| f.name() == name); - let mod_time_field = if has_add_field("modificationTime") { - "modificationTime" + let mod_time_field = if has_add_field(FIELD_NAME_MODIFICATION_TIME) { + FIELD_NAME_MODIFICATION_TIME } else { "modification_time" }; @@ -284,8 +370,8 @@ pub async fn build_log_replay_pipeline_with_options( } else { "partition_values" }; - let stats_field = if has_add_field("stats") { - "stats" + let stats_field = if has_add_field(FIELD_NAME_STATS) { + FIELD_NAME_STATS } else { "stats_json" }; @@ -294,12 +380,12 @@ pub async fn build_log_replay_pipeline_with_options( let guard_add = |e: Expr| guard_with(add_is_not_null.clone(), e); let path_expr = simplify(Expr::Cast(Cast::new( - Box::new(guard_add(get_add_field("path"))), + Box::new(guard_add(get_add_field(FIELD_NAME_PATH))), DataType::Utf8, )))?; let size_expr_i64 = Expr::Cast(Cast::new( - Box::new(guard_add(get_add_field("size"))), + Box::new(guard_add(get_add_field(FIELD_NAME_SIZE))), DataType::Int64, )); let size_expr = simplify(Expr::ScalarFunction(ScalarFunction::new_udf( @@ -326,15 +412,27 @@ pub async fn build_log_replay_pipeline_with_options( }; let part_values = guard_add(get_add_field(part_values_field)); - let part_expr_for = |key: &str| -> Result> { - let extracted = 
Expr::ScalarFunction(ScalarFunction::new_udf( - map_extract_udf(), - vec![part_values.clone(), lit_str(key)], - )); - let elem = Expr::ScalarFunction(ScalarFunction::new_udf( - array_element_udf(), - vec![extracted, lit_i64(1)], - )); + let part_expr_for = |logical: &str, physical: &str| -> Result> { + let extract_elem = |key: &str| { + let extracted = Expr::ScalarFunction(ScalarFunction::new_udf( + map_extract_udf(), + vec![part_values.clone(), lit_str(key)], + )); + Expr::ScalarFunction(ScalarFunction::new_udf( + array_element_udf(), + vec![extracted, lit_i64(1)], + )) + }; + let physical_elem = extract_elem(physical); + let elem = if physical == logical { + physical_elem + } else { + let logical_elem = extract_elem(logical); + Expr::ScalarFunction(ScalarFunction::new_udf( + datafusion::functions::core::coalesce(), + vec![physical_elem, logical_elem], + )) + }; simplify(Expr::Cast(Cast::new(Box::new(elem), DataType::Utf8))) }; @@ -353,8 +451,8 @@ pub async fn build_log_replay_pipeline_with_options( Arc::clone(&mod_time_expr), COMMIT_TIMESTAMP_COLUMN.to_string(), )); - for col in &partition_columns { - final_proj.push((part_expr_for(col)?, col.clone())); + for (logical, physical) in &partition_columns { + final_proj.push((part_expr_for(logical, physical)?, logical.clone())); } if let Some(stats_expr) = stats_expr { final_proj.push((stats_expr, "stats_json".to_string())); @@ -368,58 +466,130 @@ pub async fn build_log_replay_pipeline_with_options( COL_LOG_VERSION.to_string(), )); - let log_scan: Arc = Arc::new(ProjectionExec::try_new(final_proj, raw_scan)?); - let log_partitions = ctx.session().config().target_partitions().max(1); - let replay_path_idx = log_scan.schema().index_of(COL_REPLAY_PATH)?; - let log_version_idx = log_scan.schema().index_of(COL_LOG_VERSION)?; - - // Hash partition by replay_path so all actions for the same path are co-located. 
- let replay_expr: Arc = - Arc::new(PhysicalColumn::new(COL_REPLAY_PATH, replay_path_idx)); - let log_scan: Arc = Arc::new(RepartitionExec::try_new( - log_scan, - Partitioning::Hash(vec![replay_expr], log_partitions), - )?); - // Ensure per-partition ordering on (replay_path, log_version desc) so DeltaLogReplayExec can - // stream without materializing the full active set in memory. SortExec can spill. - // TODO: Add COL_LOG_IS_REMOVE ASC as a tie-breaker so Add sorts ahead of Remove for the - // same path/version (DV updates emit Remove+Add in one commit). - let ordering = LexOrdering::new(vec![ - PhysicalSortExpr { - expr: Arc::new(Column::new(COL_REPLAY_PATH, replay_path_idx)), - options: SortOptions { - descending: false, - nulls_first: false, + let replay_partition_cols = partition_columns + .iter() + .map(|(logical, _)| logical.clone()) + .collect::>(); + + let empty_scan = |schema: SchemaRef| -> Arc { + Arc::new(datafusion::physical_plan::empty::EmptyExec::new(schema)) + }; + + let build_branch = |scan: Arc, + sort: bool| + -> Result> { + // Preserve existing behavior: fan out to target partitions early for stable EXPLAIN and + // better parallelism. (This is a shuffle, but not a pipeline breaker like SortExec.) + let plan: Arc = Arc::new(RepartitionExec::try_new( + scan, + Partitioning::RoundRobinBatch(log_partitions), + )?); + + let plan: Arc = + Arc::new(ProjectionExec::try_new(final_proj.clone(), plan)?); + + // Hash partition by replay_path so all actions for the same path are co-located. + let replay_path_idx = plan.schema().index_of(COL_REPLAY_PATH)?; + let replay_expr: Arc = + Arc::new(PhysicalColumn::new(COL_REPLAY_PATH, replay_path_idx)); + let plan: Arc = Arc::new(RepartitionExec::try_new( + plan, + Partitioning::Hash(vec![replay_expr], log_partitions), + )?); + + if !sort { + return Ok(plan); + } + + // Ensure per-partition ordering on (replay_path, log_version desc, is_remove asc) + // for sort-based replay mode. 
+ let replay_path_idx = plan.schema().index_of(COL_REPLAY_PATH)?; + let log_version_idx = plan.schema().index_of(COL_LOG_VERSION)?; + let is_remove_idx = plan.schema().index_of(COL_LOG_IS_REMOVE)?; + let ordering = LexOrdering::new(vec![ + PhysicalSortExpr { + expr: Arc::new(Column::new(COL_REPLAY_PATH, replay_path_idx)), + options: SortOptions { + descending: false, + nulls_first: false, + }, }, - }, - PhysicalSortExpr { - expr: Arc::new(Column::new(COL_LOG_VERSION, log_version_idx)), - options: SortOptions { - descending: true, - nulls_first: false, + PhysicalSortExpr { + expr: Arc::new(Column::new(COL_LOG_VERSION, log_version_idx)), + options: SortOptions { + descending: true, + nulls_first: false, + }, }, - }, - ]) - .ok_or_else(|| { - DataFusionError::Internal("failed to create replay_path ordering requirement".to_string()) - })?; - let log_scan: Arc = - Arc::new(SortExec::new(ordering, log_scan).with_preserve_partitioning(true)); - - let replay: Arc = Arc::new(DeltaLogReplayExec::new( - log_scan, - table_url, - version, - partition_columns.clone(), - checkpoint_files, - commit_files, - )); + // Add beats Remove within the same path/version (DV update pattern). 
+ PhysicalSortExpr { + expr: Arc::new(Column::new(COL_LOG_IS_REMOVE, is_remove_idx)), + options: SortOptions { + descending: false, + nulls_first: false, + }, + }, + ]) + .ok_or_else(|| { + DataFusionError::Internal("failed to create replay ordering requirement".to_string()) + })?; + Ok(Arc::new( + SortExec::new(ordering, plan).with_preserve_partitioning(true), + )) + }; + + let replay_strategy = ctx.options().delta_log_replay_strategy; + let replay_hash_threshold = ctx.options().delta_log_replay_hash_threshold.max(1); + let has_checkpoint = !checkpoint_files.is_empty(); + let use_hash = match replay_strategy { + DeltaLogReplayStrategyOption::Sort => false, + DeltaLogReplayStrategyOption::Hash => has_checkpoint, + DeltaLogReplayStrategyOption::Auto => { + has_checkpoint && commit_files.len() <= replay_hash_threshold + } + }; + + let replay: Arc = if has_checkpoint { + // Hash replay: stream checkpoint, build small commit-side map, then emit commit-only adds. + let checkpoint_scan = + checkpoint_scan_opt.unwrap_or_else(|| empty_scan(Arc::clone(&input_schema))); + let commit_scan = commit_scan_opt.unwrap_or_else(|| empty_scan(Arc::clone(&input_schema))); + + let checkpoint_branch = build_branch(checkpoint_scan, false)?; + let commit_branch = build_branch(commit_scan, !use_hash)?; + + Arc::new(DeltaLogReplayExec::new_hash( + checkpoint_branch, + commit_branch, + table_url, + version, + replay_partition_cols, + checkpoint_files, + commit_files, + )) + } else { + // Sort replay (spill-friendly): for commit-only scenarios, avoid building a potentially + // large in-memory map. 
+ let commit_scan = commit_scan_opt.unwrap_or_else(|| empty_scan(Arc::clone(&input_schema))); + let commit_branch = build_branch(commit_scan, true)?; + + Arc::new(DeltaLogReplayExec::new( + commit_branch, + table_url, + version, + replay_partition_cols, + checkpoint_files, + commit_files, + )) + }; let replay: Arc = if let Some(filter) = options.log_filter { let adapter_factory = Arc::new(DeltaPhysicalExprAdapterFactory {}); - let adapter = adapter_factory.create(filter.table_schema, replay.schema()); + let adapter = adapter_factory + .create(filter.table_schema, replay.schema()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; let adapted = adapter .rewrite(filter.predicate) .map_err(|e| DataFusionError::External(Box::new(e)))?; diff --git a/crates/sail-delta-lake/src/physical_plan/remove_actions_exec.rs b/crates/sail-delta-lake/src/physical_plan/remove_actions_exec.rs index ab4009b9b1..77320d7ce8 100644 --- a/crates/sail-delta-lake/src/physical_plan/remove_actions_exec.rs +++ b/crates/sail-delta-lake/src/physical_plan/remove_actions_exec.rs @@ -11,7 +11,6 @@ // limitations under the License. 
use std::any::Any; -use std::collections::HashMap; use std::fmt; use std::sync::Arc; use std::time::Instant; @@ -28,32 +27,32 @@ use datafusion::physical_plan::{ use datafusion_common::{internal_err, Result}; use datafusion_physical_expr::{Distribution, EquivalenceProperties}; use futures::stream::{self, StreamExt}; -use serde_json::Value; -use crate::kernel::models::{Add, Remove, RemoveOptions}; +use crate::kernel::transaction::OperationMetrics; use crate::physical_plan::{ current_timestamp_millis, decode_adds_from_batch, delta_action_schema, encode_actions, - meta_adds, CommitMeta, ExecAction, COL_ACTION, + meta_adds, ExecCommitMeta, COL_ACTION, }; +use crate::spec::{Action, Add, Remove, RemoveOptions}; /// Physical execution node to convert Add actions (from FindFiles) into Remove actions #[derive(Debug)] pub struct DeltaRemoveActionsExec { input: Arc, metrics: ExecutionPlanMetricsSet, - cache: PlanProperties, + cache: Arc, } impl DeltaRemoveActionsExec { pub fn new(input: Arc) -> Result { // Output schema must match DeltaWriterExec output schema (row-per-action). 
let schema = delta_action_schema()?; - let cache = PlanProperties::new( + let cache = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(1), EmissionType::Final, Boundedness::Bounded, - ); + )); Ok(Self { input, metrics: ExecutionPlanMetricsSet::new(), @@ -102,7 +101,7 @@ impl ExecutionPlan for DeltaRemoveActionsExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } @@ -177,36 +176,23 @@ impl ExecutionPlan for DeltaRemoveActionsExec { output_rows.add(usize::try_from(num_removed_files).unwrap_or(usize::MAX)); output_bytes.add(usize::try_from(num_removed_bytes).unwrap_or(usize::MAX)); - let mut operation_metrics: HashMap = HashMap::new(); - operation_metrics.insert( - "numRemovedFiles".to_string(), - Value::from(num_removed_files), - ); - operation_metrics.insert( - "numRemovedBytes".to_string(), - Value::from(num_removed_bytes), - ); - operation_metrics.insert( - "executionTimeMs".to_string(), - Value::from(exec_start.elapsed().as_millis() as u64), - ); - - let mut exec_actions: Vec = Vec::new(); - - for remove in remove_actions { - exec_actions.push(remove.into()); - } + let operation_metrics = OperationMetrics { + execution_time_ms: Some(exec_start.elapsed().as_millis() as u64), + num_removed_files: Some(num_removed_files), + num_removed_bytes: Some(num_removed_bytes), + ..Default::default() + }; + + let output_actions = remove_actions.into_iter().map(Action::Remove).collect(); - exec_actions.push( - CommitMeta { + encode_actions( + output_actions, + Some(ExecCommitMeta { row_count: 0, operation: None, operation_metrics, - } - .try_into()?, - ); - - encode_actions(exec_actions) + }), + ) }; let stream = stream::once(future); diff --git a/crates/sail-delta-lake/src/physical_plan/scan_by_adds_exec.rs b/crates/sail-delta-lake/src/physical_plan/scan_by_adds_exec.rs index 56f5b4ad28..00f1b28aa4 100644 --- a/crates/sail-delta-lake/src/physical_plan/scan_by_adds_exec.rs +++ 
b/crates/sail-delta-lake/src/physical_plan/scan_by_adds_exec.rs @@ -17,6 +17,7 @@ use std::sync::Arc; use async_trait::async_trait; use datafusion::arrow::datatypes::SchemaRef; use datafusion::arrow::record_batch::RecordBatch; +use datafusion::common::stats::ColumnStatistics; use datafusion::execution::context::TaskContext; use datafusion::execution::SessionStateBuilder; use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; @@ -25,64 +26,83 @@ use datafusion::physical_plan::{ DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, Partitioning, PlanProperties, SendableRecordBatchStream, }; -use datafusion_common::{internal_err, DataFusionError, Result}; -use datafusion_physical_expr::{Distribution, EquivalenceProperties}; +use datafusion_common::{internal_err, DataFusionError, Result, Statistics}; +use datafusion_physical_expr::{Distribution, EquivalenceProperties, PhysicalExpr}; use futures::stream::{self, StreamExt, TryStreamExt}; +use sail_common_datafusion::array::record_batch::cast_record_batch_relaxed_tz; +use sail_common_datafusion::extension::SessionExtensionAccessor; +use sail_common_datafusion::rename::physical_plan::rename_projected_physical_plan; use url::Url; -use crate::datasource::scan::FileScanParams; -use crate::datasource::{build_file_scan_config, DeltaScanConfigBuilder}; +use crate::datasource::scan::{FileScanParams, TableStatsMode}; +use crate::datasource::{build_file_scan_config, df_logical_schema, DeltaScanConfig}; use crate::physical_plan::{decode_adds_from_batch, meta_adds, COL_ACTION}; -use crate::storage::StorageConfig; -use crate::table::open_table_with_object_store; +use crate::schema::{arrow_field_physical_name, get_physical_schema}; +use crate::session_extension::{load_table_uncached, DeltaTableCache}; +use crate::spec::StructType; +// TODO(dynamic-file-scheduling): Replace fixed file-count chunking with byte-aware chunking +// and optional work-stealing so executors pull remaining file work 
dynamically under skew. const ADD_SCAN_CHUNK_FILES: usize = 1024; struct ScanByAddsStreamState { input: SendableRecordBatchStream, context: Arc, table_url: Url, - table_schema: SchemaRef, + table_version: i64, output_schema: SchemaRef, + scan_config: DeltaScanConfig, + projection: Option>, + limit: Option, + pushdown_filter: Option>, // Lazy init table_opened: bool, - snapshot: Option, + snapshot: Option>, log_store: Option, - scan_config: Option, session_state: Option, file_schema: Option, partition_columns: Option>, + logical_names: Option>, // control partition_scan: Option, emitted_partition_empty: bool, - pending_adds: Vec, + pending_adds: Vec, current_scan: Option, input_done: bool, } impl ScanByAddsStreamState { + #[expect(clippy::too_many_arguments)] fn new( input: SendableRecordBatchStream, context: Arc, table_url: Url, - table_schema: SchemaRef, + table_version: i64, output_schema: SchemaRef, + scan_config: DeltaScanConfig, + projection: Option>, + limit: Option, + pushdown_filter: Option>, ) -> Self { Self { input, context, table_url, - table_schema, + table_version, output_schema, + scan_config, + projection, + limit, + pushdown_filter, table_opened: false, snapshot: None, log_store: None, - scan_config: None, session_state: None, file_schema: None, partition_columns: None, + logical_names: None, partition_scan: None, emitted_partition_empty: false, pending_adds: Vec::new(), @@ -95,45 +115,85 @@ impl ScanByAddsStreamState { if self.table_opened { return Ok(()); } - let object_store = self - .context - .runtime_env() - .object_store_registry - .get_store(&self.table_url) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - let table = - open_table_with_object_store(self.table_url.clone(), object_store, StorageConfig) - .await - .map_err(|e| DataFusionError::External(Box::new(e)))?; - let snapshot_state = table - .snapshot() - .map_err(|e| DataFusionError::External(Box::new(e)))? - .clone(); + // Prefer a session-scoped cache. 
This avoids leaking state across sessions / RuntimeEnvs. + // If the cache extension is not installed, fall back to no caching. + let cached = match self.context.as_ref().extension::() { + Ok(cache) => { + cache + .get(self.context.as_ref(), &self.table_url, self.table_version) + .await? + } + Err(_) => { + load_table_uncached( + self.context.runtime_env(), + &self.table_url, + self.table_version, + ) + .await? + } + }; + + let snapshot_state = cached.snapshot.clone(); let partition_columns = snapshot_state.metadata().partition_columns().clone(); - let scan_config = DeltaScanConfigBuilder::new() - .with_schema(self.table_schema.clone()) - .build(&snapshot_state) - .map_err(|e| DataFusionError::External(Box::new(e)))?; let session_state = SessionStateBuilder::new() .with_runtime_env(self.context.runtime_env().clone()) .build(); + let mut scan_config = self.scan_config.clone(); + if scan_config.schema.is_none() { + let schema = snapshot_state + .input_schema() + .map_err(|e| DataFusionError::External(Box::new(e)))?; + scan_config.schema = Some(schema); + } + + let logical_schema = df_logical_schema( + snapshot_state.as_ref(), + &scan_config.file_column_name, + &scan_config.commit_version_column_name, + &scan_config.commit_timestamp_column_name, + scan_config.schema.clone(), + ) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + let logical_names = logical_schema + .fields() + .iter() + .map(|f| f.name().clone()) + .collect::>(); + let table_partition_cols = snapshot_state.metadata().partition_columns(); + let kmode = snapshot_state.effective_column_mapping_mode(); + let kschema_arc = snapshot_state.schema(); + let logical_kernel = StructType::try_from(kschema_arc) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let physical_arrow = get_physical_schema(&logical_kernel, kmode); + let physical_partition_cols: std::collections::HashSet = table_partition_cols + .iter() + .map(|col| { + kschema_arc + .field_with_name(col) + .map(|f| 
arrow_field_physical_name(f, kmode).to_string()) + .unwrap_or_else(|_| col.clone()) + }) + .collect(); + let file_schema = Arc::new(datafusion::arrow::datatypes::Schema::new( - self.table_schema + physical_arrow .fields() .iter() - .filter(|f| !table_partition_cols.contains(f.name())) + .filter(|f| !physical_partition_cols.contains(f.name())) .cloned() .collect::>(), )); - self.log_store = Some(table.log_store()); + self.log_store = Some(cached.log_store.clone()); self.snapshot = Some(snapshot_state); - self.scan_config = Some(scan_config); self.session_state = Some(session_state); self.file_schema = Some(file_schema); self.partition_columns = Some(partition_columns); + self.logical_names = Some(logical_names); + self.scan_config = scan_config; self.table_opened = true; Ok(()) } @@ -162,16 +222,12 @@ impl ScanByAddsStreamState { let snapshot = self .snapshot - .as_ref() + .as_deref() .ok_or_else(|| DataFusionError::Internal("missing snapshot".into()))?; let log_store = self .log_store .as_ref() .ok_or_else(|| DataFusionError::Internal("missing log_store".into()))?; - let scan_config = self - .scan_config - .as_ref() - .ok_or_else(|| DataFusionError::Internal("missing scan_config".into()))?; let session_state = self .session_state .as_ref() @@ -181,19 +237,27 @@ impl ScanByAddsStreamState { .as_ref() .ok_or_else(|| DataFusionError::Internal("missing file_schema".into()))? .clone(); + let logical_names = self + .logical_names + .as_ref() + .ok_or_else(|| DataFusionError::Internal("missing logical_names".into()))? + .clone(); + // TODO(size-aware-bin-packing): Build file groups from `Add.size` with bin-packing + // instead of static grouping to reduce per-partition size skew. 
let adds = std::mem::take(&mut self.pending_adds); let file_scan_config = build_file_scan_config( snapshot, log_store, &adds, - scan_config, + &self.scan_config, FileScanParams { pruning_mask: None, - projection: None, - limit: None, - pushdown_filter: None, + projection: self.projection.as_ref(), + limit: self.limit, + pushdown_filter: self.pushdown_filter.clone(), sort_order: None, + table_stats_mode: TableStatsMode::AddsOnly, }, session_state, file_schema, @@ -203,13 +267,25 @@ impl ScanByAddsStreamState { let partitions = file_scan_config.file_groups.len().max(1); let scan_exec = datafusion::datasource::source::DataSourceExec::from_data_source(file_scan_config); + let scan_exec = + rename_projected_physical_plan(scan_exec, &logical_names, self.projection.as_ref()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; let mut scans = Vec::with_capacity(partitions); for partition in 0..partitions { scans.push(scan_exec.execute(partition, Arc::clone(&self.context))?); } + let output_schema = Arc::clone(&self.output_schema); let combined = stream::iter(scans) .map(Ok::<_, DataFusionError>) - .try_flatten(); + .try_flatten() + .and_then(move |batch| { + let output_schema = Arc::clone(&output_schema); + async move { + let casted = cast_record_batch_relaxed_tz(&batch, &output_schema) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + Ok(casted) + } + }); self.current_scan = Some(Box::pin(RecordBatchStreamAdapter::new( Arc::clone(&self.output_schema), combined, @@ -220,13 +296,12 @@ impl ScanByAddsStreamState { async fn decode_adds_from_meta_batch( &mut self, batch: &RecordBatch, - ) -> Result> { + ) -> Result> { self.ensure_table().await?; let partition_columns = self .partition_columns - .as_ref() - .ok_or_else(|| DataFusionError::Internal("missing partition_columns".into()))? 
- .clone(); + .clone() + .unwrap_or_else(|| meta_adds::infer_partition_columns_from_schema(&batch.schema())); meta_adds::decode_adds_from_meta_batch(batch, Some(&partition_columns)) } } @@ -240,24 +315,72 @@ impl ScanByAddsStreamState { pub struct DeltaScanByAddsExec { input: Arc, table_url: Url, + version: i64, table_schema: SchemaRef, - cache: PlanProperties, + output_schema: SchemaRef, + scan_config: DeltaScanConfig, + projection: Option>, + limit: Option, + pushdown_filter: Option>, + statistics: Statistics, + cache: Arc, } impl DeltaScanByAddsExec { - pub fn new(input: Arc, table_url: Url, table_schema: SchemaRef) -> Self { + #[expect(clippy::too_many_arguments)] + pub fn new( + input: Arc, + table_url: Url, + version: i64, + table_schema: SchemaRef, + output_schema: SchemaRef, + scan_config: DeltaScanConfig, + projection: Option>, + limit: Option, + pushdown_filter: Option>, + ) -> Self { + let statistics = Statistics::new_unknown(output_schema.as_ref()); let cache = Self::compute_properties( - table_schema.clone(), + output_schema.clone(), input.output_partitioning().partition_count(), ); Self { input, table_url, + version, table_schema, + output_schema, + scan_config, + projection, + limit, + pushdown_filter, + statistics, cache, } } + pub fn with_table_statistics(mut self, table_statistics: Option) -> Self { + self.statistics = table_statistics + .as_ref() + .map(|s| map_statistics_to_schema(s, &self.table_schema, &self.output_schema)) + .unwrap_or_else(|| Statistics::new_unknown(self.output_schema.as_ref())); + self + } + + pub fn with_output_statistics(mut self, output_statistics: Option) -> Self { + self.statistics = output_statistics + .as_ref() + .map(|statistics| { + if statistics.column_statistics.len() == self.output_schema.fields().len() { + sanitize_statistics_to_schema(statistics.clone(), &self.output_schema) + } else { + map_statistics_to_schema(statistics, &self.table_schema, &self.output_schema) + } + }) + .unwrap_or_else(|| 
Statistics::new_unknown(self.output_schema.as_ref())); + self + } + pub fn input(&self) -> &Arc { &self.input } @@ -266,17 +389,45 @@ impl DeltaScanByAddsExec { &self.table_url } + pub fn version(&self) -> i64 { + self.version + } + pub fn table_schema(&self) -> &SchemaRef { &self.table_schema } - fn compute_properties(schema: SchemaRef, partition_count: usize) -> PlanProperties { - PlanProperties::new( + pub fn output_schema(&self) -> &SchemaRef { + &self.output_schema + } + + pub fn scan_config(&self) -> &DeltaScanConfig { + &self.scan_config + } + + pub fn projection(&self) -> Option<&[usize]> { + self.projection.as_deref() + } + + pub fn limit(&self) -> Option { + self.limit + } + + pub fn pushdown_filter(&self) -> Option<&Arc> { + self.pushdown_filter.as_ref() + } + + pub fn statistics(&self) -> &Statistics { + &self.statistics + } + + fn compute_properties(schema: SchemaRef, partition_count: usize) -> Arc { + Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(partition_count.max(1)), EmissionType::Final, Boundedness::Bounded, - ) + )) } } @@ -290,7 +441,7 @@ impl ExecutionPlan for DeltaScanByAddsExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } @@ -309,11 +460,13 @@ impl ExecutionPlan for DeltaScanByAddsExec { if children.len() != 1 { return internal_err!("DeltaScanByAddsExec requires exactly one child"); } - Ok(Arc::new(Self::new( - children[0].clone(), - self.table_url.clone(), - self.table_schema.clone(), - ))) + let mut cloned = (*self).clone(); + cloned.input = children[0].clone(); + cloned.cache = Self::compute_properties( + cloned.output_schema.clone(), + cloned.input.output_partitioning().partition_count(), + ); + Ok(Arc::new(cloned)) } fn execute( @@ -323,14 +476,22 @@ impl ExecutionPlan for DeltaScanByAddsExec { ) -> Result { let input_stream = self.input.execute(partition, Arc::clone(&context))?; let table_url = self.table_url.clone(); - let 
table_schema = self.table_schema.clone(); + let table_version = self.version; let output_schema = self.schema(); + let scan_config = self.scan_config.clone(); + let projection = self.projection.clone(); + let limit = self.limit; + let pushdown_filter = self.pushdown_filter.clone(); let state = ScanByAddsStreamState::new( input_stream, context, table_url, - table_schema, + table_version, Arc::clone(&output_schema), + scan_config, + projection, + limit, + pushdown_filter, ); let s = stream::try_unfold(state, |mut st| async move { @@ -370,6 +531,9 @@ impl ExecutionPlan for DeltaScanByAddsExec { continue; } st.update_partition_scan_from_batch(&batch)?; + if st.partition_scan == Some(true) { + continue; + } if batch.column_by_name(COL_ACTION).is_some() { st.pending_adds.extend(decode_adds_from_batch(&batch)?); @@ -403,17 +567,309 @@ impl ExecutionPlan for DeltaScanByAddsExec { Ok(Box::pin(RecordBatchStreamAdapter::new(output_schema, s))) } + + fn partition_statistics(&self, partition: Option) -> Result { + if partition.is_none() { + Ok(self.statistics.clone()) + } else { + Ok(Statistics::new_unknown(self.schema().as_ref())) + } + } +} + +fn map_statistics_to_schema( + statistics: &Statistics, + source_schema: &SchemaRef, + target_schema: &SchemaRef, +) -> Statistics { + let column_statistics = target_schema + .fields() + .iter() + .map(|field| { + let mut column_statistics = source_schema + .index_of(field.name()) + .ok() + .and_then(|idx| statistics.column_statistics.get(idx).cloned()) + .unwrap_or_else(ColumnStatistics::new_unknown); + sanitize_column_statistics_for_field( + &mut column_statistics, + field.name(), + field.data_type(), + ); + column_statistics + }) + .collect(); + + Statistics { + num_rows: statistics.num_rows, + total_byte_size: statistics.total_byte_size, + column_statistics, + } +} + +fn sanitize_statistics_to_schema(mut statistics: Statistics, schema: &SchemaRef) -> Statistics { + if statistics.column_statistics.len() != schema.fields().len() { + 
return Statistics::new_unknown(schema.as_ref()); + } + + for (field, column_stats) in schema + .fields() + .iter() + .zip(&mut statistics.column_statistics) + { + sanitize_column_statistics_for_field(column_stats, field.name(), field.data_type()); + } + + statistics +} + +fn sanitize_column_statistics_for_field( + column_stats: &mut ColumnStatistics, + _column_name: &str, + data_type: &datafusion::arrow::datatypes::DataType, +) { + column_stats.min_value = sanitize_bound_for_type(&column_stats.min_value, data_type); + column_stats.max_value = sanitize_bound_for_type(&column_stats.max_value, data_type); +} + +fn sanitize_bound_for_type( + bound: &datafusion::common::stats::Precision, + data_type: &datafusion::arrow::datatypes::DataType, +) -> datafusion::common::stats::Precision { + let sanitize_value = |value: &datafusion::common::ScalarValue| { + if value.is_null() { + return None; + } + if value.data_type() == *data_type { + return Some(value.clone()); + } + value + .cast_to(data_type) + .ok() + .filter(|casted| !casted.is_null()) + }; + + match bound { + datafusion::common::stats::Precision::Exact(value) => sanitize_value(value) + .map(datafusion::common::stats::Precision::Exact) + .unwrap_or(datafusion::common::stats::Precision::Absent), + datafusion::common::stats::Precision::Inexact(value) => sanitize_value(value) + .map(datafusion::common::stats::Precision::Inexact) + .unwrap_or(datafusion::common::stats::Precision::Absent), + datafusion::common::stats::Precision::Absent => { + datafusion::common::stats::Precision::Absent + } + } } impl DisplayAs for DeltaScanByAddsExec { fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { match t { DisplayFormatType::Default | DisplayFormatType::Verbose => { - write!(f, "DeltaScanByAddsExec(table_path={})", self.table_url) + write!( + f, + "DeltaScanByAddsExec(table_path={}, version={}, projection={:?}, limit={:?}, pushdown={})", + self.table_url, + self.version, + self.projection, + self.limit, 
+ self.pushdown_filter.is_some() + ) } DisplayFormatType::TreeRender => { - write!(f, "DeltaScanByAddsExec: table_path={}", self.table_url) + write!( + f, + "DeltaScanByAddsExec: table_path={}, version={}, projection={:?}, limit={:?}, pushdown={}", + self.table_url, + self.version, + self.projection, + self.limit, + self.pushdown_filter.is_some() + ) } } } } + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::physical_plan::empty::EmptyExec; + use datafusion::physical_plan::ExecutionPlan; + use datafusion_common::stats::{ColumnStatistics, Precision, Statistics}; + use datafusion_common::{DataFusionError, Result, ScalarValue}; + use url::Url; + + use super::{map_statistics_to_schema, DeltaScanByAddsExec}; + + #[test] + fn test_map_statistics_to_schema_by_name() { + let source_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Int64, true), + ])); + let target_schema = Arc::new(Schema::new(vec![ + Field::new("b", DataType::Int64, true), + Field::new("a", DataType::Int64, true), + Field::new("_virtual", DataType::Utf8, true), + ])); + + let source_stats = Statistics { + num_rows: Precision::Exact(42), + total_byte_size: Precision::Exact(4096), + column_statistics: vec![ + ColumnStatistics { + null_count: Precision::Exact(1), + max_value: Precision::Exact(ScalarValue::Int64(Some(9))), + min_value: Precision::Exact(ScalarValue::Null), + sum_value: Precision::Absent, + distinct_count: Precision::Exact(7), + byte_size: Precision::Absent, + }, + ColumnStatistics { + null_count: Precision::Exact(2), + max_value: Precision::Exact(ScalarValue::Int64(Some(99))), + min_value: Precision::Exact(ScalarValue::Int64(Some(10))), + sum_value: Precision::Absent, + distinct_count: Precision::Exact(11), + byte_size: Precision::Absent, + }, + ], + }; + + let mapped = map_statistics_to_schema(&source_stats, &source_schema, &target_schema); + 
assert_eq!(mapped.num_rows, Precision::Exact(42)); + assert_eq!(mapped.total_byte_size, Precision::Exact(4096)); + assert_eq!(mapped.column_statistics.len(), 3); + + // `b` lands first in target schema. + assert_eq!(mapped.column_statistics[0].null_count, Precision::Exact(2)); + assert_eq!( + mapped.column_statistics[0].min_value, + Precision::Exact(ScalarValue::Int64(Some(10))) + ); + + // `a` lands second in target schema. + assert_eq!(mapped.column_statistics[1].null_count, Precision::Exact(1)); + assert_eq!(mapped.column_statistics[1].min_value, Precision::Absent); + assert_eq!( + mapped.column_statistics[1].max_value, + Precision::Exact(ScalarValue::Int64(Some(9))) + ); + + // Unknown column gets unknown stats. + assert_eq!(mapped.column_statistics[2], ColumnStatistics::new_unknown()); + } + + #[test] + fn test_scan_by_adds_exposes_known_statistics() { + let table_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Int64, true), + ])); + let output_schema = Arc::new(Schema::new(vec![Field::new("b", DataType::Int64, true)])); + let input_schema = Arc::new(Schema::new(vec![Field::new( + "action", + DataType::Utf8, + true, + )])); + + let input = Arc::new(EmptyExec::new(input_schema)); + let table_stats = Statistics { + num_rows: Precision::Exact(123), + total_byte_size: Precision::Exact(2048), + column_statistics: vec![ + ColumnStatistics::new_unknown(), + ColumnStatistics { + null_count: Precision::Exact(4), + max_value: Precision::Exact(ScalarValue::Int64(Some(88))), + min_value: Precision::Exact(ScalarValue::Int64(Some(1))), + sum_value: Precision::Absent, + distinct_count: Precision::Exact(12), + byte_size: Precision::Absent, + }, + ], + }; + + let table_url = Url::parse("file:///tmp/table").ok(); + assert!(table_url.is_some()); + let table_url = match table_url { + Some(url) => url, + None => return, + }; + + let scan = DeltaScanByAddsExec::new( + input, + table_url, + 1, + table_schema, + output_schema, 
+ crate::datasource::DeltaScanConfig::default(), + None, + None, + None, + ) + .with_table_statistics(Some(table_stats)); + + let stats = scan.partition_statistics(None).ok(); + assert!(stats.is_some()); + let stats = match stats { + Some(s) => s, + None => return, + }; + assert_eq!(stats.num_rows, Precision::Exact(123)); + assert_eq!(stats.column_statistics.len(), 1); + assert_eq!(stats.column_statistics[0].null_count, Precision::Exact(4)); + } + + #[test] + fn test_scan_by_adds_accepts_output_statistics_directly() -> Result<()> { + let table_schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int64, true), + Field::new("b", DataType::Int64, true), + ])); + let output_schema = Arc::new(Schema::new(vec![Field::new("b", DataType::Int64, true)])); + let input_schema = Arc::new(Schema::new(vec![Field::new( + "action", + DataType::Utf8, + true, + )])); + let input = Arc::new(EmptyExec::new(input_schema)); + let table_url = + Url::parse("file:///tmp/table").map_err(|e| DataFusionError::External(Box::new(e)))?; + + let output_stats = Statistics { + num_rows: Precision::Exact(88), + total_byte_size: Precision::Exact(1024), + column_statistics: vec![ColumnStatistics { + null_count: Precision::Exact(6), + max_value: Precision::Exact(ScalarValue::Int64(Some(10))), + min_value: Precision::Exact(ScalarValue::Int64(Some(2))), + sum_value: Precision::Absent, + distinct_count: Precision::Exact(5), + byte_size: Precision::Absent, + }], + }; + + let scan = DeltaScanByAddsExec::new( + input, + table_url, + 1, + table_schema, + output_schema, + crate::datasource::DeltaScanConfig::default(), + None, + None, + None, + ) + .with_output_statistics(Some(output_stats)); + + let stats = scan.partition_statistics(None)?; + assert_eq!(stats.num_rows, Precision::Exact(88)); + assert_eq!(stats.column_statistics.len(), 1); + assert_eq!(stats.column_statistics[0].null_count, Precision::Exact(6)); + Ok(()) + } +} diff --git a/crates/sail-delta-lake/src/physical_plan/writer_exec.rs 
b/crates/sail-delta-lake/src/physical_plan/writer_exec.rs index 96c5345c9e..b6dc7746a4 100644 --- a/crates/sail-delta-lake/src/physical_plan/writer_exec.rs +++ b/crates/sail-delta-lake/src/physical_plan/writer_exec.rs @@ -44,23 +44,22 @@ use datafusion::physical_plan::{ }; use datafusion_common::{internal_err, DataFusionError, Result}; use datafusion_physical_expr::{Distribution, EquivalenceProperties}; -use delta_kernel::engine::arrow_conversion::{TryIntoArrow, TryIntoKernel}; -use delta_kernel::schema::StructType; -use delta_kernel::table_features::ColumnMappingMode; use futures::stream::{once, StreamExt}; use sail_common_datafusion::datasource::PhysicalSinkMode; -use serde_json::Value; use url::Url; use crate::conversion::DeltaTypeConverter; -use crate::kernel::models::{contains_timestampntz, Action, Metadata, Protocol}; +use crate::kernel::transaction::OperationMetrics; use crate::kernel::{DeltaOperation, SaveMode}; use crate::operations::write::writer::{DeltaWriter, WriterConfig}; -use crate::options::{ColumnMappingModeOption, TableDeltaOptions}; -use crate::physical_plan::{delta_action_schema, encode_actions, CommitMeta, ExecAction}; +use crate::options::TableDeltaOptions; +use crate::physical_plan::{delta_action_schema, encode_actions, ExecCommitMeta}; use crate::schema::{ annotate_for_column_mapping, compute_max_column_id, evolve_schema, get_physical_schema, - normalize_delta_schema, + metadata_for_create_with_struct_type, normalize_delta_schema, protocol_for_create, +}; +use crate::spec::{ + contains_timestampntz_arrow, Action, ColumnMappingMode, StructType, TableProperties, }; use crate::storage::{get_object_store_from_context, StorageConfig}; use crate::table::open_table_with_object_store; @@ -80,6 +79,7 @@ pub struct DeltaWriterExec { input: Arc, table_url: Url, options: TableDeltaOptions, + metadata_configuration: HashMap, partition_columns: Vec, sink_mode: PhysicalSinkMode, table_exists: bool, @@ -87,7 +87,7 @@ pub struct DeltaWriterExec { /// 
Optional override for commit operation metadata. operation_override: Option, metrics: ExecutionPlanMetricsSet, - cache: PlanProperties, + cache: Arc, } impl DeltaWriterExec { @@ -105,10 +105,12 @@ impl DeltaWriterExec { } map } + #[expect(clippy::too_many_arguments)] pub fn new( input: Arc, table_url: Url, options: TableDeltaOptions, + metadata_configuration: HashMap, partition_columns: Vec, sink_mode: PhysicalSinkMode, table_exists: bool, @@ -122,6 +124,7 @@ impl DeltaWriterExec { input, table_url, options, + metadata_configuration, partition_columns, sink_mode, table_exists, @@ -132,13 +135,13 @@ impl DeltaWriterExec { }) } - fn compute_properties(schema: SchemaRef, output_partitions: usize) -> PlanProperties { - PlanProperties::new( + fn compute_properties(schema: SchemaRef, output_partitions: usize) -> Arc { + Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(output_partitions.max(1)), EmissionType::Final, Boundedness::Bounded, - ) + )) } pub fn table_url(&self) -> &Url { @@ -149,6 +152,10 @@ impl DeltaWriterExec { &self.options } + pub fn metadata_configuration(&self) -> &HashMap { + &self.metadata_configuration + } + pub fn partition_columns(&self) -> &[String] { &self.partition_columns } @@ -184,7 +191,7 @@ impl ExecutionPlan for DeltaWriterExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } @@ -263,6 +270,7 @@ impl ExecutionPlan for DeltaWriterExec { Arc::clone(&children[0]), self.table_url.clone(), self.options.clone(), + self.metadata_configuration.clone(), self.partition_columns.clone(), self.sink_mode.clone(), self.table_exists, @@ -304,6 +312,7 @@ impl DeltaWriterExec { let table_url = self.table_url.clone(); let options = self.options.clone(); + let metadata_configuration = self.metadata_configuration.clone(); let partition_columns = self.partition_columns.clone(); let sink_mode = self.sink_mode.clone(); let table_exists = self.table_exists; @@ 
-398,7 +407,7 @@ impl DeltaWriterExec { // Still update execution metrics so callers see a completed node. output_rows.add(0); output_bytes.add(0); - return encode_actions(vec![CommitMeta::default().try_into()?]); + return encode_actions(vec![], Some(ExecCommitMeta::default())); } } PhysicalSinkMode::OverwritePartitions => { @@ -428,203 +437,152 @@ impl DeltaWriterExec { // Determine effective column mapping mode let effective_mode = if let Some(table) = &table { - let mode = table + table .snapshot() .map_err(|e| DataFusionError::External(Box::new(e)))? - .effective_column_mapping_mode(); - match mode { - ColumnMappingMode::Name => ColumnMappingModeOption::Name, - ColumnMappingMode::Id => ColumnMappingModeOption::Id, - _ => ColumnMappingModeOption::None, - } + .effective_column_mapping_mode() } else { - options.column_mapping_mode + // For new tables, column mapping only comes from the metadata configuration + // that will be written into the initial Metadata action. + metadata_configuration + .get("delta.columnMapping.mode") + .and_then(|v| ColumnMappingMode::try_from(v.as_str()).ok()) + .unwrap_or_default() }; // Determine the kernel column mapping mode once for downstream conversions - let kernel_mode = match effective_mode { - ColumnMappingModeOption::Name => ColumnMappingMode::Name, - ColumnMappingModeOption::Id => ColumnMappingMode::Id, - ColumnMappingModeOption::None => ColumnMappingMode::None, - }; + let kernel_mode = effective_mode; - // If creating a new table and column mapping or timestampNtz features are required, - // prepare initial protocol+metadata + // If creating a new table, always materialize protocol+metadata so explicit + // table properties are persisted in the first Delta log commit. 
let mut annotated_schema_opt: Option = None; if !table_exists { // Build kernel schema for feature detection - let kernel_schema: StructType = final_schema - .as_ref() - .try_into_kernel() + let has_timestamp_ntz = contains_timestampntz_arrow(final_schema.as_ref()); + let kernel_schema = StructType::try_from(final_schema.as_ref()) .map_err(|e| DataFusionError::External(Box::new(e)))?; - let has_timestamp_ntz = contains_timestampntz(kernel_schema.fields()); - - if matches!( - effective_mode, - ColumnMappingModeOption::Name | ColumnMappingModeOption::Id - ) { + let mut configuration = metadata_configuration.clone(); + let metadata_schema = if !matches!(effective_mode, ColumnMappingMode::None) { let annotated_schema = annotate_for_column_mapping(&kernel_schema); - annotated_schema_opt = Some(annotated_schema.clone()); - - let mut reader_features = vec!["columnMapping"]; - let mut writer_features = vec!["columnMapping"]; - if has_timestamp_ntz { - reader_features.push("timestampNtz"); - writer_features.push("timestampNtz"); - } - - let protocol: Protocol = serde_json::from_value(serde_json::json!({ - "minReaderVersion": 3, - "minWriterVersion": 7, - "readerFeatures": reader_features, - "writerFeatures": writer_features - })) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - let mut configuration = HashMap::new(); - let mode_str = match effective_mode { - ColumnMappingModeOption::Name => "name", - ColumnMappingModeOption::Id => "id", - ColumnMappingModeOption::None => "none", - }; - configuration - .insert("delta.columnMapping.mode".to_string(), mode_str.to_string()); - // Set maxColumnId for new tables - let max_id = compute_max_column_id(&annotated_schema); configuration.insert( - "delta.columnMapping.maxColumnId".to_string(), - max_id.to_string(), + "delta.columnMapping.mode".to_string(), + effective_mode.as_ref().to_string(), ); - - let metadata = Metadata::try_new( - None, - None, - annotated_schema.clone(), - partition_columns.clone(), - 
Utc::now().timestamp_millis(), - configuration, - ) - .map_err(|e| DataFusionError::External(Box::new(e)))?; - - initial_actions.push(Action::Protocol(protocol.clone())); - initial_actions.push(Action::Metadata(metadata.clone())); - log::trace!( - "init_protocol: {:?}, init_metadata_has_mode: {:?}", - &protocol, - metadata.configuration().get("delta.columnMapping.mode") + configuration.insert( + "delta.columnMapping.maxColumnId".to_string(), + compute_max_column_id(&annotated_schema).to_string(), ); + annotated_schema_opt = Some(annotated_schema.clone()); + annotated_schema + } else { + kernel_schema + }; - operation = Some(DeltaOperation::Create { - mode: SaveMode::ErrorIfExists, - location: table_url.to_string(), - protocol, - metadata, - }); - } else if has_timestamp_ntz { - let protocol: Protocol = serde_json::from_value(serde_json::json!({ - "minReaderVersion": 3, - "minWriterVersion": 7, - "readerFeatures": ["timestampNtz"], - "writerFeatures": ["timestampNtz"] - })) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let protocol = protocol_for_create( + !matches!(effective_mode, ColumnMappingMode::None), + has_timestamp_ntz, + TableProperties::from(configuration.iter()).enable_in_commit_timestamps(), + ) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let metadata = metadata_for_create_with_struct_type( + metadata_schema, + partition_columns.clone(), + Utc::now().timestamp_millis(), + configuration, + ) + .map_err(|e| DataFusionError::External(Box::new(e)))?; - let metadata = Metadata::try_new( - None, - None, - kernel_schema, - partition_columns.clone(), - Utc::now().timestamp_millis(), - HashMap::new(), - ) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + initial_actions.push(Action::Protocol(protocol.clone())); + initial_actions.push(Action::Metadata(metadata.clone())); - initial_actions.push(Action::Protocol(protocol.clone())); - initial_actions.push(Action::Metadata(metadata.clone())); + log::trace!( + "init_protocol: {:?}, 
init_metadata_has_mode: {:?}", + &protocol, + metadata.configuration().get("delta.columnMapping.mode") + ); - operation = Some(DeltaOperation::Create { - mode: SaveMode::ErrorIfExists, - location: table_url.to_string(), - protocol, - metadata, - }); - } + operation = Some(DeltaOperation::Create { + mode: SaveMode::ErrorIfExists, + location: table_url.to_string(), + protocol: Box::new(protocol.clone()), + metadata: Box::new(metadata.clone()), + }); } // Build physical writer schema (use physical names and set parquet field ids) // Prefer schema from pending Metadata action (schema evolution) if present - let (writer_schema, physical_partition_columns, logical_kernel_for_mapping) = if matches!( - effective_mode, - ColumnMappingModeOption::Name | ColumnMappingModeOption::Id - ) { - // Determine logical kernel schema (annotated for new tables; from snapshot for existing tables) - let logical_kernel: StructType = if let Some(meta_action_schema) = schema_actions - .iter() - .find_map(|a| match a { - Action::Metadata(m) => Some( - m.parse_schema() - .map_err(|e| DataFusionError::External(Box::new(e))), - ), - _ => None, - }) - .transpose()? - { - meta_action_schema - } else if table_exists { - let table = table.as_ref().ok_or_else(|| { - DataFusionError::Internal( - "table exists but was not loaded for column-mapped write planning" - .to_string(), + let (writer_schema, physical_partition_columns, logical_kernel_for_mapping) = + if !matches!(effective_mode, ColumnMappingMode::None) { + // Determine logical kernel schema (annotated for new tables; from snapshot for existing tables) + let logical_kernel: StructType = if let Some(meta_action_schema) = + schema_actions + .iter() + .find_map(|a| match a { + Action::Metadata(m) => Some( + m.parse_schema() + .map_err(|e| DataFusionError::External(Box::new(e))), + ), + _ => None, + }) + .transpose()? 
+ { + meta_action_schema + } else if table_exists { + let table = table.as_ref().ok_or_else(|| { + DataFusionError::Internal( + "table exists but was not loaded for column-mapped write planning" + .to_string(), + ) + })?; + StructType::try_from( + table + .snapshot() + .map_err(|e| DataFusionError::External(Box::new(e)))? + .schema(), ) - })?; - table - .snapshot() .map_err(|e| DataFusionError::External(Box::new(e)))? - .snapshot() - .schema() - .clone() - } else { - annotated_schema_opt.clone().ok_or_else(|| { - DataFusionError::Plan( + } else { + annotated_schema_opt.clone().ok_or_else(|| { + DataFusionError::Plan( "Annotated schema should be present for new table with column mapping" .to_string(), ) - })? - }; - - // Build physical Arrow schema enriched with PARQUET:field_id - let enriched_arrow = get_physical_schema(&logical_kernel, kernel_mode); - let arc_schema = Arc::new(enriched_arrow); - let writer_field_names: Vec = arc_schema - .fields() - .iter() - .map(|f| f.name().clone()) - .collect(); - log::trace!( - "effective_mode: {:?}, writer_schema_fields: {:?}", - effective_mode, - &writer_field_names - ); + })? + }; - // Resolve logical partition columns to their physical names so that the - // writer can locate them in the batch when column mapping is enabled. 
- let resolved_partitions = partition_columns - .iter() - .map(|logical_name| { - let field = logical_kernel.field(logical_name).ok_or_else(|| { - DataFusionError::Plan(format!( - "Partition column '{}' not found in logical schema", - logical_name - )) - })?; - Ok(field.physical_name(kernel_mode).to_string()) - }) - .collect::>>()?; + // Build physical Arrow schema enriched with PARQUET:field_id + let enriched_arrow = get_physical_schema(&logical_kernel, kernel_mode); + let arc_schema = Arc::new(enriched_arrow); + let writer_field_names: Vec = arc_schema + .fields() + .iter() + .map(|f| f.name().clone()) + .collect(); + log::trace!( + "effective_mode: {:?}, writer_schema_fields: {:?}", + effective_mode, + &writer_field_names + ); - (arc_schema, resolved_partitions, Some(logical_kernel)) - } else { - (final_schema.clone(), partition_columns.clone(), None) - }; + // Resolve logical partition columns to their physical names so that the + // writer can locate them in the batch when column mapping is enabled. 
+ let resolved_partitions = partition_columns + .iter() + .map(|logical_name| { + let field = logical_kernel.field(logical_name).ok_or_else(|| { + DataFusionError::Plan(format!( + "Partition column '{}' not found in logical schema", + logical_name + )) + })?; + Ok(field.physical_name(kernel_mode).to_string()) + }) + .collect::>>()?; + + (arc_schema, resolved_partitions, Some(logical_kernel)) + } else { + (final_schema.clone(), partition_columns.clone(), None) + }; let writer_config = WriterConfig::new( writer_schema.clone(), @@ -710,21 +668,17 @@ impl DeltaWriterExec { let operation = operation_override.or(operation); - let mut operation_metrics: HashMap = HashMap::new(); - operation_metrics.insert("numOutputRows".to_string(), Value::from(total_rows)); - operation_metrics.insert("numFiles".to_string(), Value::from(num_added_files)); - operation_metrics.insert("numOutputFiles".to_string(), Value::from(num_added_files)); - operation_metrics.insert("numAddedFiles".to_string(), Value::from(num_added_files)); - operation_metrics.insert("numOutputBytes".to_string(), Value::from(num_added_bytes)); - operation_metrics.insert("numAddedBytes".to_string(), Value::from(num_added_bytes)); - operation_metrics.insert( - "writeTimeMs".to_string(), - Value::from(write_time_ms.saturating_add(close_time_ms)), - ); - operation_metrics.insert( - "executionTimeMs".to_string(), - Value::from(exec_start.elapsed().as_millis() as u64), - ); + let operation_metrics = OperationMetrics { + num_files: Some(num_added_files), + num_output_rows: Some(total_rows), + num_output_bytes: Some(num_added_bytes), + execution_time_ms: Some(exec_start.elapsed().as_millis() as u64), + num_added_files: Some(num_added_files), + num_output_files: Some(num_added_files), + num_added_bytes: Some(num_added_bytes), + write_time_ms: Some(write_time_ms.saturating_add(close_time_ms)), + ..Default::default() + }; output_rows.add(usize::try_from(total_rows).unwrap_or(usize::MAX)); 
output_bytes.add(usize::try_from(num_added_bytes).unwrap_or(usize::MAX)); @@ -734,21 +688,23 @@ impl DeltaWriterExec { // - schema evolution actions (metadata) // - Add actions (one row per file) // - CommitMeta row (row_count + operation + metrics) - let mut exec_actions: Vec = Vec::new(); + let mut output_actions: Vec = Vec::new(); if partition == 0 { for ia in &initial_actions { match ia { - Action::Protocol(p) => exec_actions.push(p.clone().try_into()?), - Action::Metadata(m) => exec_actions.push(m.clone().try_into()?), + Action::Protocol(_) | Action::Metadata(_) => { + output_actions.push(ia.clone()) + } _ => {} } } for sa in &actions { match sa { - Action::Metadata(m) => exec_actions.push(m.clone().try_into()?), - Action::Protocol(p) => exec_actions.push(p.clone().try_into()?), + Action::Metadata(_) | Action::Protocol(_) => { + output_actions.push(sa.clone()) + } _ => {} } } @@ -756,20 +712,18 @@ impl DeltaWriterExec { for action in actions { if let Action::Add(add) = action { - exec_actions.push(add.into()); + output_actions.push(Action::Add(add)); } } - exec_actions.push( - CommitMeta { + encode_actions( + output_actions, + Some(ExecCommitMeta { row_count: total_rows, operation, operation_metrics, - } - .try_into()?, - ); - - encode_actions(exec_actions) + }), + ) }; let stream = once(future); @@ -812,10 +766,11 @@ impl DeltaWriterExec { schema_mode: Option, ) -> Result<(SchemaRef, Vec)> { let table_metadata = table.snapshot()?.metadata(); - let table_schema = table_metadata - .parse_schema() - .map_err(|e| DataFusionError::External(Box::new(e)))?; - let table_arrow_schema = Arc::new((&table_schema).try_into_arrow()?); + let table_arrow_schema = Arc::new( + table_metadata + .parse_schema_arrow() + .map_err(|e| DataFusionError::External(Box::new(e)))?, + ); match schema_mode { Some(SchemaMode::Merge) => { @@ -823,14 +778,13 @@ impl DeltaWriterExec { let merged_schema = Self::merge_schemas(&table_arrow_schema, input_schema)?; if merged_schema.fields() != 
table_arrow_schema.fields() { // Schema has changed, create metadata action - let candidate_kernel: StructType = merged_schema - .as_ref() - .try_into_kernel() + let candidate_kernel = StructType::try_from(merged_schema.as_ref()) .map_err(|e| DataFusionError::External(Box::new(e)))?; let snapshot = table.snapshot()?; let current_metadata = snapshot.metadata(); - let current_kernel = snapshot.snapshot().schema().clone(); + let current_kernel = StructType::try_from(snapshot.schema()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; let kmode = snapshot.effective_column_mapping_mode(); // Delegate schema evolution to SchemaManager @@ -846,14 +800,13 @@ impl DeltaWriterExec { } Some(SchemaMode::Overwrite) => { // Use input schema as-is - let candidate_kernel: StructType = input_schema - .as_ref() - .try_into_kernel() + let candidate_kernel = StructType::try_from(input_schema.as_ref()) .map_err(|e| DataFusionError::External(Box::new(e)))?; let snapshot = table.snapshot()?; let current_metadata = snapshot.metadata(); - let current_kernel = snapshot.snapshot().schema().clone(); + let current_kernel = StructType::try_from(snapshot.schema()) + .map_err(|e| DataFusionError::External(Box::new(e)))?; let kmode = snapshot.effective_column_mapping_mode(); // Delegate schema overwrite to SchemaManager diff --git a/crates/sail-delta-lake/src/schema/arrow_conversions.rs b/crates/sail-delta-lake/src/schema/arrow_conversions.rs new file mode 100644 index 0000000000..b58f610f3c --- /dev/null +++ b/crates/sail-delta-lake/src/schema/arrow_conversions.rs @@ -0,0 +1,261 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{ + DataType as ArrowDataType, Field as ArrowField, Schema as ArrowSchema, + SchemaRef as ArrowSchemaRef, TimeUnit, +}; +use datafusion::arrow::error::ArrowError; +use itertools::Itertools; + +use crate::spec::schema::{ + ArrayType, DataType, MapType, MetadataValue, PrimitiveType, StructField, StructType, +}; + +// ── Delta 
→ Arrow ──────────────────────────────────────────────────────────── + +impl TryFrom<&StructType> for ArrowSchema { + type Error = ArrowError; + fn try_from(s: &StructType) -> Result { + let fields: Vec = s.fields().map(ArrowField::try_from).try_collect()?; + Ok(Self::new(fields)) + } +} + +impl TryFrom<&StructField> for ArrowField { + type Error = ArrowError; + fn try_from(f: &StructField) -> Result { + let metadata = f + .metadata() + .iter() + .map(|(key, val)| match val { + MetadataValue::String(val) => Ok((key.clone(), val.clone())), + _ => Ok((key.clone(), serde_json::to_string(val)?)), + }) + .collect::, serde_json::Error>>() + .map_err(|err| ArrowError::JsonError(err.to_string()))?; + + Ok(ArrowField::new( + f.name(), + ArrowDataType::try_from(f.data_type())?, + f.is_nullable(), + ) + .with_metadata(metadata)) + } +} + +impl TryFrom<&ArrayType> for ArrowField { + type Error = ArrowError; + fn try_from(a: &ArrayType) -> Result { + Ok(ArrowField::new( + "element", + ArrowDataType::try_from(a.element_type())?, + a.contains_null(), + )) + } +} + +impl TryFrom<&MapType> for ArrowField { + type Error = ArrowError; + fn try_from(m: &MapType) -> Result { + Ok(ArrowField::new( + "key_value", + ArrowDataType::Struct( + vec![ + ArrowField::new("key", ArrowDataType::try_from(m.key_type())?, false), + ArrowField::new( + "value", + ArrowDataType::try_from(m.value_type())?, + m.value_contains_null(), + ), + ] + .into(), + ), + false, + )) + } +} + +impl TryFrom<&DataType> for ArrowDataType { + type Error = ArrowError; + fn try_from(t: &DataType) -> Result { + match t { + DataType::Primitive(p) => match p { + PrimitiveType::String => Ok(Self::Utf8), + PrimitiveType::Long => Ok(Self::Int64), + PrimitiveType::Integer => Ok(Self::Int32), + PrimitiveType::Short => Ok(Self::Int16), + PrimitiveType::Byte => Ok(Self::Int8), + PrimitiveType::Float => Ok(Self::Float32), + PrimitiveType::Double => Ok(Self::Float64), + PrimitiveType::Boolean => Ok(Self::Boolean), + 
PrimitiveType::Binary => Ok(Self::Binary), + PrimitiveType::Decimal(dtype) => { + Ok(Self::Decimal128(dtype.precision(), dtype.scale() as i8)) + } + PrimitiveType::Date => Ok(Self::Date32), + PrimitiveType::Timestamp => { + Ok(Self::Timestamp(TimeUnit::Microsecond, Some("UTC".into()))) + } + PrimitiveType::TimestampNtz => Ok(Self::Timestamp(TimeUnit::Microsecond, None)), + }, + DataType::Struct(s) => Ok(Self::Struct( + s.fields() + .map(ArrowField::try_from) + .collect::, ArrowError>>()? + .into(), + )), + DataType::Array(a) => Ok(Self::List(Arc::new(ArrowField::try_from(a.as_ref())?))), + DataType::Map(m) => Ok(Self::Map( + Arc::new(ArrowField::try_from(m.as_ref())?), + false, + )), + DataType::Variant(s) => { + if *t == DataType::unshredded_variant() { + Ok(Self::Struct( + s.fields() + .map(ArrowField::try_from) + .collect::, ArrowError>>()? + .into(), + )) + } else { + Err(ArrowError::SchemaError( + "Incorrect Variant Schema: only unshredded variant is supported" + .to_string(), + )) + } + } + } + } +} + +// ── Arrow → Delta ──────────────────────────────────────────────────────────── + +impl TryFrom<&ArrowSchema> for StructType { + type Error = ArrowError; + fn try_from(arrow_schema: &ArrowSchema) -> Result { + StructType::try_from_results( + arrow_schema + .fields() + .iter() + .map(|field| StructField::try_from(field.as_ref())), + ) + .map_err(|e| ArrowError::from_external_error(Box::new(e))) + } +} + +impl TryFrom for StructType { + type Error = ArrowError; + fn try_from(arrow_schema: ArrowSchemaRef) -> Result { + StructType::try_from(arrow_schema.as_ref()) + } +} + +impl TryFrom<&ArrowField> for StructField { + type Error = ArrowError; + fn try_from(arrow_field: &ArrowField) -> Result { + Ok(StructField::new( + arrow_field.name().clone(), + DataType::try_from(arrow_field.data_type())?, + arrow_field.is_nullable(), + ) + .with_metadata( + arrow_field + .metadata() + .iter() + .map(|(k, v)| (k.clone(), parse_metadata_value(v))), + )) + } +} + +fn 
parse_metadata_value(v: &str) -> MetadataValue { + match serde_json::from_str::(v) { + Ok(serde_json::Value::Number(n)) => n + .as_i64() + .map(MetadataValue::Number) + .unwrap_or_else(|| MetadataValue::String(v.to_string())), + Ok(serde_json::Value::Bool(b)) => MetadataValue::Boolean(b), + Ok(serde_json::Value::String(s)) => MetadataValue::String(s), + Ok(other) => MetadataValue::Other(other), + Err(_) => MetadataValue::String(v.to_string()), + } +} + +impl TryFrom<&ArrowDataType> for DataType { + type Error = ArrowError; + fn try_from(arrow_datatype: &ArrowDataType) -> Result { + match arrow_datatype { + ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Utf8View => { + Ok(DataType::STRING) + } + ArrowDataType::Int64 | ArrowDataType::UInt64 => Ok(DataType::LONG), + ArrowDataType::Int32 | ArrowDataType::UInt32 => Ok(DataType::INTEGER), + ArrowDataType::Int16 | ArrowDataType::UInt16 => Ok(DataType::SHORT), + ArrowDataType::Int8 | ArrowDataType::UInt8 => Ok(DataType::BYTE), + ArrowDataType::Float32 => Ok(DataType::FLOAT), + ArrowDataType::Float64 => Ok(DataType::DOUBLE), + ArrowDataType::Boolean => Ok(DataType::BOOLEAN), + ArrowDataType::Binary + | ArrowDataType::FixedSizeBinary(_) + | ArrowDataType::LargeBinary + | ArrowDataType::BinaryView => Ok(DataType::BINARY), + ArrowDataType::Decimal128(p, s) => { + if *s < 0 { + return Err(ArrowError::SchemaError( + "Negative scales are not supported in Delta".to_string(), + )); + } + DataType::decimal(*p, *s as u8) + .map_err(|e| ArrowError::from_external_error(Box::new(e))) + } + ArrowDataType::Date32 | ArrowDataType::Date64 => Ok(DataType::DATE), + ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => Ok(DataType::TIMESTAMP_NTZ), + ArrowDataType::Timestamp(TimeUnit::Microsecond, Some(tz)) + if tz.eq_ignore_ascii_case("utc") => + { + Ok(DataType::TIMESTAMP) + } + ArrowDataType::Struct(fields) => DataType::try_struct_type_from_results( + fields + .iter() + .map(|field| StructField::try_from(field.as_ref())), 
+ ) + .map_err(|e| ArrowError::from_external_error(Box::new(e))), + ArrowDataType::List(field) + | ArrowDataType::ListView(field) + | ArrowDataType::LargeList(field) + | ArrowDataType::LargeListView(field) + | ArrowDataType::FixedSizeList(field, _) => Ok(ArrayType::new( + DataType::try_from(field.data_type())?, + field.is_nullable(), + ) + .into()), + ArrowDataType::Map(field, _) => { + if let ArrowDataType::Struct(struct_fields) = field.data_type() { + let key_type = DataType::try_from(struct_fields[0].data_type())?; + let value_type = DataType::try_from(struct_fields[1].data_type())?; + Ok(MapType::new(key_type, value_type, struct_fields[1].is_nullable()).into()) + } else { + Err(ArrowError::SchemaError( + "DataType::Map should contain a struct field child".to_string(), + )) + } + } + ArrowDataType::Dictionary(_, value_type) => { + Ok(DataType::try_from(value_type.as_ref())?) + } + unsupported => Err(ArrowError::SchemaError(format!( + "Invalid data type for Delta Lake: {unsupported}" + ))), + } + } +} + +impl TryFrom for DataType { + type Error = crate::spec::DeltaError; + + fn try_from(schema: ArrowSchemaRef) -> Result { + let struct_type = StructType::try_from(schema)?; + Ok(DataType::Struct(Box::new(struct_type))) + } +} diff --git a/crates/sail-delta-lake/src/schema/converter.rs b/crates/sail-delta-lake/src/schema/converter.rs index ddcc0902eb..5c41d9c6dc 100644 --- a/crates/sail-delta-lake/src/schema/converter.rs +++ b/crates/sail-delta-lake/src/schema/converter.rs @@ -2,25 +2,13 @@ use std::collections::HashMap; use std::sync::Arc; use datafusion::arrow::datatypes::{ - DataType as ArrowDataType, Field, Schema as ArrowSchema, SchemaRef, + DataType as ArrowDataType, Field, Fields, Schema as ArrowSchema, SchemaRef, }; -use delta_kernel::engine::arrow_conversion::{TryIntoArrow, TryIntoKernel}; -use delta_kernel::schema::{ - ColumnMetadataKey, MetadataValue, StructField as KernelStructField, StructType, -}; -use delta_kernel::table_features::ColumnMappingMode; - 
-use crate::kernel::{DeltaResult, DeltaTableError}; - -pub fn logical_arrow_to_kernel(arrow: &ArrowSchema) -> DeltaResult { - Ok(arrow.try_into_kernel()?) -} -pub fn kernel_to_logical_arrow(schema: &StructType) -> ArrowSchema { - schema - .try_into_arrow() - .unwrap_or_else(|_| ArrowSchema::empty()) -} +use crate::spec::{ + ColumnMappingMode, ColumnMetadataKey, DataType, DeltaError as DeltaTableError, DeltaResult, + MetadataValue, StructField, StructType, +}; pub fn arrow_schema_from_struct_type( schema: &StructType, @@ -50,9 +38,8 @@ pub fn arrow_schema_from_struct_type( pub fn get_physical_arrow_schema(logical: &StructType, mode: ColumnMappingMode) -> ArrowSchema { let physical_kernel = logical.make_physical(mode); - let physical_arrow: ArrowSchema = (&physical_kernel) - .try_into_arrow() - .unwrap_or_else(|_| ArrowSchema::empty()); + let physical_arrow: ArrowSchema = + ArrowSchema::try_from(&physical_kernel).unwrap_or_else(|_| ArrowSchema::empty()); match mode { ColumnMappingMode::Name | ColumnMappingMode::Id => { enrich_arrow_with_parquet_field_ids(&physical_arrow, logical) @@ -61,8 +48,120 @@ pub fn get_physical_arrow_schema(logical: &StructType, mode: ColumnMappingMode) } } -fn field_from_struct_field(field: &KernelStructField) -> Result { - let arrow_field: Field = field.try_into_arrow()?; +/// Apply Delta column mapping to an Arrow schema, renaming logical→physical column names. +/// +/// This is the Arrow-native equivalent of `StructType::make_physical`. It reads the +/// `delta.columnMapping.physicalName` metadata from each Arrow field and renames the +/// field accordingly. +pub fn make_physical_arrow_schema(logical: &ArrowSchema, mode: ColumnMappingMode) -> ArrowSchema { + let new_fields: Vec = logical + .fields() + .iter() + .map(|f| make_physical_arrow_field(f.as_ref(), mode)) + .collect(); + ArrowSchema::new(new_fields).with_metadata(logical.metadata().clone()) +} + +/// Get the physical name of an Arrow field under a given column mapping mode. 
+/// +/// This is the Arrow-native equivalent of `StructField::physical_name`. +pub fn arrow_field_physical_name(field: &Field, mode: ColumnMappingMode) -> &str { + match mode { + ColumnMappingMode::None => field.name().as_str(), + ColumnMappingMode::Id | ColumnMappingMode::Name => field + .metadata() + .get(ColumnMetadataKey::ColumnMappingPhysicalName.as_ref()) + .map(|s| s.as_str()) + .unwrap_or_else(|| field.name().as_str()), + } +} + +fn make_physical_arrow_field(field: &Field, mode: ColumnMappingMode) -> Field { + let physical_name_key = ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(); + let field_id_key = ColumnMetadataKey::ColumnMappingId.as_ref(); + let parquet_field_id_key = ColumnMetadataKey::ParquetFieldId.as_ref(); + + let mut meta = field.metadata().clone(); + + let name = match mode { + ColumnMappingMode::None => field.name().clone(), + ColumnMappingMode::Id | ColumnMappingMode::Name => meta + .get(physical_name_key) + .cloned() + .unwrap_or_else(|| field.name().clone()), + }; + + match mode { + ColumnMappingMode::Id => { + if let Some(fid) = meta.get(field_id_key).cloned() { + meta.insert(parquet_field_id_key.to_string(), fid); + } + } + ColumnMappingMode::Name => { + meta.remove(field_id_key); + meta.remove(parquet_field_id_key); + } + ColumnMappingMode::None => { + meta.remove(physical_name_key); + meta.remove(field_id_key); + meta.remove(parquet_field_id_key); + } + } + + let new_dt = make_physical_arrow_data_type(field.data_type(), mode); + Field::new(name, new_dt, field.is_nullable()).with_metadata(meta) +} + +fn make_physical_arrow_data_type(dt: &ArrowDataType, mode: ColumnMappingMode) -> ArrowDataType { + match dt { + ArrowDataType::Struct(fields) => { + let new_fields: Fields = fields + .iter() + .map(|f| Arc::new(make_physical_arrow_field(f.as_ref(), mode))) + .collect(); + ArrowDataType::Struct(new_fields) + } + other => other.clone(), + } +} + +/// Build an Arrow schema from an Arrow schema, reordering partition columns to the end 
+/// and optionally wrapping partition column types in a dictionary type. +pub fn arrow_schema_reorder_partitions( + schema: &ArrowSchema, + partition_columns: &[String], + wrap_partitions: bool, +) -> DeltaResult { + let mut non_partition_fields: Vec = schema + .fields() + .iter() + .filter(|f| !partition_columns.contains(f.name())) + .map(|f| f.as_ref().clone()) + .collect(); + + let partition_fields: Vec = + partition_columns + .iter() + .map(|col| { + let f = schema + .field_with_name(col) + .map_err(|_| DeltaTableError::missing_column(col))?; + let corrected = if wrap_partitions { + wrap_partition_type(f.data_type()) + } else { + f.data_type().clone() + }; + Ok(Field::new(f.name(), corrected, f.is_nullable()) + .with_metadata(f.metadata().clone())) + }) + .collect::, DeltaTableError>>()?; + + non_partition_fields.extend(partition_fields); + Ok(Arc::new(ArrowSchema::new(non_partition_fields))) +} + +fn field_from_struct_field(field: &StructField) -> Result { + let arrow_field: Field = Field::try_from(field)?; let field_type = arrow_field.data_type().clone(); Ok(Field::new( field.name().to_string(), @@ -105,16 +204,14 @@ pub fn enrich_arrow_with_parquet_field_ids( out.insert(path.clone(), (id, f.name().clone())); match f.data_type() { - delta_kernel::schema::DataType::Struct(nst) => { - build_path_map(nst.as_ref(), path, out) - } - delta_kernel::schema::DataType::Array(at) => { - if let delta_kernel::schema::DataType::Struct(nst) = at.element_type() { + DataType::Struct(nst) => build_path_map(nst.as_ref(), path, out), + DataType::Array(at) => { + if let DataType::Struct(nst) = at.element_type() { build_path_map(nst.as_ref(), path, out) } } - delta_kernel::schema::DataType::Map(mt) => { - if let delta_kernel::schema::DataType::Struct(nst) = mt.value_type() { + DataType::Map(mt) => { + if let DataType::Struct(nst) = mt.value_type() { build_path_map(nst.as_ref(), path, out) } } diff --git a/crates/sail-delta-lake/src/schema/manager.rs 
b/crates/sail-delta-lake/src/schema/manager.rs index ece0e084fd..5ddc67a3fa 100644 --- a/crates/sail-delta-lake/src/schema/manager.rs +++ b/crates/sail-delta-lake/src/schema/manager.rs @@ -10,22 +10,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -use datafusion::arrow::datatypes::Schema as ArrowSchema; -use delta_kernel::schema::StructType; -use delta_kernel::table_features::ColumnMappingMode; +use std::collections::HashMap; -use super::converter::get_physical_arrow_schema; -use super::mapping::{ - annotate_new_fields_for_column_mapping, annotate_schema_for_column_mapping, - compute_max_column_id, -}; -use crate::kernel::models::{Metadata, MetadataExt}; -use crate::kernel::DeltaResult; - -/// Annotate a kernel schema for column mapping (assign ids + physical names). -pub fn annotate_for_column_mapping(schema: &StructType) -> StructType { - annotate_schema_for_column_mapping(schema) -} +use super::mapping::{annotate_new_fields_for_column_mapping, compute_max_column_id}; +use crate::spec::{ColumnMappingMode, DeltaResult, Metadata, Protocol, StructType, TableFeature}; /// Evolve table schema and update metadata according to column mapping mode. pub fn evolve_schema( @@ -48,7 +36,7 @@ pub fn evolve_schema( let meta_with_max = meta_with_schema.add_config_key( "delta.columnMapping.maxColumnId".to_string(), last_id.to_string(), - )?; + ); (annotated, meta_with_max) } else { let meta = metadata.clone().with_schema(candidate)?; @@ -57,8 +45,72 @@ pub fn evolve_schema( Ok(updated) } -/// Get the Arrow physical schema for reading/writing files, enriched with PARQUET:field_id -/// when column mapping Name/Id mode is active. -pub fn get_physical_schema(logical: &StructType, mode: ColumnMappingMode) -> ArrowSchema { - get_physical_arrow_schema(logical, mode) +/// Build Metadata for table creation from an existing kernel StructType. 
+pub fn metadata_for_create_with_struct_type( + schema: StructType, + partition_columns: Vec, + created_time: i64, + configuration: HashMap, +) -> DeltaResult { + Metadata::try_new( + None, + None, + schema, + partition_columns, + created_time, + configuration, + ) +} + +/// Build Protocol for a create/write path based on required table features. +pub fn protocol_for_create( + enable_column_mapping: bool, + enable_timestamp_ntz: bool, + enable_in_commit_timestamps: bool, +) -> DeltaResult { + if !enable_column_mapping && !enable_timestamp_ntz && !enable_in_commit_timestamps { + return Ok(Protocol::new(1, 2, None, None)); + } + + let mut reader_features = Vec::new(); + let mut writer_features = Vec::new(); + if enable_column_mapping { + reader_features.push(TableFeature::ColumnMapping); + writer_features.push(TableFeature::ColumnMapping); + } + if enable_timestamp_ntz { + reader_features.push(TableFeature::TimestampWithoutTimezone); + writer_features.push(TableFeature::TimestampWithoutTimezone); + } + if enable_in_commit_timestamps { + writer_features.push(TableFeature::InCommitTimestamp); + } + + let min_reader_version = if reader_features.is_empty() { 1 } else { 3 }; + + Ok(Protocol::new( + min_reader_version, + 7, + Some(reader_features), + Some(writer_features), + )) +} + +#[cfg(test)] +mod tests { + use super::protocol_for_create; + use crate::spec::{DeltaResult, TableFeature}; + + #[test] + fn protocol_for_create_treats_in_commit_timestamp_as_writer_only() -> DeltaResult<()> { + let protocol = protocol_for_create(false, false, true)?; + assert_eq!(protocol.min_reader_version(), 1); + assert_eq!(protocol.min_writer_version(), 7); + assert_eq!(protocol.reader_features(), None); + assert_eq!( + protocol.writer_features(), + Some([TableFeature::InCommitTimestamp].as_slice()) + ); + Ok(()) + } } diff --git a/crates/sail-delta-lake/src/schema/mapping.rs b/crates/sail-delta-lake/src/schema/mapping.rs index 32d3f5d880..2e7d3339af 100644 --- 
a/crates/sail-delta-lake/src/schema/mapping.rs +++ b/crates/sail-delta-lake/src/schema/mapping.rs @@ -13,7 +13,7 @@ use std::collections::HashMap; use std::sync::atomic::{AtomicI64, Ordering}; -use delta_kernel::schema::{ArrayType, DataType, MapType, MetadataValue, StructField, StructType}; +use crate::spec::{ArrayType, DataType, MapType, MetadataValue, StructField, StructType}; /// Annotate a logical kernel schema with column mapping metadata (id + physicalName) /// using a sequential id assignment. Intended only for new table creation (name mode). diff --git a/crates/sail-delta-lake/src/schema/mod.rs b/crates/sail-delta-lake/src/schema/mod.rs index 16ef7a741f..701979ffb5 100644 --- a/crates/sail-delta-lake/src/schema/mod.rs +++ b/crates/sail-delta-lake/src/schema/mod.rs @@ -10,17 +10,19 @@ // See the License for the specific language governing permissions and // limitations under the License. +pub mod arrow_conversions; pub mod converter; pub mod manager; pub mod mapping; pub mod normalize; pub use converter::{ - arrow_schema_from_struct_type, kernel_to_logical_arrow, logical_arrow_to_kernel, + arrow_field_physical_name, arrow_schema_from_struct_type, arrow_schema_reorder_partitions, + get_physical_arrow_schema as get_physical_schema, make_physical_arrow_schema, }; -pub use manager::{annotate_for_column_mapping, evolve_schema, get_physical_schema}; +pub use manager::{evolve_schema, metadata_for_create_with_struct_type, protocol_for_create}; pub use mapping::{ annotate_new_fields_for_column_mapping, annotate_schema_for_column_mapping, - compute_max_column_id, + annotate_schema_for_column_mapping as annotate_for_column_mapping, compute_max_column_id, }; pub use normalize::normalize_delta_schema; diff --git a/crates/sail-delta-lake/src/session_extension.rs b/crates/sail-delta-lake/src/session_extension.rs new file mode 100644 index 0000000000..d558a6aa3b --- /dev/null +++ b/crates/sail-delta-lake/src/session_extension.rs @@ -0,0 +1,99 @@ +use std::sync::Arc; + +use 
datafusion::execution::runtime_env::RuntimeEnv; +use datafusion::execution::TaskContext; +use datafusion_common::{DataFusionError, Result}; +use moka::future::Cache as FutureCache; +use url::Url; + +use crate::kernel::DeltaTableConfig; +use crate::storage::StorageConfig; +use crate::table::{open_table_with_object_store_and_table_config_at_version, DeltaSnapshot}; + +const DEFAULT_MAX_ENTRIES: u64 = 1024; + +#[derive(Clone, Debug, Eq, PartialEq, Hash)] +pub(crate) struct TableCacheKey { + pub(crate) table_url: String, + pub(crate) version: i64, +} + +pub(crate) struct CachedTable { + pub(crate) snapshot: Arc, + pub(crate) log_store: crate::storage::LogStoreRef, +} + +pub struct DeltaTableCache { + cache: FutureCache>, +} + +impl DeltaTableCache { + pub fn new(max_entries: u64) -> Self { + let cache = FutureCache::builder().max_capacity(max_entries).build(); + Self { cache } + } + + pub(crate) async fn get( + &self, + context: &TaskContext, + table_url: &Url, + version: i64, + ) -> Result> { + let key = TableCacheKey { + table_url: table_url.to_string(), + version, + }; + let runtime_env = context.runtime_env(); + let table_url = table_url.clone(); + self.cache + .try_get_with(key, async move { + load_table_uncached(runtime_env, &table_url, version).await + }) + .await + .map_err(|e| DataFusionError::External(Box::new(e))) + } +} + +impl Default for DeltaTableCache { + fn default() -> Self { + Self::new(DEFAULT_MAX_ENTRIES) + } +} + +impl sail_common_datafusion::extension::SessionExtension for DeltaTableCache { + fn name() -> &'static str { + "delta_table_cache" + } +} + +pub(crate) async fn load_table_uncached( + runtime_env: Arc, + table_url: &Url, + version: i64, +) -> Result> { + let object_store = runtime_env + .object_store_registry + .get_store(table_url) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let table_config = DeltaTableConfig { + require_files: false, + ..Default::default() + }; + let table = 
open_table_with_object_store_and_table_config_at_version( + table_url.clone(), + object_store, + StorageConfig, + table_config, + version, + ) + .await + .map_err(|e| DataFusionError::External(Box::new(e)))?; + let snapshot_state = table + .snapshot() + .map_err(|e| DataFusionError::External(Box::new(e)))? + .clone(); + Ok(Arc::new(CachedTable { + snapshot: snapshot_state, + log_store: table.log_store(), + })) +} diff --git a/crates/sail-delta-lake/src/spec/action_schema.rs b/crates/sail-delta-lake/src/spec/action_schema.rs new file mode 100644 index 0000000000..0a432aba40 --- /dev/null +++ b/crates/sail-delta-lake/src/spec/action_schema.rs @@ -0,0 +1,124 @@ +use crate::spec::schema::{ArrayType, DataType, MapType, StructField, StructType}; + +fn string_map_type(value_contains_null: bool) -> DataType { + MapType::new(DataType::STRING, DataType::STRING, value_contains_null).into() +} + +fn string_list_type() -> DataType { + ArrayType::new(DataType::STRING, true).into() +} + +fn deletion_vector_data_type() -> DataType { + deletion_vector_struct_type().into() +} + +fn metadata_format_data_type() -> DataType { + StructType::new_unchecked([ + StructField::not_null("provider", DataType::STRING), + StructField::not_null("options", string_map_type(true)), + ]) + .into() +} + +pub fn deletion_vector_struct_type() -> StructType { + StructType::new_unchecked([ + StructField::not_null("storageType", DataType::STRING), + StructField::not_null("pathOrInlineDv", DataType::STRING), + StructField::nullable("offset", DataType::INTEGER), + StructField::not_null("sizeInBytes", DataType::INTEGER), + StructField::not_null("cardinality", DataType::LONG), + ]) +} + +pub fn add_struct_type() -> StructType { + StructType::new_unchecked([ + StructField::not_null("path", DataType::STRING), + StructField::not_null("partitionValues", string_map_type(true)), + StructField::not_null("size", DataType::LONG), + StructField::not_null("modificationTime", DataType::LONG), + 
StructField::not_null("dataChange", DataType::BOOLEAN), + StructField::nullable("stats", DataType::STRING), + StructField::nullable("tags", string_map_type(true)), + StructField::nullable("deletionVector", deletion_vector_data_type()), + StructField::nullable("baseRowId", DataType::LONG), + StructField::nullable("defaultRowCommitVersion", DataType::LONG), + StructField::nullable("clusteringProvider", DataType::STRING), + ]) +} + +pub fn remove_struct_type() -> StructType { + StructType::new_unchecked([ + StructField::not_null("path", DataType::STRING), + StructField::not_null("dataChange", DataType::BOOLEAN), + StructField::nullable("deletionTimestamp", DataType::LONG), + StructField::nullable("extendedFileMetadata", DataType::BOOLEAN), + StructField::nullable("partitionValues", string_map_type(true)), + StructField::nullable("size", DataType::LONG), + StructField::nullable("stats", DataType::STRING), + StructField::nullable("tags", string_map_type(true)), + StructField::nullable("deletionVector", deletion_vector_data_type()), + StructField::nullable("baseRowId", DataType::LONG), + StructField::nullable("defaultRowCommitVersion", DataType::LONG), + ]) +} + +pub fn protocol_struct_type() -> StructType { + StructType::new_unchecked([ + StructField::not_null("minReaderVersion", DataType::INTEGER), + StructField::not_null("minWriterVersion", DataType::INTEGER), + StructField::nullable("readerFeatures", string_list_type()), + StructField::nullable("writerFeatures", string_list_type()), + ]) +} + +pub fn metadata_struct_type() -> StructType { + StructType::new_unchecked([ + StructField::not_null("id", DataType::STRING), + StructField::nullable("name", DataType::STRING), + StructField::nullable("description", DataType::STRING), + StructField::not_null("format", metadata_format_data_type()), + StructField::not_null("schemaString", DataType::STRING), + StructField::not_null("partitionColumns", string_list_type()), + StructField::nullable("createdTime", DataType::LONG), + 
StructField::not_null("configuration", string_map_type(true)), + ]) +} + +pub fn transaction_struct_type() -> StructType { + StructType::new_unchecked([ + StructField::not_null("appId", DataType::STRING), + StructField::not_null("version", DataType::LONG), + StructField::nullable("lastUpdated", DataType::LONG), + ]) +} + +#[cfg(test)] +mod tests { + use super::{add_struct_type, metadata_struct_type, remove_struct_type}; + + #[test] + fn add_schema_keeps_extended_fields() { + let add = add_struct_type(); + assert!(add.field("stats").is_some()); + assert!(add.field("tags").is_some()); + assert!(add.field("deletionVector").is_some()); + assert!(add.field("clusteringProvider").is_some()); + } + + #[test] + fn remove_schema_keeps_optional_partition_values() { + let remove = remove_struct_type(); + #[expect(clippy::expect_used)] + let partition_values = remove + .field("partitionValues") + .expect("remove schema should contain partitionValues"); + assert!(partition_values.is_nullable()); + } + + #[test] + fn metadata_schema_keeps_configuration_field() { + let metadata = metadata_struct_type(); + assert!(metadata.field("configuration").is_some()); + assert!(metadata.field("schemaString").is_some()); + } +} diff --git a/crates/sail-delta-lake/src/kernel/models/actions.rs b/crates/sail-delta-lake/src/spec/actions.rs similarity index 79% rename from crates/sail-delta-lake/src/kernel/models/actions.rs rename to crates/sail-delta-lake/src/spec/actions.rs index ac3f62d102..8191b52d85 100644 --- a/crates/sail-delta-lake/src/kernel/models/actions.rs +++ b/crates/sail-delta-lake/src/spec/actions.rs @@ -1,8 +1,8 @@ // https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt // // Copyright (2020) QP Hou and a number of other contributors. -// Portions Copyright (2025) LakeSail, Inc. -// Modified in 2025 by LakeSail, Inc. +// Portions Copyright 2025-2026 LakeSail, Inc. +// Modified in 2026 by LakeSail, Inc. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,10 +16,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [Credit]: -// [Credit]: -// [Credit]: - use std::borrow::Borrow; use std::collections::HashMap; use std::fmt; @@ -27,14 +23,15 @@ use std::hash::{Hash, Hasher}; use std::str::FromStr; use chrono::DateTime; -use delta_kernel::actions::{Metadata, Protocol}; use object_store::path::Path; use object_store::ObjectMeta; use serde::{Deserialize, Serialize}; -use crate::kernel::statistics::Stats; -use crate::kernel::{DeltaResult, DeltaTableError}; +use crate::spec::statistics::Stats; +use crate::spec::{DeltaError as DeltaTableError, DeltaResult, IsolationLevel, Metadata, Protocol}; +// [Credit]: +// [Credit]: #[derive(Serialize, Deserialize, Copy, Clone, Debug, PartialEq, Eq, Default)] pub enum StorageType { #[serde(rename = "u")] @@ -77,7 +74,7 @@ impl fmt::Display for StorageType { } } -#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq, Default)] #[serde(rename_all = "camelCase")] pub struct DeletionVectorDescriptor { pub storage_type: StorageType, @@ -92,7 +89,7 @@ pub struct DeletionVectorDescriptor { #[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)] #[serde(rename_all = "camelCase")] pub enum Action { - #[serde(rename = "metaData")] + #[serde(rename = "metaData", alias = "metadata")] Metadata(Metadata), Protocol(Protocol), Add(Add), @@ -187,6 +184,7 @@ impl Add { extended_file_metadata: options.extended_file_metadata, partition_values: Some(self.partition_values), size: Some(self.size), + stats: None, tags, deletion_vector: self.deletion_vector, base_row_id: self.base_row_id, @@ -210,7 +208,7 @@ impl Borrow for Remove { #[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize, Serialize)] #[serde(rename_all = 
"camelCase")] pub struct Add { - #[serde(with = "serde_path")] + #[serde(with = "crate::spec::utils::serde_path")] pub path: String, pub partition_values: HashMap>, pub size: i64, @@ -236,7 +234,7 @@ pub struct Add { #[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize, Serialize)] #[serde(rename_all = "camelCase")] pub struct Remove { - #[serde(with = "serde_path")] + #[serde(with = "crate::spec::utils::serde_path")] pub path: String, pub data_change: bool, #[serde(skip_serializing_if = "Option::is_none")] @@ -248,6 +246,8 @@ pub struct Remove { #[serde(skip_serializing_if = "Option::is_none")] pub size: Option, #[serde(skip_serializing_if = "Option::is_none")] + pub stats: Option, + #[serde(skip_serializing_if = "Option::is_none")] pub tags: Option>>, #[serde(skip_serializing_if = "Option::is_none")] pub deletion_vector: Option, @@ -276,7 +276,7 @@ impl Default for RemoveOptions { #[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize, Serialize)] #[serde(rename_all = "camelCase")] pub struct AddCDCFile { - #[serde(with = "serde_path")] + #[serde(with = "crate::spec::utils::serde_path")] pub path: String, pub partition_values: HashMap>, pub size: i64, @@ -295,40 +295,6 @@ pub struct Transaction { pub last_updated: Option, } -/// The isolation level applied during a transaction. 
-#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, Default)] -pub enum IsolationLevel { - #[default] - Serializable, - WriteSerializable, - SnapshotIsolation, -} - -impl AsRef for IsolationLevel { - fn as_ref(&self) -> &str { - match self { - Self::Serializable => "Serializable", - Self::WriteSerializable => "WriteSerializable", - Self::SnapshotIsolation => "SnapshotIsolation", - } - } -} - -impl FromStr for IsolationLevel { - type Err = DeltaTableError; - - fn from_str(s: &str) -> Result { - match s.to_ascii_lowercase().as_str() { - "serializable" => Ok(Self::Serializable), - "writeserializable" | "write_serializable" => Ok(Self::WriteSerializable), - "snapshotisolation" | "snapshot_isolation" => Ok(Self::SnapshotIsolation), - _ => Err(DeltaTableError::generic(format!( - "Invalid string for IsolationLevel: {s}" - ))), - } - } -} - /// Commit metadata action. #[derive(Debug, Clone, Default, PartialEq, Eq, Deserialize, Serialize)] #[serde(rename_all = "camelCase")] @@ -351,6 +317,8 @@ pub struct CommitInfo { pub is_blind_append: Option, #[serde(skip_serializing_if = "Option::is_none")] pub engine_info: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub in_commit_timestamp: Option, #[serde(flatten, default)] pub info: HashMap, #[serde(skip_serializing_if = "Option::is_none")] @@ -417,54 +385,3 @@ impl TryFrom<&Add> for ObjectMeta { }) } } - -/// Serde helpers for encoding/decoding log paths. 
-pub(crate) mod serde_path { - use std::str::Utf8Error; - - use percent_encoding::{percent_decode_str, percent_encode, AsciiSet, CONTROLS}; - use serde::{Deserialize, Deserializer, Serialize, Serializer}; - - pub fn deserialize<'de, D>(deserializer: D) -> Result - where - D: Deserializer<'de>, - { - let s = String::deserialize(deserializer)?; - decode_path(&s).map_err(serde::de::Error::custom) - } - - pub fn serialize(value: &str, serializer: S) -> Result - where - S: Serializer, - { - let encoded = encode_path(value); - String::serialize(&encoded, serializer) - } - - const INVALID: &AsciiSet = &CONTROLS - .add(b'\\') - .add(b'{') - .add(b'^') - .add(b'}') - .add(b'%') - .add(b'`') - .add(b']') - .add(b'"') - .add(b'>') - .add(b'[') - .add(b'<') - .add(b'#') - .add(b'|') - .add(b'\r') - .add(b'\n') - .add(b'*') - .add(b'?'); - - fn encode_path(path: &str) -> String { - percent_encode(path.as_bytes(), INVALID).to_string() - } - - pub fn decode_path(path: &str) -> Result { - Ok(percent_decode_str(path).decode_utf8()?.to_string()) - } -} diff --git a/crates/sail-delta-lake/src/spec/checkpoint.rs b/crates/sail-delta-lake/src/spec/checkpoint.rs new file mode 100644 index 0000000000..9328998542 --- /dev/null +++ b/crates/sail-delta-lake/src/spec/checkpoint.rs @@ -0,0 +1,81 @@ +// https://github.com/delta-io/delta-kernel-rs/blob/f105333a003232d7284f1a8f06cca3b6d6b232a9/LICENSE +// +// Copyright 2023-2024 The Delta Kernel Rust Authors +// Portions Copyright 2025-2026 LakeSail, Inc. +// Ported and modified in 2026 by LakeSail, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::spec::{ + add_struct_type, metadata_struct_type, protocol_struct_type, remove_struct_type, + transaction_struct_type, Add, DataType, Metadata, Protocol, Remove, StructField, StructType, + Transaction, +}; + +// [Credit]: +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct CheckpointActionRow { + #[serde(skip_serializing_if = "Option::is_none")] + pub add: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub remove: Option, + #[serde( + rename = "metaData", + alias = "metadata", + skip_serializing_if = "Option::is_none" + )] + pub metadata: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub protocol: Option, + #[serde(rename = "txn", skip_serializing_if = "Option::is_none")] + pub txn: Option, +} + +impl CheckpointActionRow { + pub fn struct_type() -> StructType { + // TODO: sidecar + StructType::new_unchecked([ + StructField::nullable("add", DataType::from(add_struct_type())), + StructField::nullable("remove", DataType::from(remove_struct_type())), + StructField::nullable("metaData", DataType::from(metadata_struct_type())), + StructField::nullable("protocol", DataType::from(protocol_struct_type())), + StructField::nullable("txn", DataType::from(transaction_struct_type())), + ]) + } +} + +// [Credit]: +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct LastCheckpointHint { + pub version: i64, + #[serde(skip_serializing_if = 
"Option::is_none")] + pub size: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub parts: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub size_in_bytes: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub num_of_add_files: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub checkpoint_schema: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub checksum: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub tags: Option>, +} diff --git a/crates/sail-delta-lake/src/spec/checksum.rs b/crates/sail-delta-lake/src/spec/checksum.rs new file mode 100644 index 0000000000..b241180c3e --- /dev/null +++ b/crates/sail-delta-lake/src/spec/checksum.rs @@ -0,0 +1,95 @@ +// https://github.com/delta-io/delta/blob/master/PROTOCOL.md +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +use crate::spec::actions::DomainMetadata; +use crate::spec::{Add, Metadata, Protocol, Transaction}; + +// Sidecar checksum for a committed Delta log version. 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct VersionChecksum { + #[serde(skip_serializing_if = "Option::is_none")] + pub txn_id: Option, + pub table_size_bytes: i64, + pub num_files: i64, + pub num_metadata: i64, + pub num_protocol: i64, + #[serde(skip_serializing_if = "Option::is_none")] + pub in_commit_timestamp_opt: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub set_transactions: Option>, + // TODO: Populate once replay retains the latest DomainMetadata actions. + #[serde(skip_serializing_if = "Option::is_none")] + pub domain_metadata: Option>, + pub metadata: Metadata, + pub protocol: Protocol, + // TODO: Fill these optional fields when we can compute them + // faithfully from reconciled state instead of omitting them. + #[serde(skip_serializing_if = "Option::is_none")] + pub file_size_histogram: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub all_files: Option>, +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::VersionChecksum; + use crate::spec::{ + DataType, DeltaResult, Metadata, Protocol, StructField, StructType, TableFeature, + }; + + fn test_metadata() -> DeltaResult { + Metadata::try_new( + Some("test".to_string()), + Some("checksum".to_string()), + StructType::try_new([StructField::not_null("id", DataType::LONG)])?, + Vec::new(), + 123, + HashMap::from([("delta.appendOnly".to_string(), "true".to_string())]), + ) + } + + #[test] + fn version_checksum_json_roundtrip() -> DeltaResult<()> { + let checksum = VersionChecksum { + txn_id: Some("txn-123".to_string()), + table_size_bytes: 42, + num_files: 3, + num_metadata: 1, + num_protocol: 1, + in_commit_timestamp_opt: Some(456), + set_transactions: None, + domain_metadata: None, + metadata: test_metadata()?, + protocol: Protocol::new( + 3, + 7, + Some(vec![TableFeature::ColumnMapping]), + Some(vec![TableFeature::AppendOnly, TableFeature::ColumnMapping]), + ), + 
file_size_histogram: None, + all_files: None, + }; + + let json = serde_json::to_string(&checksum)?; + let decoded: VersionChecksum = serde_json::from_str(&json)?; + assert_eq!(decoded, checksum); + Ok(()) + } +} diff --git a/crates/sail-delta-lake/src/spec/error.rs b/crates/sail-delta-lake/src/spec/error.rs new file mode 100644 index 0000000000..54eb970673 --- /dev/null +++ b/crates/sail-delta-lake/src/spec/error.rs @@ -0,0 +1,266 @@ +// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt +// https://github.com/delta-io/delta-kernel-rs/blob/f105333a003232d7284f1a8f06cca3b6d6b232a9/LICENSE +// +// Copyright (2020) QP Hou and a number of other contributors. +// Copyright 2023-2024 The Delta Kernel Rust Authors +// Portions Copyright 2025-2026 LakeSail, Inc. +// Ported and modified in 2026 by LakeSail, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use datafusion_common::{Column, DataFusionError, SchemaError}; +use object_store::Error as ObjectStoreError; +use thiserror::Error; + +use crate::spec::protocol::TableFeature; + +pub type DeltaResult = Result; + +/// Conflict during commit due to concurrent changes. +// [Credit]: +#[derive(Error, Debug)] +pub enum CommitConflictError { + #[error("Commit failed: a concurrent transactions added new data.\nHelp: This transaction's query must be rerun to include the new data. 
Also, if you don't care to require this check to pass in the future, the isolation level can be set to Snapshot Isolation.")] + ConcurrentAppend, + + #[error("Commit failed: a concurrent transaction deleted data this operation read.\nHelp: This transaction's query must be rerun to exclude the removed data. Also, if you don't care to require this check to pass in the future, the isolation level can be set to Snapshot Isolation.")] + ConcurrentDeleteRead, + + #[error("Commit failed: a concurrent transaction deleted the same data your transaction deletes.\nHelp: you should retry this write operation. If it was based on data contained in the table, you should rerun the query generating the data.")] + ConcurrentDeleteDelete, + + #[error("Metadata changed since last commit.")] + MetadataChanged, + + #[error("Concurrent transaction failed.")] + ConcurrentTransaction, + + #[error("Protocol changed since last commit: {0}")] + ProtocolChanged(String), + + #[error("Sail Delta Lake does not support writer version {0}")] + UnsupportedWriterVersion(i32), + + #[error("Sail Delta Lake does not support reader version {0}")] + UnsupportedReaderVersion(i32), + + #[error("Snapshot is corrupted: {source}")] + CorruptedState { + source: Box, + }, + + #[error("Error evaluating predicate: {source}")] + Predicate { + source: Box, + }, + + #[error("No metadata found, please make sure table is loaded.")] + NoMetadata, +} + +// [Credit]: +#[derive(Error, Debug)] +pub enum TransactionError { + #[error("Tried committing existing table version: {0}")] + VersionAlreadyExists(i64), + + #[error("Error serializing commit log to json: {json_err}")] + SerializeLogJson { json_err: serde_json::error::Error }, + + #[error("Log storage error: {source}")] + ObjectStore { + #[from] + source: object_store::Error, + }, + + #[error("Failed to commit transaction: {0}")] + CommitConflict(#[from] CommitConflictError), + + #[error("Failed to commit transaction: {0}")] + MaxCommitAttempts(i32), + + #[error( + "The 
transaction includes Remove action with data change but Delta table is append-only" + )] + DeltaTableAppendOnly, + + #[error("Unsupported table features required: {0:?}")] + UnsupportedTableFeatures(Vec), + + #[error("Table features must be specified, please specify: {0:?}")] + TableFeaturesRequired(TableFeature), + + #[error("Transaction failed: {msg}")] + LogStoreError { + msg: String, + #[source] + source: Box, + }, +} + +// [Credit]: +#[derive(Debug, Error)] +pub enum DeltaError { + #[error("No table version found.")] + MissingVersion, + + #[error("Invalid table location: {0}")] + InvalidTableLocation(String), + + #[error("File not found: {0}")] + FileNotFound(String), + + #[error("Missing column: {0}")] + MissingColumn(String), + + #[error("{0}")] + Schema(String), + + #[error("{0}")] + Generic(String), + + #[error(transparent)] + External(#[from] Box), + + #[error(transparent)] + Arrow(#[from] datafusion::arrow::error::ArrowError), + + #[error(transparent)] + Io(#[from] std::io::Error), + + #[error(transparent)] + ObjectStore(#[from] ObjectStoreError), + + #[error(transparent)] + ObjectStorePath(#[from] object_store::path::Error), + + #[error(transparent)] + Parquet(#[from] parquet::errors::ParquetError), + + #[error(transparent)] + InvalidUrl(#[from] url::ParseError), + + #[error("{0}")] + Unsupported(String), + + #[error("{0}")] + InternalError(String), + + #[error("No table metadata found in delta log.")] + MissingMetadata, + + #[error("No protocol found in delta log.")] + MissingProtocol, + + #[error("No table metadata or protocol found in delta log.")] + MissingMetadataAndProtocol, + + #[error("Failed to parse value '{0}' as '{1}'")] + ParseError(String, String), + + #[error(transparent)] + DataFusion(#[from] DataFusionError), + + #[error(transparent)] + Json(#[from] serde_json::Error), + + #[error("Invalid configuration: {0}")] + Config(String), + + #[error("Delta transaction error: {0}")] + Transaction(#[from] TransactionError), +} + +// [Credit]: 
+impl DeltaError { + pub fn generic(msg: impl ToString) -> Self { + Self::Generic(msg.to_string()) + } + + pub fn generic_err( + source: impl Into>, + ) -> Self { + Self::External(source.into()) + } + + pub fn schema(msg: impl ToString) -> Self { + Self::Schema(msg.to_string()) + } + + pub fn invalid_table_location(location: impl ToString) -> Self { + Self::InvalidTableLocation(location.to_string()) + } + + pub fn missing_column(name: impl ToString) -> Self { + Self::MissingColumn(name.to_string()) + } +} + +impl From for DataFusionError { + fn from(err: DeltaError) -> Self { + match err { + DeltaError::DataFusion(inner) => inner, + DeltaError::Io(err) => DataFusionError::IoError(err), + DeltaError::Arrow(err) => DataFusionError::ArrowError(Box::new(err), None), + DeltaError::ObjectStore(err) => DataFusionError::ObjectStore(Box::new(err)), + DeltaError::ObjectStorePath(source) => { + DataFusionError::ObjectStore(Box::new(ObjectStoreError::InvalidPath { source })) + } + DeltaError::Parquet(err) => DataFusionError::ParquetError(Box::new(err)), + DeltaError::Json(err) => DataFusionError::External(Box::new(err)), + DeltaError::Config(msg) => DataFusionError::Configuration(msg), + DeltaError::Transaction(err) => DataFusionError::External(Box::new(err)), + DeltaError::FileNotFound(path) => { + DataFusionError::ObjectStore(Box::new(ObjectStoreError::NotFound { + path, + source: Box::new(std::io::Error::new( + std::io::ErrorKind::NotFound, + "File not found in Delta kernel", + )), + })) + } + DeltaError::MissingColumn(column) => DataFusionError::SchemaError( + Box::new(SchemaError::FieldNotFound { + field: Box::new(Column::from_name(column)), + valid_fields: vec![], + }), + Box::new(None), + ), + DeltaError::InvalidUrl(err) => { + DataFusionError::Configuration(format!("Invalid Delta URL: {err}")) + } + DeltaError::InvalidTableLocation(location) => { + DataFusionError::Configuration(format!("Invalid table location: {location}")) + } + DeltaError::MissingVersion => { + 
DataFusionError::Execution("No table version found.".to_string()) + } + DeltaError::Unsupported(msg) => DataFusionError::NotImplemented(msg), + DeltaError::Generic(msg) | DeltaError::Schema(msg) => DataFusionError::Execution(msg), + DeltaError::External(source) => DataFusionError::External(source), + DeltaError::InternalError(msg) => DataFusionError::Internal(msg), + DeltaError::MissingMetadata => { + DataFusionError::Execution("No table metadata found in delta log.".to_string()) + } + DeltaError::MissingProtocol => { + DataFusionError::Execution("No protocol found in delta log.".to_string()) + } + DeltaError::MissingMetadataAndProtocol => DataFusionError::Execution( + "No table metadata or protocol found in delta log.".to_string(), + ), + DeltaError::ParseError(value, ty) => { + DataFusionError::Execution(format!("Failed to parse value '{value}' as '{ty}'")) + } + } + } +} diff --git a/crates/sail-delta-lake/src/spec/fields.rs b/crates/sail-delta-lake/src/spec/fields.rs new file mode 100644 index 0000000000..b6f7029671 --- /dev/null +++ b/crates/sail-delta-lake/src/spec/fields.rs @@ -0,0 +1,43 @@ +// https://github.com/delta-io/delta-kernel-rs/blob/f105333a003232d7284f1a8f06cca3b6d6b232a9/LICENSE +// +// Copyright 2023-2024 The Delta Kernel Rust Authors +// Portions Copyright 2025-2026 LakeSail, Inc. +// Ported and modified in 2026 by LakeSail, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// [Credit]: +pub const FIELD_NAME_PATH: &str = "path"; +pub const FIELD_NAME_SIZE: &str = "size"; +pub const FIELD_NAME_MODIFICATION_TIME: &str = "modificationTime"; +pub const FIELD_NAME_STATS: &str = "stats"; +pub const FIELD_NAME_STATS_PARSED: &str = "stats_parsed"; +#[expect(dead_code)] +const FIELD_NAME_FILE_CONSTANT_VALUES: &str = "fileConstantValues"; +#[expect(dead_code)] +const FIELD_NAME_PARTITION_VALUES: &str = "partitionValues"; +pub const FIELD_NAME_PARTITION_VALUES_PARSED: &str = "partitionValues_parsed"; +pub const FIELD_NAME_DELETION_VECTOR: &str = "deletionVector"; + +// [Credit]: +pub const STATS_FIELD_NUM_RECORDS: &str = "numRecords"; +pub const STATS_FIELD_MIN_VALUES: &str = "minValues"; +pub const STATS_FIELD_MAX_VALUES: &str = "maxValues"; +pub const STATS_FIELD_NULL_COUNT: &str = "nullCount"; + +// [Credit]: +pub const DV_FIELD_STORAGE_TYPE: &str = "storageType"; +pub const DV_FIELD_PATH_OR_INLINE_DV: &str = "pathOrInlineDv"; +pub const DV_FIELD_SIZE_IN_BYTES: &str = "sizeInBytes"; +pub const DV_FIELD_CARDINALITY: &str = "cardinality"; +pub const DV_FIELD_OFFSET: &str = "offset"; diff --git a/crates/sail-delta-lake/src/spec/log.rs b/crates/sail-delta-lake/src/spec/log.rs new file mode 100644 index 0000000000..398f1d4351 --- /dev/null +++ b/crates/sail-delta-lake/src/spec/log.rs @@ -0,0 +1,90 @@ +// https://github.com/delta-io/delta-kernel-rs/blob/f105333a003232d7284f1a8f06cca3b6d6b232a9/LICENSE +// +// Copyright 2023-2024 The Delta Kernel Rust Authors +// Portions Copyright 2025-2026 LakeSail, Inc. +// Ported and modified in 2026 by LakeSail, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use object_store::path::{Path, DELIMITER}; + +// [Credit]: +pub const DELTA_LOG_DIR: &str = "_delta_log"; + +// [Credit]: +pub const LAST_CHECKPOINT_FILE: &str = "_last_checkpoint"; + +pub fn delta_log_root_path() -> Path { + Path::from(DELTA_LOG_DIR) +} + +pub fn last_checkpoint_path() -> Path { + Path::from(format!("{DELTA_LOG_DIR}/{LAST_CHECKPOINT_FILE}")) +} + +pub fn delta_log_prefix_path(version: i64) -> Path { + Path::from(format!("{DELTA_LOG_DIR}/{version:020}")) +} + +// [Credit]: +pub fn checkpoint_path(version: i64) -> Path { + Path::from(format!("{DELTA_LOG_DIR}/{version:020}.checkpoint.parquet")) +} + +pub fn commit_path(version: i64) -> Path { + Path::from_iter([DELTA_LOG_DIR, &format!("{version:020}.json")]) +} + +pub fn checksum_path(version: i64) -> Path { + Path::from_iter([DELTA_LOG_DIR, &format!("{version:020}.crc")]) +} + +pub fn temp_commit_path(token: &str) -> Path { + Path::from_iter([DELTA_LOG_DIR, &format!("_commit_{token}.json.tmp")]) +} + +pub fn delta_log_file_path(table_root_path: &str, filename: &str) -> Path { + Path::from(format!( + "{}{}{}{}{}", + table_root_path, DELIMITER, DELTA_LOG_DIR, DELIMITER, filename + )) +} + +pub fn parse_version_prefix(filename: &str) -> Option { + let prefix = filename.get(0..20)?; + if !prefix.as_bytes().iter().all(|b| b.is_ascii_digit()) { + return None; + } + prefix.parse::().ok() +} + +pub fn parse_commit_version(filename: &str) -> Option { + if filename.len() != 25 || !filename.ends_with(".json") { + return None; + } + parse_version_prefix(filename) +} + +pub fn 
parse_checksum_version(filename: &str) -> Option { + if filename.len() != 24 || !filename.ends_with(".crc") { + return None; + } + parse_version_prefix(filename) +} + +pub fn parse_checkpoint_version(filename: &str) -> Option { + if !filename.contains(".checkpoint") || !filename.ends_with(".parquet") { + return None; + } + parse_version_prefix(filename) +} diff --git a/crates/sail-delta-lake/src/spec/metadata.rs b/crates/sail-delta-lake/src/spec/metadata.rs new file mode 100644 index 0000000000..5b3a82cb74 --- /dev/null +++ b/crates/sail-delta-lake/src/spec/metadata.rs @@ -0,0 +1,161 @@ +// https://github.com/delta-io/delta-kernel-rs/blob/f105333a003232d7284f1a8f06cca3b6d6b232a9/LICENSE +// +// Copyright 2023-2024 The Delta Kernel Rust Authors +// Portions Copyright 2025-2026 LakeSail, Inc. +// Ported and modified in 2026 by LakeSail, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; + +use datafusion::arrow::datatypes::Schema as ArrowSchema; +use serde::{Deserialize, Serialize}; + +use crate::spec::schema::StructType; +use crate::spec::{DeltaError as DeltaTableError, DeltaResult}; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)] +#[serde(rename_all = "camelCase")] +// [Credit]: +pub struct Format { + provider: String, + options: HashMap, +} + +impl Format { + pub(crate) fn default_parquet() -> Self { + Self { + provider: "parquet".to_string(), + options: HashMap::new(), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +// [Credit]: +pub struct Metadata { + id: String, + name: Option, + description: Option, + format: Format, + schema_string: String, + partition_columns: Vec, + created_time: Option, + configuration: HashMap, +} + +impl Metadata { + // [Credit]: + pub fn try_new( + name: Option, + description: Option, + schema: StructType, + partition_columns: Vec, + created_time: i64, + configuration: HashMap, + ) -> DeltaResult { + Ok(Self { + id: uuid::Uuid::new_v4().to_string(), + name, + description, + format: Format::default_parquet(), + schema_string: serde_json::to_string(&schema)?, + partition_columns, + created_time: Some(created_time), + configuration, + }) + } + + pub fn id(&self) -> &str { + &self.id + } + + pub fn name(&self) -> Option<&str> { + self.name.as_deref() + } + + pub fn description(&self) -> Option<&str> { + self.description.as_deref() + } + + pub fn created_time(&self) -> Option { + self.created_time + } + + pub fn configuration(&self) -> &HashMap { + &self.configuration + } + + /// Parse the schema string into a `StructType` (Delta JSON format). + pub fn parse_schema(&self) -> DeltaResult { + Ok(serde_json::from_str(&self.schema_string)?) + } + + /// Parse the schema string and convert to an Arrow `Schema`. 
+ pub fn parse_schema_arrow(&self) -> DeltaResult { + let struct_type: StructType = serde_json::from_str(&self.schema_string)?; + ArrowSchema::try_from(&struct_type) + .map_err(|e| DeltaTableError::generic(format!("Failed to convert schema: {e}"))) + } + + pub fn partition_columns(&self) -> &Vec { + &self.partition_columns + } + + pub fn with_table_id(self, table_id: String) -> Metadata { + Metadata { + id: table_id, + ..self + } + } + + pub fn with_name(self, name: String) -> Metadata { + Metadata { + name: Some(name), + ..self + } + } + + pub fn with_description(self, description: String) -> Metadata { + Metadata { + description: Some(description), + ..self + } + } + + pub fn with_schema(self, schema: &StructType) -> DeltaResult { + Ok(Metadata { + schema_string: serde_json::to_string(schema)?, + ..self + }) + } + + pub fn add_config_key(self, key: String, value: String) -> Metadata { + let mut configuration = self.configuration; + configuration.insert(key, value); + Metadata { + configuration, + ..self + } + } + + pub fn remove_config_key(self, key: &str) -> Metadata { + let mut configuration = self.configuration; + configuration.remove(key); + Metadata { + configuration, + ..self + } + } +} diff --git a/crates/sail-delta-lake/src/spec/mod.rs b/crates/sail-delta-lake/src/spec/mod.rs new file mode 100644 index 0000000000..8b749fdb4e --- /dev/null +++ b/crates/sail-delta-lake/src/spec/mod.rs @@ -0,0 +1,46 @@ +pub mod action_schema; +pub mod actions; +pub mod checkpoint; +pub mod checksum; +pub mod error; +pub mod fields; +pub mod log; +pub mod metadata; +pub mod operation; +pub mod properties; +pub mod protocol; +pub mod schema; +pub mod statistics; +pub(crate) mod utils; +pub use action_schema::{ + add_struct_type, deletion_vector_struct_type, metadata_struct_type, protocol_struct_type, + remove_struct_type, transaction_struct_type, +}; +pub use actions::{ + Action, Add, CommitInfo, DeletionVectorDescriptor, Remove, RemoveOptions, StorageType, + Transaction, +}; 
+pub use checkpoint::{CheckpointActionRow, LastCheckpointHint}; +pub use checksum::VersionChecksum; +pub use datafusion::arrow::datatypes::SchemaRef; +pub use error::{CommitConflictError, DeltaError, DeltaResult, TransactionError}; +pub use log::{ + checkpoint_path, checksum_path, commit_path, delta_log_file_path, delta_log_prefix_path, + delta_log_root_path, last_checkpoint_path, parse_checkpoint_version, parse_checksum_version, + parse_commit_version, parse_version_prefix, temp_commit_path, DELTA_LOG_DIR, + LAST_CHECKPOINT_FILE, +}; +pub use metadata::{Format, Metadata}; +pub use operation::{DeltaOperation, MergePredicate, SaveMode}; +pub use properties::{ + canonicalize_and_validate_table_properties, route_table_property_key, + DataSkippingNumIndexedCols, IsolationLevel, TableProperties, +}; +pub use protocol::{Protocol, TableFeature}; +pub use schema::{ + ArrayType, ColumnMappingMode, ColumnMetadataKey, ColumnName, DataType, DecimalType, MapType, + MetadataValue, PrimitiveType, Schema, StructField, StructType, +}; +pub(crate) use statistics::stats_schema; +pub use statistics::{ColumnCountStat, ColumnValueStat, StatValue, Stats}; +pub(crate) use utils::{contains_timestampntz, contains_timestampntz_arrow}; diff --git a/crates/sail-delta-lake/src/kernel/operation.rs b/crates/sail-delta-lake/src/spec/operation.rs similarity index 60% rename from crates/sail-delta-lake/src/kernel/operation.rs rename to crates/sail-delta-lake/src/spec/operation.rs index 02d849f05e..dd82b867a6 100644 --- a/crates/sail-delta-lake/src/kernel/operation.rs +++ b/crates/sail-delta-lake/src/spec/operation.rs @@ -1,8 +1,8 @@ // https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt // // Copyright (2020) QP Hou and a number of other contributors. -// Portions Copyright (2025) LakeSail, Inc. -// Modified in 2025 by LakeSail, Inc. +// Portions Copyright 2025-2026 LakeSail, Inc. +// Modified in 2026 by LakeSail, Inc. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,16 +16,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -// [Credit]: use std::collections::HashMap; -use delta_kernel::actions::Protocol; use serde::{Deserialize, Serialize}; use serde_json::Value; -use crate::kernel::models::{CommitInfo, Metadata}; -use crate::kernel::{DeltaResult, DeltaTableError}; +use crate::spec::actions::CommitInfo; +use crate::spec::{DeltaError as DeltaTableError, DeltaResult, Metadata, Protocol}; +// [Credit]: #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "camelCase")] pub struct MergePredicate { @@ -61,15 +60,14 @@ impl std::str::FromStr for SaveMode { } } -#[expect(clippy::large_enum_variant)] #[derive(Serialize, Deserialize, Debug, Clone)] #[serde(rename_all = "camelCase")] pub enum DeltaOperation { Create { mode: SaveMode, location: String, - protocol: Protocol, - metadata: Metadata, + protocol: Box, + metadata: Box, }, Write { mode: SaveMode, @@ -117,31 +115,90 @@ impl DeltaOperation { } } - pub fn operation_parameters(&self) -> DeltaResult> { - if let Some(Some(Some(map))) = serde_json::to_value(self)? - .as_object() - .map(|p| p.values().next().map(|q| q.as_object())) - { - Ok(map - .iter() - .filter(|item| !item.1.is_null()) - .map(|(k, v)| { - // Delta commitInfo.operationParameters expects values to be strings. 
- ( - k.to_owned(), - Value::String(if v.is_string() { - String::from(v.as_str().unwrap_or_default()) - } else { - v.to_string() - }), - ) - }) - .collect()) - } else { - Err(DeltaTableError::generic( - "Operation parameters serialized into unexpected shape", - )) + pub fn operation_parameters_string_map(&self) -> DeltaResult> { + fn insert_json( + map: &mut HashMap, + key: &str, + value: &T, + ) -> DeltaResult<()> { + map.insert( + key.to_string(), + serde_json::to_string(value).map_err(DeltaTableError::generic_err)?, + ); + Ok(()) + } + + fn insert_opt(map: &mut HashMap, key: &str, value: Option) { + if let Some(value) = value { + map.insert(key.to_string(), value.to_string()); + } } + + let mut parameters = HashMap::new(); + match self { + Self::Create { + mode, + location, + protocol, + metadata, + } => { + parameters.insert("mode".to_string(), format!("{mode:?}")); + parameters.insert("location".to_string(), location.clone()); + insert_json(&mut parameters, "protocol", protocol.as_ref())?; + insert_json(&mut parameters, "metadata", metadata.as_ref())?; + } + Self::Write { + mode, + partition_by, + predicate, + } => { + parameters.insert("mode".to_string(), format!("{mode:?}")); + if let Some(partition_by) = partition_by { + insert_json(&mut parameters, "partitionBy", partition_by)?; + } + insert_opt(&mut parameters, "predicate", predicate.clone()); + } + Self::Delete { predicate } => { + insert_opt(&mut parameters, "predicate", predicate.clone()); + } + Self::Merge { + predicate, + merge_predicate, + matched_predicates, + not_matched_predicates, + not_matched_by_source_predicates, + } => { + insert_opt(&mut parameters, "predicate", predicate.clone()); + insert_opt(&mut parameters, "mergePredicate", merge_predicate.clone()); + insert_json(&mut parameters, "matchedPredicates", matched_predicates)?; + insert_json( + &mut parameters, + "notMatchedPredicates", + not_matched_predicates, + )?; + insert_json( + &mut parameters, + "notMatchedBySourcePredicates", + 
not_matched_by_source_predicates, + )?; + } + Self::FileSystemCheck {} => {} + Self::Restore { version, datetime } => { + insert_opt(&mut parameters, "version", *version); + insert_opt(&mut parameters, "datetime", *datetime); + } + } + + Ok(parameters) + } + + pub fn operation_parameters(&self) -> DeltaResult> { + self.operation_parameters_string_map().map(|parameters| { + parameters + .into_iter() + .map(|(key, value)| (key, Value::String(value))) + .collect() + }) } pub fn changes_data(&self) -> bool { diff --git a/crates/sail-delta-lake/src/spec/properties.rs b/crates/sail-delta-lake/src/spec/properties.rs new file mode 100644 index 0000000000..34a3b0bf6d --- /dev/null +++ b/crates/sail-delta-lake/src/spec/properties.rs @@ -0,0 +1,529 @@ +// https://github.com/delta-io/delta-kernel-rs/blob/f105333a003232d7284f1a8f06cca3b6d6b232a9/LICENSE +// +// Copyright 2023-2024 The Delta Kernel Rust Authors +// Portions Copyright 2025-2026 LakeSail, Inc. +// Ported and modified in 2026 by LakeSail, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use std::collections::HashMap; +use std::num::NonZeroU64; +use std::str::FromStr; +use std::time::Duration; + +use serde::{Deserialize, Serialize}; + +use crate::spec::schema::{ColumnMappingMode, ColumnName}; +use crate::spec::{DeltaError as DeltaTableError, DeltaResult}; + +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +// [Credit]: +pub enum DataSkippingNumIndexedCols { + AllColumns, + NumColumns(u64), +} + +impl TryFrom<&str> for DataSkippingNumIndexedCols { + type Error = DeltaTableError; + + fn try_from(value: &str) -> Result { + let num: i64 = value.parse().map_err(|_| { + DeltaTableError::generic("couldn't parse DataSkippingNumIndexedCols to an integer") + })?; + match num { + -1 => Ok(Self::AllColumns), + x if x >= 0 => Ok(Self::NumColumns(x as u64)), + _ => Err(DeltaTableError::generic( + "couldn't parse DataSkippingNumIndexedCols to positive integer", + )), + } + } +} + +#[derive(Serialize, Deserialize, Debug, Copy, Clone, PartialEq, Eq, Default)] +// [Credit]: +pub enum IsolationLevel { + #[default] + Serializable, + WriteSerializable, + SnapshotIsolation, +} + +impl AsRef for IsolationLevel { + fn as_ref(&self) -> &str { + match self { + Self::Serializable => "Serializable", + Self::WriteSerializable => "WriteSerializable", + Self::SnapshotIsolation => "SnapshotIsolation", + } + } +} + +impl FromStr for IsolationLevel { + type Err = DeltaTableError; + + fn from_str(s: &str) -> Result { + match s.to_ascii_lowercase().as_str() { + "serializable" => Ok(Self::Serializable), + "writeserializable" | "write_serializable" => Ok(Self::WriteSerializable), + "snapshotisolation" | "snapshot_isolation" => Ok(Self::SnapshotIsolation), + _ => Err(DeltaTableError::generic(format!( + "Invalid string for IsolationLevel: {s}" + ))), + } + } +} + +#[derive(Debug, Clone, Eq, PartialEq, Default)] +// [Credit]: +pub struct TableProperties { + pub append_only: Option, + pub checkpoint_interval: Option, + pub checkpoint_write_stats_as_json: Option, + pub 
checkpoint_write_stats_as_struct: Option, + pub write_checksum_file_enabled: Option, + pub enable_in_commit_timestamps: Option, + pub in_commit_timestamp_enablement_version: Option, + pub in_commit_timestamp_enablement_timestamp: Option, + pub column_mapping_mode: Option, + pub data_skipping_num_indexed_cols: Option, + pub data_skipping_stats_columns: Option>, + pub deleted_file_retention_duration: Option, + pub isolation_level: Option, + pub log_retention_duration: Option, + pub enable_expired_log_cleanup: Option, + pub unknown_properties: HashMap, +} + +impl From for TableProperties +where + I: IntoIterator, + K: AsRef + Into, + V: AsRef + Into, +{ + // [Credit]: + fn from(unparsed: I) -> Self { + let mut props = TableProperties::default(); + let unparsed = unparsed.into_iter().filter(|(k, v)| { + try_parse_table_property(&mut props, k.as_ref(), v.as_ref()).is_none() + }); + props.unknown_properties = unparsed.map(|(k, v)| (k.into(), v.into())).collect(); + props + } +} + +const DEFAULT_LOG_RETENTION_SECS: u64 = 30 * 24 * 60 * 60; +const DEFAULT_DELETED_FILE_RETENTION_SECS: u64 = 7 * 24 * 60 * 60; +// Sail aligns with Spark/Delta's default of checkpointing every 10 committed versions. 
+const DEFAULT_CHECKPOINT_INTERVAL: NonZeroU64 = + NonZeroU64::new(10).expect("non-zero checkpoint interval"); + +impl TableProperties { + pub fn append_only(&self) -> bool { + self.append_only.unwrap_or(false) + } + + pub fn log_retention_duration(&self) -> Duration { + self.log_retention_duration + .unwrap_or(Duration::from_secs(DEFAULT_LOG_RETENTION_SECS)) + } + + pub fn enable_expired_log_cleanup(&self) -> bool { + self.enable_expired_log_cleanup.unwrap_or(true) + } + + pub fn checkpoint_interval(&self) -> NonZeroU64 { + self.checkpoint_interval + .unwrap_or(DEFAULT_CHECKPOINT_INTERVAL) + } + + pub fn write_checksum_file_enabled(&self) -> bool { + self.write_checksum_file_enabled.unwrap_or(true) + } + + pub fn enable_in_commit_timestamps(&self) -> bool { + self.enable_in_commit_timestamps.unwrap_or(false) + } + + pub fn in_commit_timestamp_enablement_version(&self) -> Option { + self.in_commit_timestamp_enablement_version + } + + pub fn in_commit_timestamp_enablement_timestamp(&self) -> Option { + self.in_commit_timestamp_enablement_timestamp + } + + pub fn in_commit_timestamp_enablement(&self) -> Option<(i64, i64)> { + self.in_commit_timestamp_enablement_version + .zip(self.in_commit_timestamp_enablement_timestamp) + } + + pub fn deleted_file_retention_duration(&self) -> Duration { + self.deleted_file_retention_duration + .unwrap_or(Duration::from_secs(DEFAULT_DELETED_FILE_RETENTION_SECS)) + } + + pub fn isolation_level(&self) -> IsolationLevel { + self.isolation_level.unwrap_or_default() + } +} + +pub fn canonicalize_and_validate_table_properties( + properties: I, +) -> DeltaResult> +where + I: IntoIterator, + K: AsRef, + V: AsRef, +{ + let mut canonicalized = HashMap::new(); + for (key, value) in properties { + let key = canonicalize_table_property_key(key.as_ref()).unwrap_or_else(|| key.as_ref()); + validate_table_property(key, value.as_ref())?; + canonicalized.insert(key.to_string(), value.as_ref().to_string()); + } + Ok(canonicalized) +} + +/// Map 
supported property aliases to their canonical Delta table property key. +/// +/// Returns `Some(canonical_key)` for recognized modeled properties and aliases, and `None` +/// for unrecognized keys that should be preserved as-is. +fn canonicalize_table_property_key(key: &str) -> Option<&'static str> { + match key.to_ascii_lowercase().as_str() { + "delta.appendonly" | "append_only" | "appendonly" => Some("delta.appendOnly"), + "delta.checkpointinterval" | "checkpoint_interval" | "checkpointinterval" => { + Some("delta.checkpointInterval") + } + "delta.checkpoint.writestatsasjson" + | "checkpoint_write_stats_as_json" + | "checkpointwritestatsasjson" => Some("delta.checkpoint.writeStatsAsJson"), + "delta.checkpoint.writestatsasstruct" + | "checkpoint_write_stats_as_struct" + | "checkpointwritestatsasstruct" => Some("delta.checkpoint.writeStatsAsStruct"), + "delta.writechecksumfile.enabled" + | "write_checksum_file_enabled" + | "writechecksumfileenabled" => Some("delta.writeChecksumFile.enabled"), + "delta.enableincommittimestamps" + | "enable_in_commit_timestamps" + | "enableincommittimestamps" => Some("delta.enableInCommitTimestamps"), + "delta.incommittimestampenablementversion" + | "in_commit_timestamp_enablement_version" + | "incommittimestampenablementversion" => Some("delta.inCommitTimestampEnablementVersion"), + "delta.incommittimestampenablementtimestamp" + | "in_commit_timestamp_enablement_timestamp" + | "incommittimestampenablementtimestamp" => { + Some("delta.inCommitTimestampEnablementTimestamp") + } + "delta.columnmapping.mode" + | "column_mapping_mode" + | "columnmappingmode" + | "column_mapping" => Some("delta.columnMapping.mode"), + "delta.dataskippingnumindexedcols" + | "data_skipping_num_indexed_cols" + | "dataskippingnumindexedcols" => Some("delta.dataSkippingNumIndexedCols"), + "delta.dataskippingstatscolumns" + | "data_skipping_stats_columns" + | "dataskippingstatscolumns" => Some("delta.dataSkippingStatsColumns"), + 
"delta.deletedfileretentionduration" + | "deleted_file_retention_duration" + | "deletedfileretentionduration" => Some("delta.deletedFileRetentionDuration"), + "delta.isolationlevel" | "isolation_level" | "isolationlevel" => { + Some("delta.isolationLevel") + } + "delta.logretentionduration" | "log_retention_duration" | "logretentionduration" => { + Some("delta.logRetentionDuration") + } + "delta.enableexpiredlogcleanup" + | "enable_expired_log_cleanup" + | "enableexpiredlogcleanup" => Some("delta.enableExpiredLogCleanup"), + _ => None, + } +} + +/// Resolve whether an external option key should be routed into Delta table properties. +/// +/// Known modeled aliases are canonicalized to the exact Delta table property name. Any other key +/// with a `delta.` prefix is treated as a pass-through table property so newer protocol features +/// can still be persisted without first teaching Sail about them. +pub fn route_table_property_key(key: &str) -> Option { + if let Some(canonical) = canonicalize_table_property_key(key) { + return Some(canonical.to_string()); + } + + if key.len() >= 6 && key[..6].eq_ignore_ascii_case("delta.") { + if key.starts_with("delta.") { + return Some(key.to_string()); + } + return Some(format!("delta.{}", &key[6..])); + } + + None +} + +/// Validate modeled Delta table property values while allowing unknown properties through. +/// +/// Known properties are parsed using the same type-specific rules Delta snapshots rely on +/// (boolean, positive integer, interval, column mapping mode, etc.). Unknown properties are +/// accepted so they can still be persisted in `metaData.configuration`. 
+fn validate_table_property(key: &str, value: &str) -> DeltaResult<()> { + match key { + "delta.appendOnly" + | "delta.checkpoint.writeStatsAsJson" + | "delta.checkpoint.writeStatsAsStruct" + | "delta.writeChecksumFile.enabled" + | "delta.enableInCommitTimestamps" + | "delta.enableExpiredLogCleanup" => parse_bool(value).map(|_| ()).ok_or_else(|| { + DeltaTableError::generic(format!("invalid boolean value for {key}: {value}")) + }), + "delta.checkpointInterval" => parse_positive_int(value).map(|_| ()).ok_or_else(|| { + DeltaTableError::generic(format!( + "invalid value for {key}: expected positive integer" + )) + }), + "delta.inCommitTimestampEnablementVersion" => { + parse_non_negative_i64(value).map(|_| ()).ok_or_else(|| { + DeltaTableError::generic(format!( + "invalid value for {key}: expected non-negative integer" + )) + }) + } + "delta.inCommitTimestampEnablementTimestamp" => { + parse_i64(value).map(|_| ()).ok_or_else(|| { + DeltaTableError::generic(format!("invalid value for {key}: expected integer")) + }) + } + "delta.columnMapping.mode" => ColumnMappingMode::try_from(value).map(|_| ()), + "delta.dataSkippingNumIndexedCols" => { + DataSkippingNumIndexedCols::try_from(value).map(|_| ()) + } + "delta.dataSkippingStatsColumns" => ColumnName::parse_column_name_list(value).map(|_| ()), + "delta.deletedFileRetentionDuration" | "delta.logRetentionDuration" => { + parse_interval(value).map(|_| ()).ok_or_else(|| { + DeltaTableError::generic(format!( + "invalid value for {key}: expected Delta interval literal" + )) + }) + } + "delta.isolationLevel" => IsolationLevel::from_str(value).map(|_| ()), + _ => Ok(()), + } +} + +fn try_parse_table_property(props: &mut TableProperties, key: &str, value: &str) -> Option<()> { + // [Credit]: + match key { + "delta.appendOnly" => props.append_only = Some(parse_bool(value)?), + "delta.checkpointInterval" => props.checkpoint_interval = Some(parse_positive_int(value)?), + "delta.checkpoint.writeStatsAsJson" => { + 
props.checkpoint_write_stats_as_json = Some(parse_bool(value)?) + } + "delta.checkpoint.writeStatsAsStruct" => { + props.checkpoint_write_stats_as_struct = Some(parse_bool(value)?) + } + "delta.writeChecksumFile.enabled" => { + props.write_checksum_file_enabled = Some(parse_bool(value)?) + } + "delta.enableInCommitTimestamps" => { + props.enable_in_commit_timestamps = Some(parse_bool(value)?) + } + "delta.inCommitTimestampEnablementVersion" => { + props.in_commit_timestamp_enablement_version = Some(parse_non_negative_i64(value)?) + } + "delta.inCommitTimestampEnablementTimestamp" => { + props.in_commit_timestamp_enablement_timestamp = Some(parse_i64(value)?) + } + "delta.columnMapping.mode" => { + props.column_mapping_mode = ColumnMappingMode::try_from(value).ok() + } + "delta.dataSkippingNumIndexedCols" => { + props.data_skipping_num_indexed_cols = DataSkippingNumIndexedCols::try_from(value).ok() + } + "delta.dataSkippingStatsColumns" => { + props.data_skipping_stats_columns = ColumnName::parse_column_name_list(value).ok() + } + "delta.deletedFileRetentionDuration" => { + props.deleted_file_retention_duration = parse_interval(value) + } + "delta.isolationLevel" => { + props.isolation_level = IsolationLevel::from_str(value).ok(); + } + "delta.logRetentionDuration" => props.log_retention_duration = parse_interval(value), + "delta.enableExpiredLogCleanup" => { + props.enable_expired_log_cleanup = Some(parse_bool(value)?) 
+ } + _ => return None, + } + Some(()) +} + +fn parse_positive_int(s: &str) -> Option { + // [Credit]: + let n: i64 = s.parse().ok()?; + if n <= 0 { + return None; + } + NonZeroU64::new(n as u64) +} + +fn parse_bool(s: &str) -> Option { + // [Credit]: + match s { + "true" => Some(true), + "false" => Some(false), + _ => None, + } +} + +fn parse_i64(s: &str) -> Option { + s.parse().ok() +} + +fn parse_non_negative_i64(s: &str) -> Option { + let value = parse_i64(s)?; + (value >= 0).then_some(value) +} + +fn parse_interval(s: &str) -> Option { + // [Credit]: + const SECONDS_PER_MINUTE: u64 = 60; + const SECONDS_PER_HOUR: u64 = 60 * SECONDS_PER_MINUTE; + const SECONDS_PER_DAY: u64 = 24 * SECONDS_PER_HOUR; + const SECONDS_PER_WEEK: u64 = 7 * SECONDS_PER_DAY; + + let mut it = s.split_whitespace(); + if it.next() != Some("interval") { + return None; + } + let number = it.next()?.parse::().ok()?; + if number < 0 { + return None; + } + let number = number as u64; + match it.next()? { + "nanosecond" | "nanoseconds" => Some(Duration::from_nanos(number)), + "microsecond" | "microseconds" => Some(Duration::from_micros(number)), + "millisecond" | "milliseconds" => Some(Duration::from_millis(number)), + "second" | "seconds" => Some(Duration::from_secs(number)), + "minute" | "minutes" => Some(Duration::from_secs(number * SECONDS_PER_MINUTE)), + "hour" | "hours" => Some(Duration::from_secs(number * SECONDS_PER_HOUR)), + "day" | "days" => Some(Duration::from_secs(number * SECONDS_PER_DAY)), + "week" | "weeks" => Some(Duration::from_secs(number * SECONDS_PER_WEEK)), + _ => None, + } +} + +/// Resolve the effective `DataSkippingNumIndexedCols` for a given table configuration. 
+pub fn resolve_data_skipping_num_indexed_cols( + props: &TableProperties, +) -> DeltaResult { + Ok(props + .data_skipping_num_indexed_cols + .unwrap_or(DataSkippingNumIndexedCols::AllColumns)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_checkpoint_interval_default_is_ten() { + assert_eq!(TableProperties::default().checkpoint_interval().get(), 10); + } + + #[test] + fn test_write_checksum_file_enabled_default_is_true() { + assert!(TableProperties::default().write_checksum_file_enabled()); + } + + #[test] + fn test_canonicalize_table_property_aliases() -> DeltaResult<()> { + let props = canonicalize_and_validate_table_properties([ + ("column_mapping_mode", "name"), + ("checkpoint_interval", "7"), + ("write_checksum_file_enabled", "false"), + ("enable_in_commit_timestamps", "true"), + ("custom.key", "value"), + ])?; + + assert_eq!( + props.get("delta.columnMapping.mode"), + Some(&"name".to_string()) + ); + assert_eq!( + props.get("delta.checkpointInterval"), + Some(&"7".to_string()) + ); + assert_eq!( + props.get("delta.writeChecksumFile.enabled"), + Some(&"false".to_string()) + ); + assert_eq!( + props.get("delta.enableInCommitTimestamps"), + Some(&"true".to_string()) + ); + assert_eq!(props.get("custom.key"), Some(&"value".to_string())); + Ok(()) + } + + #[test] + fn test_invalid_modeled_property_is_rejected() { + let result = + canonicalize_and_validate_table_properties([("delta.checkpointInterval", "0")]); + assert!(result.is_err()); + if let Err(err) = result { + assert!(err + .to_string() + .contains("invalid value for delta.checkpointInterval")); + } + } + + #[test] + fn test_route_table_property_key() { + assert_eq!( + route_table_property_key("column_mapping_mode"), + Some("delta.columnMapping.mode".to_string()) + ); + assert_eq!( + route_table_property_key("enable_in_commit_timestamps"), + Some("delta.enableInCommitTimestamps".to_string()) + ); + assert_eq!( + route_table_property_key("write_checksum_file_enabled"), + 
Some("delta.writeChecksumFile.enabled".to_string()) + ); + assert_eq!( + route_table_property_key("append_only"), + Some("delta.appendOnly".to_string()) + ); + assert_eq!( + route_table_property_key("Delta.featureFlag"), + Some("delta.featureFlag".to_string()) + ); + assert_eq!(route_table_property_key("mergeSchema"), None); + } + + #[test] + fn test_in_commit_timestamp_properties_are_typed() { + let props = TableProperties::from([ + ("delta.enableInCommitTimestamps", "true"), + ("delta.inCommitTimestampEnablementVersion", "3"), + ("delta.inCommitTimestampEnablementTimestamp", "123"), + ]); + + assert!(props.enable_in_commit_timestamps()); + assert_eq!(props.in_commit_timestamp_enablement_version(), Some(3)); + assert_eq!(props.in_commit_timestamp_enablement_timestamp(), Some(123)); + assert_eq!(props.in_commit_timestamp_enablement(), Some((3, 123))); + } +} diff --git a/crates/sail-delta-lake/src/spec/protocol.rs b/crates/sail-delta-lake/src/spec/protocol.rs new file mode 100644 index 0000000000..a1adcb090b --- /dev/null +++ b/crates/sail-delta-lake/src/spec/protocol.rs @@ -0,0 +1,137 @@ +// https://github.com/delta-io/delta-kernel-rs/blob/f105333a003232d7284f1a8f06cca3b6d6b232a9/LICENSE +// +// Copyright 2023-2024 The Delta Kernel Rust Authors +// Portions Copyright 2025-2026 LakeSail, Inc. +// Ported and modified in 2026 by LakeSail, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use serde::{Deserialize, Serialize}; + +use crate::spec::properties::TableProperties; +use crate::spec::{DeltaError as DeltaTableError, DeltaResult}; + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +#[serde(rename_all = "camelCase")] +// [Credit]: +pub enum TableFeature { + AppendOnly, + Invariants, + CheckConstraints, + ChangeDataFeed, + GeneratedColumns, + IdentityColumns, + ColumnMapping, + #[serde(rename = "inCommitTimestamp")] + InCommitTimestamp, + #[serde(rename = "timestampNtz")] + TimestampWithoutTimezone, + #[serde(other)] + Unknown, +} + +impl TableFeature { + pub fn as_str(&self) -> &str { + match self { + Self::AppendOnly => "appendOnly", + Self::Invariants => "invariants", + Self::CheckConstraints => "checkConstraints", + Self::ChangeDataFeed => "changeDataFeed", + Self::GeneratedColumns => "generatedColumns", + Self::IdentityColumns => "identityColumns", + Self::ColumnMapping => "columnMapping", + Self::InCommitTimestamp => "inCommitTimestamp", + Self::TimestampWithoutTimezone => "timestampNtz", + Self::Unknown => "unknown", + } + } + + pub fn parse_str_name(value: &str) -> DeltaResult { + match value { + "appendOnly" => Ok(Self::AppendOnly), + "invariants" => Ok(Self::Invariants), + "checkConstraints" => Ok(Self::CheckConstraints), + "changeDataFeed" => Ok(Self::ChangeDataFeed), + "generatedColumns" => Ok(Self::GeneratedColumns), + "identityColumns" => Ok(Self::IdentityColumns), + "columnMapping" => Ok(Self::ColumnMapping), + "inCommitTimestamp" => Ok(Self::InCommitTimestamp), + "timestampNtz" => Ok(Self::TimestampWithoutTimezone), + _ => Err(DeltaTableError::generic(format!( + "Unknown table feature: {value}" + ))), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +// [Credit]: +pub struct Protocol { + min_reader_version: i32, + min_writer_version: i32, + #[serde(skip_serializing_if = "Option::is_none")] + reader_features: Option>, + 
#[serde(skip_serializing_if = "Option::is_none")] + writer_features: Option>, +} + +impl Protocol { + pub fn new( + min_reader_version: i32, + min_writer_version: i32, + reader_features: Option>, + writer_features: Option>, + ) -> Self { + Self { + min_reader_version, + min_writer_version, + reader_features: reader_features.filter(|features| !features.is_empty()), + writer_features: writer_features.filter(|features| !features.is_empty()), + } + } + + pub fn min_reader_version(&self) -> i32 { + self.min_reader_version + } + + pub fn min_writer_version(&self) -> i32 { + self.min_writer_version + } + + pub fn reader_features(&self) -> Option<&[TableFeature]> { + self.reader_features.as_deref() + } + + pub fn writer_features(&self) -> Option<&[TableFeature]> { + self.writer_features.as_deref() + } + + pub fn has_reader_feature(&self, feature: &TableFeature) -> bool { + self.reader_features() + .is_some_and(|features| features.contains(feature)) + } + + pub fn has_writer_feature(&self, feature: &TableFeature) -> bool { + self.writer_features() + .is_some_and(|features| features.contains(feature)) + } + + pub fn supports_in_commit_timestamps(&self) -> bool { + self.min_writer_version() >= 7 && self.has_writer_feature(&TableFeature::InCommitTimestamp) + } + + pub fn is_in_commit_timestamps_enabled(&self, table_properties: &TableProperties) -> bool { + self.supports_in_commit_timestamps() && table_properties.enable_in_commit_timestamps() + } +} diff --git a/crates/sail-delta-lake/src/spec/schema.rs b/crates/sail-delta-lake/src/spec/schema.rs new file mode 100644 index 0000000000..04693d1c23 --- /dev/null +++ b/crates/sail-delta-lake/src/spec/schema.rs @@ -0,0 +1,841 @@ +// https://github.com/delta-io/delta-kernel-rs/blob/f105333a003232d7284f1a8f06cca3b6d6b232a9/LICENSE +// +// Copyright 2023-2024 The Delta Kernel Rust Authors +// Portions Copyright 2025-2026 LakeSail, Inc. +// Ported and modified in 2026 by LakeSail, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; +use std::fmt::{Debug, Display, Formatter}; +use std::iter::{DoubleEndedIterator, FusedIterator}; +use std::ops::Deref; +use std::str::FromStr; + +use indexmap::IndexMap; +use serde::{Deserialize, Serialize}; + +use crate::spec::{DeltaError as DeltaTableError, DeltaResult}; + +pub type Schema = StructType; + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] +#[serde(untagged)] +// [Credit]: +pub enum MetadataValue { + Number(i64), + String(String), + Boolean(bool), + Other(serde_json::Value), +} + +impl Display for MetadataValue { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Number(n) => write!(f, "{n}"), + Self::String(s) => write!(f, "{s}"), + Self::Boolean(b) => write!(f, "{b}"), + Self::Other(v) => write!(f, "{v}"), + } + } +} + +impl From for MetadataValue { + fn from(value: String) -> Self { + Self::String(value) + } +} + +impl From<&String> for MetadataValue { + fn from(value: &String) -> Self { + Self::String(value.clone()) + } +} + +impl From<&str> for MetadataValue { + fn from(value: &str) -> Self { + Self::String(value.to_string()) + } +} + +impl From for MetadataValue { + fn from(value: i64) -> Self { + Self::Number(value) + } +} + +impl From for MetadataValue { + fn from(value: bool) -> Self { + Self::Boolean(value) + } +} + +#[derive(Debug)] +// [Credit]: +pub enum ColumnMetadataKey { + ColumnMappingId, + 
ColumnMappingPhysicalName, + ParquetFieldId, + GenerationExpression, + IdentityStart, + IdentityStep, + IdentityHighWaterMark, + IdentityAllowExplicitInsert, + InternalColumn, + Invariants, + MetadataSpec, +} + +impl AsRef for ColumnMetadataKey { + fn as_ref(&self) -> &str { + match self { + Self::ColumnMappingId => "delta.columnMapping.id", + Self::ColumnMappingPhysicalName => "delta.columnMapping.physicalName", + Self::ParquetFieldId => "parquet.field.id", + Self::GenerationExpression => "delta.generationExpression", + Self::IdentityAllowExplicitInsert => "delta.identity.allowExplicitInsert", + Self::IdentityHighWaterMark => "delta.identity.highWaterMark", + Self::IdentityStart => "delta.identity.start", + Self::IdentityStep => "delta.identity.step", + Self::InternalColumn => "delta.isInternalColumn", + Self::Invariants => "delta.invariants", + Self::MetadataSpec => "delta.metadataSpec", + } + } +} + +#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Hash)] +#[serde(rename_all = "camelCase")] +// [Credit]: +pub enum ColumnMappingMode { + #[default] + None, + Name, + Id, +} + +impl AsRef for ColumnMappingMode { + fn as_ref(&self) -> &str { + match self { + Self::None => "none", + Self::Name => "name", + Self::Id => "id", + } + } +} + +impl TryFrom<&str> for ColumnMappingMode { + type Error = DeltaTableError; + + fn try_from(value: &str) -> Result { + match value.to_ascii_lowercase().as_str() { + "none" => Ok(Self::None), + "name" => Ok(Self::Name), + "id" => Ok(Self::Id), + other => Err(DeltaTableError::generic(format!( + "Invalid column mapping mode: {other}" + ))), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash, PartialOrd, Ord, Default)] +// [Credit]: +pub struct ColumnName { + path: Vec, +} + +impl ColumnName { + pub fn new(iter: impl IntoIterator) -> Self + where + Self: FromIterator, + { + iter.into_iter().collect() + } + + pub fn path(&self) -> &[String] { + &self.path + } + + pub fn 
into_inner(self) -> Vec { + self.path + } + + pub fn parse_column_name_list(names: impl AsRef) -> DeltaResult> { + let mut result = Vec::new(); + let raw = names.as_ref().trim(); + if raw.is_empty() { + return Ok(result); + } + + for column in raw.split(',') { + let column = column.trim(); + if column.is_empty() { + continue; + } + let path = column + .split('.') + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(|segment| { + segment + .trim_matches('`') + .replace("``", "`") + .trim() + .to_string() + }) + .collect::>(); + if path.is_empty() { + return Err(DeltaTableError::generic(format!( + "invalid column name list: {raw}" + ))); + } + result.push(Self { path }); + } + Ok(result) + } +} + +impl> FromIterator for ColumnName { + fn from_iter>(iter: T) -> Self { + let path = iter.into_iter().map(Into::into).collect(); + Self { path } + } +} + +impl FromIterator for ColumnName { + fn from_iter>(iter: T) -> Self { + let path = iter.into_iter().flat_map(|c| c.into_iter()).collect(); + Self { path } + } +} + +impl IntoIterator for ColumnName { + type Item = String; + type IntoIter = std::vec::IntoIter; + + fn into_iter(self) -> Self::IntoIter { + self.path.into_iter() + } +} + +impl AsRef<[String]> for ColumnName { + fn as_ref(&self) -> &[String] { + &self.path + } +} + +impl Deref for ColumnName { + type Target = [String]; + + fn deref(&self) -> &Self::Target { + &self.path + } +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq)] +// [Credit]: +pub struct DecimalType { + precision: u8, + scale: u8, +} + +impl DecimalType { + pub fn try_new(precision: u8, scale: u8) -> DeltaResult { + if !(1..=38).contains(&precision) { + return Err(DeltaTableError::generic(format!( + "precision must be in range 1..=38, found {precision}" + ))); + } + if scale > precision { + return Err(DeltaTableError::generic(format!( + "scale must be <= precision ({precision}), found {scale}" + ))); + } + Ok(Self { precision, scale }) + } + + pub fn precision(&self) -> u8 { + self.precision + } + 
+ pub fn scale(&self) -> u8 { + self.scale + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] +#[serde(rename_all = "camelCase")] +// [Credit]: +pub enum PrimitiveType { + String, + Long, + Integer, + Short, + Byte, + Float, + Double, + Boolean, + Binary, + Date, + Timestamp, + #[serde(rename = "timestamp_ntz")] + TimestampNtz, + #[serde( + serialize_with = "serialize_decimal", + deserialize_with = "deserialize_decimal", + untagged + )] + Decimal(DecimalType), +} + +impl PrimitiveType { + pub fn decimal(precision: u8, scale: u8) -> DeltaResult { + Ok(Self::Decimal(DecimalType::try_new(precision, scale)?)) + } + + pub fn data_type(&self) -> DataType { + DataType::Primitive(self.clone()) + } +} + +fn serialize_decimal( + dtype: &DecimalType, + serializer: S, +) -> Result { + serializer.serialize_str(&format!("decimal({},{})", dtype.precision(), dtype.scale())) +} + +fn deserialize_decimal<'de, D>(deserializer: D) -> Result +where + D: serde::Deserializer<'de>, +{ + let value = std::string::String::deserialize(deserializer)?; + if !(value.starts_with("decimal(") && value.ends_with(')')) { + return Err(serde::de::Error::custom(format!( + "Invalid decimal: {value}" + ))); + } + let mut parts = value[8..value.len() - 1].split(','); + let precision = parts + .next() + .and_then(|v| v.trim().parse::().ok()) + .ok_or_else(|| { + serde::de::Error::custom(format!("Invalid precision in decimal: {value}")) + })?; + let scale = parts + .next() + .and_then(|v| v.trim().parse::().ok()) + .ok_or_else(|| serde::de::Error::custom(format!("Invalid scale in decimal: {value}")))?; + DecimalType::try_new(precision, scale).map_err(serde::de::Error::custom) +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] +#[serde(rename_all = "camelCase")] +// [Credit]: +pub struct StructField { + pub name: std::string::String, + #[serde(rename = "type")] + pub data_type: DataType, + pub nullable: bool, + pub metadata: HashMap, +} + +impl StructField { + pub fn 
new( + name: impl Into, + data_type: impl Into, + nullable: bool, + ) -> Self { + Self { + name: name.into(), + data_type: data_type.into(), + nullable, + metadata: HashMap::default(), + } + } + + pub fn nullable(name: impl Into, data_type: impl Into) -> Self { + Self::new(name, data_type, true) + } + + pub fn not_null(name: impl Into, data_type: impl Into) -> Self { + Self::new(name, data_type, false) + } + + pub fn with_metadata( + mut self, + metadata: impl IntoIterator, impl Into)>, + ) -> Self { + self.metadata = metadata + .into_iter() + .map(|(k, v)| (k.into(), v.into())) + .collect(); + self + } + + pub fn add_metadata( + mut self, + metadata: impl IntoIterator, impl Into)>, + ) -> Self { + self.metadata + .extend(metadata.into_iter().map(|(k, v)| (k.into(), v.into()))); + self + } + + pub fn get_config_value(&self, key: &ColumnMetadataKey) -> Option<&MetadataValue> { + self.metadata.get(key.as_ref()) + } + + pub fn physical_name(&self, column_mapping_mode: ColumnMappingMode) -> &str { + match column_mapping_mode { + ColumnMappingMode::None => &self.name, + ColumnMappingMode::Id | ColumnMappingMode::Name => self + .metadata + .get(ColumnMetadataKey::ColumnMappingPhysicalName.as_ref()) + .and_then(|v| match v { + MetadataValue::String(v) => Some(v.as_str()), + _ => None, + }) + .unwrap_or(&self.name), + } + } + + pub fn with_name(&self, new_name: impl Into) -> Self { + Self { + name: new_name.into(), + data_type: self.data_type.clone(), + nullable: self.nullable, + metadata: self.metadata.clone(), + } + } + + #[inline] + pub fn name(&self) -> &std::string::String { + &self.name + } + + #[inline] + pub fn is_nullable(&self) -> bool { + self.nullable + } + + #[inline] + pub const fn data_type(&self) -> &DataType { + &self.data_type + } + + #[inline] + pub const fn metadata(&self) -> &HashMap { + &self.metadata + } + + pub fn make_physical(&self, column_mapping_mode: ColumnMappingMode) -> Self { + make_physical_field(self, column_mapping_mode) + } +} + +impl 
Display for StructField { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{}: {} (is nullable: {})", + self.name, self.data_type, self.nullable + ) + } +} + +fn make_physical_field(field: &StructField, column_mapping_mode: ColumnMappingMode) -> StructField { + let data_type = match &field.data_type { + DataType::Struct(inner) => DataType::from(inner.make_physical(column_mapping_mode)), + other => other.clone(), + }; + + let mut metadata = field.metadata().clone(); + let physical_name_key = ColumnMetadataKey::ColumnMappingPhysicalName.as_ref(); + let field_id_key = ColumnMetadataKey::ColumnMappingId.as_ref(); + let parquet_field_id_key = ColumnMetadataKey::ParquetFieldId.as_ref(); + + match column_mapping_mode { + ColumnMappingMode::Id => { + if let Some(MetadataValue::Number(fid)) = metadata.get(field_id_key) { + let fid = *fid; + metadata.insert(parquet_field_id_key.to_string(), MetadataValue::Number(fid)); + } + } + ColumnMappingMode::Name => { + metadata.remove(field_id_key); + metadata.remove(parquet_field_id_key); + } + ColumnMappingMode::None => { + metadata.remove(physical_name_key); + metadata.remove(field_id_key); + metadata.remove(parquet_field_id_key); + } + } + + let name = match column_mapping_mode { + ColumnMappingMode::None => field.name().to_owned(), + ColumnMappingMode::Id | ColumnMappingMode::Name => { + field.physical_name(column_mapping_mode).to_owned() + } + }; + + StructField { + name, + data_type, + nullable: field.nullable, + metadata, + } +} + +#[derive(Debug, PartialEq, Clone, Eq)] +// [Credit]: +pub struct StructType { + type_name: std::string::String, + fields: IndexMap, +} + +impl StructType { + pub fn try_new(fields: impl IntoIterator) -> DeltaResult { + let mut field_map = IndexMap::new(); + for field in fields { + if let Some(dup) = field_map.insert(field.name.clone(), field) { + return Err(DeltaTableError::schema(format!( + "Duplicate field name: {}", + dup.name + ))); + } + } + Ok(Self { + type_name: 
"struct".into(), + fields: field_map, + }) + } + + pub fn try_from_results>( + fields: impl IntoIterator>, + ) -> DeltaResult { + let fields = fields + .into_iter() + .map(|f| f.map_err(Into::into)) + .collect::, _>>()?; + Self::try_new(fields) + } + + pub fn new_unchecked(fields: impl IntoIterator) -> Self { + let mut field_map = IndexMap::new(); + for field in fields { + field_map.insert(field.name.clone(), field); + } + Self { + type_name: "struct".into(), + fields: field_map, + } + } + + pub fn field(&self, name: impl AsRef) -> Option<&StructField> { + self.fields.get(name.as_ref()) + } + + pub fn fields( + &self, + ) -> impl ExactSizeIterator + DoubleEndedIterator + FusedIterator { + self.fields.values() + } + + pub fn into_fields( + self, + ) -> impl ExactSizeIterator + DoubleEndedIterator + FusedIterator { + self.fields.into_values() + } + + pub fn make_physical(&self, column_mapping_mode: ColumnMappingMode) -> Self { + let fields = self + .fields() + .map(|field| field.make_physical(column_mapping_mode)); + Self::new_unchecked(fields) + } +} + +#[derive(Debug, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +struct StructTypeSerDeHelper { + #[serde(rename = "type")] + type_name: std::string::String, + fields: Vec, +} + +impl Serialize for StructType { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + StructTypeSerDeHelper { + type_name: self.type_name.clone(), + fields: self.fields.values().cloned().collect(), + } + .serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for StructType { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let helper = StructTypeSerDeHelper::deserialize(deserializer)?; + StructType::try_new(helper.fields).map_err(serde::de::Error::custom) + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] +#[serde(rename_all = "camelCase")] +// [Credit]: +pub struct ArrayType { + #[serde(rename = "type")] + pub type_name: 
std::string::String, + pub element_type: DataType, + pub contains_null: bool, +} + +impl ArrayType { + pub fn new(element_type: DataType, contains_null: bool) -> Self { + Self { + type_name: "array".into(), + element_type, + contains_null, + } + } + + #[inline] + pub const fn element_type(&self) -> &DataType { + &self.element_type + } + + #[inline] + pub const fn contains_null(&self) -> bool { + self.contains_null + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] +#[serde(rename_all = "camelCase")] +// [Credit]: +pub struct MapType { + #[serde(rename = "type")] + pub type_name: std::string::String, + pub key_type: DataType, + pub value_type: DataType, + #[serde(default = "default_true")] + pub value_contains_null: bool, +} + +impl MapType { + pub fn new( + key_type: impl Into, + value_type: impl Into, + value_contains_null: bool, + ) -> Self { + Self { + type_name: "map".into(), + key_type: key_type.into(), + value_type: value_type.into(), + value_contains_null, + } + } + + #[inline] + pub const fn key_type(&self) -> &DataType { + &self.key_type + } + + #[inline] + pub const fn value_type(&self) -> &DataType { + &self.value_type + } + + #[inline] + pub const fn value_contains_null(&self) -> bool { + self.value_contains_null + } +} + +fn default_true() -> bool { + true +} + +fn serialize_variant( + _: &StructType, + serializer: S, +) -> Result { + serializer.serialize_str("variant") +} + +fn deserialize_variant<'de, D>(deserializer: D) -> Result, D::Error> +where + D: serde::Deserializer<'de>, +{ + let value = std::string::String::deserialize(deserializer)?; + if value != "variant" { + return Err(serde::de::Error::custom(format!( + "Invalid variant: {value}" + ))); + } + match DataType::unshredded_variant() { + DataType::Variant(st) => Ok(st), + _ => Err(serde::de::Error::custom( + "unable to construct variant schema".to_string(), + )), + } +} + +#[derive(Debug, Serialize, Deserialize, PartialEq, Clone, Eq)] +#[serde(untagged, rename_all = 
"camelCase")] +// [Credit]: +pub enum DataType { + Primitive(PrimitiveType), + Array(Box), + Struct(Box), + Map(Box), + #[serde( + serialize_with = "serialize_variant", + deserialize_with = "deserialize_variant" + )] + Variant(Box), +} + +impl DataType { + pub const STRING: Self = Self::Primitive(PrimitiveType::String); + pub const LONG: Self = Self::Primitive(PrimitiveType::Long); + pub const INTEGER: Self = Self::Primitive(PrimitiveType::Integer); + pub const SHORT: Self = Self::Primitive(PrimitiveType::Short); + pub const BYTE: Self = Self::Primitive(PrimitiveType::Byte); + pub const FLOAT: Self = Self::Primitive(PrimitiveType::Float); + pub const DOUBLE: Self = Self::Primitive(PrimitiveType::Double); + pub const BOOLEAN: Self = Self::Primitive(PrimitiveType::Boolean); + pub const BINARY: Self = Self::Primitive(PrimitiveType::Binary); + pub const DATE: Self = Self::Primitive(PrimitiveType::Date); + pub const TIMESTAMP: Self = Self::Primitive(PrimitiveType::Timestamp); + pub const TIMESTAMP_NTZ: Self = Self::Primitive(PrimitiveType::TimestampNtz); + + pub fn decimal(precision: u8, scale: u8) -> DeltaResult { + Ok(PrimitiveType::decimal(precision, scale)?.into()) + } + + pub fn try_struct_type(fields: impl IntoIterator) -> DeltaResult { + Ok(StructType::try_new(fields)?.into()) + } + + pub fn try_struct_type_from_results>( + fields: impl IntoIterator>, + ) -> DeltaResult { + StructType::try_from_results(fields).map(Self::from) + } + + pub fn struct_type_unchecked(fields: impl IntoIterator) -> Self { + StructType::new_unchecked(fields).into() + } + + pub fn unshredded_variant() -> Self { + Self::Variant(Box::new(StructType::new_unchecked([ + StructField::not_null("metadata", Self::BINARY), + StructField::not_null("value", Self::BINARY), + ]))) + } +} + +impl Display for PrimitiveType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::String => write!(f, "string"), + Self::Long => write!(f, "long"), + Self::Integer => write!(f, 
"integer"), + Self::Short => write!(f, "short"), + Self::Byte => write!(f, "byte"), + Self::Float => write!(f, "float"), + Self::Double => write!(f, "double"), + Self::Boolean => write!(f, "boolean"), + Self::Binary => write!(f, "binary"), + Self::Date => write!(f, "date"), + Self::Timestamp => write!(f, "timestamp"), + Self::TimestampNtz => write!(f, "timestamp_ntz"), + Self::Decimal(dtype) => write!(f, "decimal({},{})", dtype.precision(), dtype.scale()), + } + } +} + +impl Display for DataType { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + match self { + Self::Primitive(p) => write!(f, "{p}"), + Self::Array(a) => write!(f, "array<{}>", a.element_type), + Self::Struct(s) => { + write!(f, "struct<")?; + for (i, field) in s.fields().enumerate() { + if i > 0 { + write!(f, ", ")?; + } + write!(f, "{}: {}", field.name, field.data_type)?; + } + write!(f, ">") + } + Self::Map(m) => write!(f, "map<{}, {}>", m.key_type, m.value_type), + Self::Variant(_) => write!(f, "variant"), + } + } +} + +impl From for PrimitiveType { + fn from(dtype: DecimalType) -> Self { + Self::Decimal(dtype) + } +} + +impl From for DataType { + fn from(dtype: DecimalType) -> Self { + PrimitiveType::from(dtype).into() + } +} + +impl From for DataType { + fn from(ptype: PrimitiveType) -> Self { + Self::Primitive(ptype) + } +} + +impl From for DataType { + fn from(map_type: MapType) -> Self { + Self::Map(Box::new(map_type)) + } +} + +impl From for DataType { + fn from(struct_type: StructType) -> Self { + Self::Struct(Box::new(struct_type)) + } +} + +impl From for DataType { + fn from(array_type: ArrayType) -> Self { + Self::Array(Box::new(array_type)) + } +} + +impl FromStr for ColumnMappingMode { + type Err = DeltaTableError; + + fn from_str(s: &str) -> Result { + Self::try_from(s) + } +} diff --git a/crates/sail-delta-lake/src/spec/statistics.rs b/crates/sail-delta-lake/src/spec/statistics.rs new file mode 100644 index 0000000000..77ce443a2e --- /dev/null +++ 
b/crates/sail-delta-lake/src/spec/statistics.rs @@ -0,0 +1,426 @@ +// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt +// https://github.com/delta-io/delta-kernel-rs/blob/f105333a003232d7284f1a8f06cca3b6d6b232a9/LICENSE +// +// Copyright 2023-2024 The Delta Kernel Rust Authors +// Portions Copyright 2025-2026 LakeSail, Inc. +// Ported and modified in 2026 by LakeSail, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; + +use crate::spec::fields::{ + STATS_FIELD_MAX_VALUES, STATS_FIELD_MIN_VALUES, STATS_FIELD_NULL_COUNT, STATS_FIELD_NUM_RECORDS, +}; +use crate::spec::{ + ColumnName, DataSkippingNumIndexedCols, DataType, PrimitiveType, Schema, StructField, + StructType, TableProperties, +}; + +/// Column statistics stored in `Stats`. 
+#[derive(Serialize, Deserialize, Debug, PartialEq, Eq, Clone)] +#[serde(untagged)] +pub enum StatValue { + Null, + Boolean(bool), + Number(serde_json::Number), + String(String), +} + +impl From for serde_json::Value { + fn from(value: StatValue) -> Self { + match value { + StatValue::Null => serde_json::Value::Null, + StatValue::Boolean(value) => serde_json::Value::Bool(value), + StatValue::Number(value) => serde_json::Value::Number(value), + StatValue::String(value) => serde_json::Value::String(value), + } + } +} + +// [Credit]: +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +#[serde(untagged)] +pub enum ColumnValueStat { + Column(HashMap), + Value(StatValue), +} + +impl ColumnValueStat { + pub fn as_column(&self) -> Option<&HashMap> { + match self { + ColumnValueStat::Column(m) => Some(m), + _ => None, + } + } + + pub fn as_value(&self) -> Option<&StatValue> { + match self { + ColumnValueStat::Value(v) => Some(v), + _ => None, + } + } + + pub fn get_path<'a>(&'a self, path: &[&str]) -> Option<&'a ColumnValueStat> { + let mut current = self; + for part in path { + current = current.as_column()?.get(*part)?; + } + Some(current) + } +} + +/// Column null-count statistics stored in `Stats`. +#[derive(Serialize, Deserialize, Debug, PartialEq, Eq)] +#[serde(untagged)] +pub enum ColumnCountStat { + Column(HashMap), + Value(i64), +} + +impl ColumnCountStat { + pub fn as_column(&self) -> Option<&HashMap> { + match self { + ColumnCountStat::Column(m) => Some(m), + _ => None, + } + } + + pub fn as_value(&self) -> Option { + match self { + ColumnCountStat::Value(v) => Some(*v), + _ => None, + } + } + + pub fn get_path<'a>(&'a self, path: &[&str]) -> Option<&'a ColumnCountStat> { + let mut current = self; + for part in path { + current = current.as_column()?.get(*part)?; + } + Some(current) + } +} + +/// Statistics associated with an Add action. 
+#[derive(Serialize, Deserialize, Debug, Default, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub struct Stats { + pub num_records: i64, + pub min_values: HashMap, + pub max_values: HashMap, + pub null_count: HashMap, +} + +impl Stats { + pub fn from_json_str(value: &str) -> Result { + serde_json::from_str::(value).map(|stats| stats.into_stats()) + } + + pub fn from_json_opt(value: Option<&str>) -> Result, serde_json::error::Error> { + value.map(Self::from_json_str).transpose() + } + + pub fn to_json_string(&self) -> Result { + serde_json::to_string(self) + } + + pub fn min_value(&self, name: &str) -> Option<&StatValue> { + lookup_value_stat(&self.min_values, name) + } + + pub fn max_value(&self, name: &str) -> Option<&StatValue> { + lookup_value_stat(&self.max_values, name) + } + + pub fn null_count_value(&self, name: &str) -> Option { + lookup_count_stat(&self.null_count, name) + } +} + +fn lookup_value_stat<'a>( + map: &'a HashMap, + name: &str, +) -> Option<&'a StatValue> { + if let Some(value) = map.get(name).and_then(ColumnValueStat::as_value) { + return Some(value); + } + let mut parts = name.split('.'); + let first = parts.next()?; + let path: Vec<&str> = parts.collect(); + map.get(first)?.get_path(&path)?.as_value() +} + +fn lookup_count_stat(map: &HashMap, name: &str) -> Option { + if let Some(value) = map.get(name).and_then(ColumnCountStat::as_value) { + return Some(value); + } + let mut parts = name.split('.'); + let first = parts.next()?; + let path: Vec<&str> = parts.collect(); + map.get(first)?.get_path(&path)?.as_value() +} + +#[derive(Serialize, Deserialize, Debug, Default, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +struct PartialStats { + pub num_records: i64, + pub min_values: Option>, + pub max_values: Option>, + pub null_count: Option>, +} + +impl PartialStats { + fn into_stats(self) -> Stats { + let PartialStats { + num_records, + min_values, + max_values, + null_count, + } = self; + Stats { + num_records, + min_values: 
min_values.unwrap_or_default(), + max_values: max_values.unwrap_or_default(), + null_count: null_count.unwrap_or_default(), + } + } +} + +/// Generates the expected schema for file statistics. +// [Credit]: +pub(crate) fn stats_schema( + physical_file_schema: &Schema, + table_properties: &TableProperties, +) -> crate::spec::DeltaResult { + let mut fields = Vec::with_capacity(4); + fields.push(StructField::nullable( + STATS_FIELD_NUM_RECORDS, + DataType::LONG, + )); + + if let Some(base_schema) = base_stats_schema(physical_file_schema, table_properties) { + if let Some(null_count_schema) = null_count_stats_schema(&base_schema) { + fields.push(StructField::nullable( + STATS_FIELD_NULL_COUNT, + null_count_schema, + )); + } + if let Some(min_max_schema) = min_max_stats_schema(&base_schema) { + fields.push(StructField::nullable( + STATS_FIELD_MIN_VALUES, + min_max_schema.clone(), + )); + fields.push(StructField::nullable( + STATS_FIELD_MAX_VALUES, + min_max_schema, + )); + } + } + StructType::try_new(fields) +} + +fn null_count_stats_schema(schema: &StructType) -> Option { + let fields: Vec = schema + .fields() + .filter_map(|field| { + let data_type = match &field.data_type { + DataType::Array(_) | DataType::Map(_) | DataType::Variant(_) => DataType::LONG, + DataType::Struct(inner) => { + if let Some(inner_schema) = null_count_stats_schema(inner) { + DataType::from(inner_schema) + } else { + return None; + } + } + DataType::Primitive(_) => DataType::LONG, + }; + Some(StructField { + name: field.name.clone(), + data_type, + nullable: true, + metadata: Default::default(), + }) + }) + .collect(); + + if fields.is_empty() { + None + } else { + StructType::try_new(fields).ok() + } +} + +fn base_stats_schema(schema: &StructType, props: &TableProperties) -> Option { + let column_names = props.data_skipping_stats_columns.clone(); + let n_columns = if column_names.is_some() { + None + } else { + Some( + props + .data_skipping_num_indexed_cols + 
.unwrap_or(DataSkippingNumIndexedCols::NumColumns(32)), + ) + }; + + let mut added_columns: u64 = 0; + let fields = base_stats_schema_fields( + schema, + &column_names, + &n_columns, + &mut added_columns, + &mut Vec::new(), + ); + + if fields.is_empty() { + None + } else { + StructType::try_new(fields).ok() + } +} + +fn base_stats_schema_fields( + schema: &StructType, + column_names: &Option>, + n_columns: &Option, + added_columns: &mut u64, + path: &mut Vec, +) -> Vec { + let mut result = Vec::new(); + for field in schema.fields() { + if let Some(DataSkippingNumIndexedCols::NumColumns(n_cols)) = n_columns { + if *added_columns >= *n_cols { + break; + } + } + + path.push(field.name.clone()); + let data_type = field.data_type(); + + let should_include = matches!(data_type, DataType::Struct(_)) + || column_names + .as_ref() + .map(|ns| should_include_column(&ColumnName::new(path.as_slice()), ns)) + .unwrap_or(true); + + if !should_include { + path.pop(); + continue; + } + + let new_field = if let DataType::Struct(inner) = data_type { + let inner_fields = + base_stats_schema_fields(inner, column_names, n_columns, added_columns, path); + path.pop(); + if inner_fields.is_empty() { + continue; + } + StructField { + name: field.name.clone(), + data_type: DataType::from(StructType::new_unchecked(inner_fields)), + nullable: true, + metadata: Default::default(), + } + } else { + *added_columns += 1; + path.pop(); + StructField { + name: field.name.clone(), + data_type: data_type.clone(), + nullable: true, + metadata: Default::default(), + } + }; + + result.push(new_field); + } + result +} + +fn min_max_stats_schema(schema: &StructType) -> Option { + let fields: Vec = schema + .fields() + .filter_map(|field| { + let data_type = match &field.data_type { + DataType::Array(_) | DataType::Map(_) | DataType::Variant(_) => return None, + DataType::Struct(inner) => { + let inner_schema = min_max_stats_schema(inner)?; + DataType::from(inner_schema) + } + DataType::Primitive(p) => { + 
if is_skipping_eligible_datatype(p) { + field.data_type.clone() + } else { + return None; + } + } + }; + Some(StructField { + name: field.name.clone(), + data_type, + nullable: field.nullable, + metadata: field.metadata.clone(), + }) + }) + .collect(); + + if fields.is_empty() { + None + } else { + StructType::try_new(fields).ok() + } +} + +fn should_include_column(column_name: &ColumnName, column_names: &[ColumnName]) -> bool { + column_names.iter().any(|name: &ColumnName| { + name.as_ref().starts_with(column_name) || column_name.as_ref().starts_with(name) + }) +} + +fn is_skipping_eligible_datatype(data_type: &PrimitiveType) -> bool { + matches!( + data_type, + &PrimitiveType::Byte + | &PrimitiveType::Short + | &PrimitiveType::Integer + | &PrimitiveType::Long + | &PrimitiveType::Float + | &PrimitiveType::Double + | &PrimitiveType::Date + | &PrimitiveType::Timestamp + | &PrimitiveType::TimestampNtz + | &PrimitiveType::String + | PrimitiveType::Decimal(_) + ) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use super::{lookup_value_stat, ColumnValueStat, StatValue}; + + #[test] + fn test_lookup_value_stat_supports_top_level_keys_containing_dots() { + let stats = HashMap::from([( + "first.name".to_string(), + ColumnValueStat::Value(StatValue::String("alice".to_string())), + )]); + + let value = lookup_value_stat(&stats, "first.name"); + + assert_eq!(value, Some(&StatValue::String("alice".to_string()))); + } +} diff --git a/crates/sail-delta-lake/src/spec/utils.rs b/crates/sail-delta-lake/src/spec/utils.rs new file mode 100644 index 0000000000..730929ccd0 --- /dev/null +++ b/crates/sail-delta-lake/src/spec/utils.rs @@ -0,0 +1,113 @@ +// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt +// +// Copyright (2020) QP Hou and a number of other contributors. +// Portions Copyright 2025-2026 LakeSail, Inc. +// Modified in 2026 by LakeSail, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::str::Utf8Error; + +use percent_encoding::{percent_decode_str, percent_encode, AsciiSet, CONTROLS}; + +use super::schema::{DataType, StructField}; + +// [Credit]: +const INVALID: &AsciiSet = &CONTROLS + .add(b'\\') + .add(b'{') + .add(b'^') + .add(b'}') + .add(b'%') + .add(b'`') + .add(b']') + .add(b'"') + .add(b'>') + .add(b'[') + .add(b'<') + .add(b'#') + .add(b'|') + .add(b'\r') + .add(b'\n') + .add(b'*') + .add(b'?'); + +pub(crate) fn encode_path(path: &str) -> String { + percent_encode(path.as_bytes(), INVALID).to_string() +} + +pub(crate) fn decode_path(path: &str) -> Result { + Ok(percent_decode_str(path).decode_utf8()?.to_string()) +} + +pub(crate) mod serde_path { + use serde::{Deserialize, Deserializer, Serialize, Serializer}; + + use super::{decode_path, encode_path}; + + pub fn deserialize<'de, D>(deserializer: D) -> Result + where + D: Deserializer<'de>, + { + let s = String::deserialize(deserializer)?; + decode_path(&s).map_err(serde::de::Error::custom) + } + + pub fn serialize(value: &str, serializer: S) -> Result + where + S: Serializer, + { + let encoded = encode_path(value); + String::serialize(&encoded, serializer) + } +} + +// [Credit]: +/// Checks if any field (including nested) in the provided iterator is a `timestampNtz`. 
+pub(crate) fn contains_timestampntz<'a>(mut fields: impl Iterator) -> bool { + fn has_timestamp(dtype: &DataType) -> bool { + match dtype { + &DataType::TIMESTAMP_NTZ => true, + DataType::Array(inner) => has_timestamp(inner.element_type()), + DataType::Struct(struct_type) => { + struct_type.fields().any(|f| has_timestamp(f.data_type())) + } + _ => false, + } + } + + fields.any(|field| has_timestamp(field.data_type())) +} + +/// Checks if any field (including nested) in an Arrow schema contains a `timestamp_ntz` type. +/// +/// In Arrow, `TimestampNtz` is represented as `Timestamp(Microsecond, None)` (no timezone). +pub(crate) fn contains_timestampntz_arrow(schema: &datafusion::arrow::datatypes::Schema) -> bool { + fn has_timestamp_ntz(dt: &datafusion::arrow::datatypes::DataType) -> bool { + use datafusion::arrow::datatypes::{DataType as ArrowDataType, TimeUnit}; + match dt { + ArrowDataType::Timestamp(TimeUnit::Microsecond, None) => true, + ArrowDataType::Struct(fields) => { + fields.iter().any(|f| has_timestamp_ntz(f.data_type())) + } + ArrowDataType::List(elem) + | ArrowDataType::LargeList(elem) + | ArrowDataType::FixedSizeList(elem, _) => has_timestamp_ntz(elem.data_type()), + _ => false, + } + } + schema + .fields() + .iter() + .any(|f| has_timestamp_ntz(f.data_type())) +} diff --git a/crates/sail-delta-lake/src/storage/config.rs b/crates/sail-delta-lake/src/storage/config.rs index ebde540b88..f2bd833c8f 100644 --- a/crates/sail-delta-lake/src/storage/config.rs +++ b/crates/sail-delta-lake/src/storage/config.rs @@ -23,7 +23,7 @@ use object_store::prefix::PrefixStore; use object_store::ObjectStore; use url::Url; -use crate::kernel::DeltaResult; +use crate::spec::DeltaResult; // [Credit]: /// Minimal storage configuration used to decorate object stores with table prefixes. 
diff --git a/crates/sail-delta-lake/src/storage/mod.rs b/crates/sail-delta-lake/src/storage/mod.rs index 0ebaae8674..f4dd14d85d 100644 --- a/crates/sail-delta-lake/src/storage/mod.rs +++ b/crates/sail-delta-lake/src/storage/mod.rs @@ -26,24 +26,17 @@ use async_trait::async_trait; use bytes::Bytes; use datafusion::execution::context::TaskContext; use datafusion_common::{DataFusionError, Result as DataFusionResult}; -use delta_kernel::engine::default::executor::tokio::{ - TokioBackgroundExecutor, TokioMultiThreadExecutor, -}; -use delta_kernel::engine::default::DefaultEngine; -use delta_kernel::path::ParsedLogPath; -use delta_kernel::{Engine, Error as KernelError, FileMeta, LogPath}; -use futures::TryStreamExt; use log::{debug, error}; use object_store::path::Path; -use object_store::{Error as ObjectStoreError, ObjectMeta, ObjectStore, PutMode, PutOptions}; +use object_store::{Error as ObjectStoreError, ObjectStore, ObjectStoreExt, PutMode, PutOptions}; use serde_json::Deserializer as JsonDeserializer; -use tokio::runtime::{Handle, RuntimeFlavor}; use url::Url; use uuid::Uuid; -use crate::kernel::models::Action; -use crate::kernel::transaction::TransactionError; -use crate::kernel::{DeltaResult, DeltaTableError}; +use crate::delta_log::latest_version_from_listing; +use crate::spec::{ + commit_path, Action, DeltaError as DeltaTableError, DeltaError, DeltaResult, TransactionError, +}; mod config; @@ -64,12 +57,6 @@ pub fn get_object_store_from_context( .map_err(|e| DataFusionError::External(Box::new(e))) } -const DELTA_LOG_FOLDER: &str = "_delta_log"; -static DELTA_LOG_PATH: LazyLock = LazyLock::new(|| Path::from(DELTA_LOG_FOLDER)); -#[expect(clippy::expect_used)] -static DUMMY_TABLE_ROOT: LazyLock = - LazyLock::new(|| Url::parse("memory:///").expect("memory URI must be valid")); - /// Holder for temporary commit paths or prepared bytes. 
#[derive(Clone)] pub enum CommitOrBytes { @@ -114,24 +101,6 @@ pub fn default_logstore( )) } -/// Extract version from an object store entry in the delta log. -fn extract_version_from_meta(meta: &ObjectMeta) -> Option { - let location = DUMMY_TABLE_ROOT.join(meta.location.as_ref()).ok()?; - let file_meta = FileMeta { - location, - last_modified: meta.last_modified.timestamp_millis(), - size: meta.size, - }; - let log_path = LogPath::try_new(file_meta).ok()?; - let parsed_path: ParsedLogPath = log_path.into(); - i64::try_from(parsed_path.version).ok() -} - -/// Return the `_delta_log` commit URI for the given version. -pub fn commit_uri_from_version(version: i64) -> Path { - Path::from_iter([DELTA_LOG_FOLDER, &format!("{version:020}.json")]) -} - /// Reads a commit and gets list of actions. pub fn get_actions(version: i64, commit_log_bytes: &Bytes) -> Result, DeltaTableError> { debug!("parsing commit with version {version}..."); @@ -186,12 +155,6 @@ pub trait LogStore: Send + Sync { /// Get the root object store (without table prefix). fn root_object_store(&self, operation_id: Option) -> Arc; - /// Obtain the kernel engine for this log store. - fn engine(&self, operation_id: Option) -> Arc { - let store = self.root_object_store(operation_id); - get_engine(store) - } - /// Get configuration representing configured log store. 
fn config(&self) -> &LogStoreConfig; @@ -242,7 +205,7 @@ impl LogStore for DefaultLogStore { CommitOrBytes::LogBytes(log_bytes) => self .object_store(None) .put_opts( - &commit_uri_from_version(version), + &commit_path(version), log_bytes.into(), put_options().clone(), ) @@ -279,7 +242,7 @@ impl LogStore for DefaultLogStore { let latest = latest_version_from_listing(self.object_store(None)).await?; match latest { Some(version) if version >= start => Ok(version), - Some(_) | None => Err(KernelError::MissingVersion.into()), + Some(_) | None => Err(DeltaError::MissingVersion), } } @@ -305,7 +268,7 @@ fn put_options() -> &'static PutOptions { } async fn read_commit_entry(storage: &dyn ObjectStore, version: i64) -> DeltaResult> { - let commit_uri = commit_uri_from_version(version); + let commit_uri = commit_path(version); match storage.get(&commit_uri).await { Ok(res) => { let bytes = res.bytes().await?; @@ -329,38 +292,6 @@ async fn read_commit_entry(storage: &dyn ObjectStore, version: i64) -> DeltaResu } } -async fn latest_version_from_listing(store: Arc) -> DeltaResult> { - let mut stream = store.list(Some(&DELTA_LOG_PATH)); - let mut max_version: Option = None; - while let Some(meta) = stream.try_next().await? 
{ - if let Some(version) = extract_version_from_meta(&meta) { - max_version = Some(max_version.map_or(version, |curr| curr.max(version))); - } - } - Ok(max_version) -} - -fn get_engine(store: Arc) -> Arc { - let handle = Handle::current(); - match handle.runtime_flavor() { - RuntimeFlavor::MultiThread => Arc::new(DefaultEngine::new_with_executor( - store, - Arc::new(TokioMultiThreadExecutor::new(handle)), - )), - RuntimeFlavor::CurrentThread => Arc::new(DefaultEngine::new_with_executor( - store, - Arc::new(TokioBackgroundExecutor::new()), - )), - _ => { - error!("unsupported runtime flavor, using background executor"); - Arc::new(DefaultEngine::new_with_executor( - store, - Arc::new(TokioBackgroundExecutor::new()), - )) - } - } -} - fn to_uri(root: &Url, location: &Path) -> String { if location.as_ref().is_empty() || location.as_ref() == "/" { root.as_ref().to_string() diff --git a/crates/sail-delta-lake/src/table/mod.rs b/crates/sail-delta-lake/src/table/mod.rs index 065fb1a791..4582ecb195 100644 --- a/crates/sail-delta-lake/src/table/mod.rs +++ b/crates/sail-delta-lake/src/table/mod.rs @@ -21,22 +21,22 @@ use std::fmt; use std::sync::Arc; -use chrono::{DateTime, Utc}; +use chrono::{DateTime, NaiveDateTime, TimeZone, Utc}; use datafusion::arrow::datatypes::Schema; use datafusion::catalog::Session; use datafusion::datasource::listing::ListingTableUrl; use datafusion_common::Result; -use delta_kernel::Error as KernelError; use object_store::ObjectStore; -pub use state::DeltaTableState; use url::Url; use crate::datasource::{DeltaScanConfig, DeltaTableProvider}; -use crate::kernel::{DeltaResult, DeltaTableConfig, DeltaTableError}; +use crate::delta_log::resolve_version_timestamp; +pub use crate::kernel::snapshot::DeltaSnapshot; +use crate::kernel::DeltaTableConfig; use crate::logical::table_source::DeltaTableSource; use crate::options::TableDeltaOptions; -use crate::storage::{commit_uri_from_version, default_logstore, LogStoreRef, StorageConfig}; -mod state; +use 
crate::spec::{DeltaError, DeltaError as DeltaTableError, DeltaResult}; +use crate::storage::{default_logstore, LogStoreRef, StorageConfig}; /// In memory representation of a Delta Table /// @@ -47,7 +47,7 @@ mod state; #[derive(Clone)] pub struct DeltaTable { /// The state of the table as of the most recent loaded Delta log entry. - pub state: Option, + pub state: Option>, /// the load options used during load pub config: DeltaTableConfig, /// log store @@ -84,17 +84,15 @@ impl DeltaTable { /// Get the timestamp of a given version commit. pub(crate) async fn get_version_timestamp(&self, version: i64) -> Result { - if let Some(ts) = self - .state - .as_ref() - .and_then(|s| s.version_timestamp(version)) - { - return Ok(ts); - } - - let commit_uri = commit_uri_from_version(version); - let meta = self.log_store.object_store(None).head(&commit_uri).await?; - Ok(meta.last_modified.timestamp_millis()) + let snapshot = self.snapshot()?; + resolve_version_timestamp( + self.log_store.as_ref(), + version, + snapshot.version_timestamp(version), + snapshot.protocol(), + snapshot.metadata(), + ) + .await } /// Updates the DeltaTable to the latest version by incrementally applying newer versions. @@ -104,22 +102,28 @@ impl DeltaTable { max_version: Option, ) -> Result<(), DeltaTableError> { match self.state.as_mut() { - Some(state) => state.update(self.log_store.as_ref(), max_version).await, + Some(state) => { + Arc::make_mut(state) + .update(self.log_store.as_ref(), max_version.map(|v| v as u64)) + .await?; + Ok(()) + } _ => { - let state = DeltaTableState::try_new( + let state = DeltaSnapshot::try_new( self.log_store.as_ref(), self.config.clone(), max_version, + None, ) .await?; - self.state = Some(state); + self.state = Some(Arc::new(state)); Ok(()) } } } /// Returns the currently loaded state snapshot. 
- pub fn snapshot(&self) -> DeltaResult<&DeltaTableState> { + pub fn snapshot(&self) -> DeltaResult<&Arc> { self.state .as_ref() .ok_or_else(|| DeltaTableError::generic("Table has not yet been initialized")) @@ -187,6 +191,23 @@ pub async fn open_table_with_object_store_and_table_config( Ok(table) } +/// Open and load a Delta table with an explicit kernel load config at a fixed version. +pub async fn open_table_with_object_store_and_table_config_at_version( + location: Url, + object_store: Arc, + storage_options: StorageConfig, + table_config: DeltaTableConfig, + version: i64, +) -> DeltaResult { + let log_store = + create_logstore_with_object_store(object_store.clone(), location, storage_options)?; + + let mut table = DeltaTable::new(log_store, table_config); + table.load_version(version).await?; + + Ok(table) +} + pub(crate) async fn create_delta_table_with_object_store( location: Url, object_store: Arc, @@ -228,10 +249,16 @@ pub async fn create_delta_provider( let log_store = create_logstore_with_object_store(object_store, table_url.clone(), storage_config)?; - // Create a new DeltaTable instance but do not load it yet. - let mut deltalake_table = DeltaTable::new(log_store.clone(), Default::default()); + let table_config = if options.metadata_as_data_read { + DeltaTableConfig { + require_files: false, + ..Default::default() + } + } else { + Default::default() + }; + let mut deltalake_table = DeltaTable::new(log_store.clone(), table_config); - // Load the table state according to the provided time travel options. 
load_table_by_options(&mut deltalake_table, &options).await?; let snapshot = deltalake_table.snapshot()?.clone(); @@ -247,9 +274,14 @@ pub async fn create_delta_provider( }, commit_version_column_name: None, commit_timestamp_column_name: None, + delta_log_replay_strategy: options.delta_log_replay_strategy, + delta_log_replay_hash_threshold: options.delta_log_replay_hash_threshold, }; - let table_provider = DeltaTableProvider::try_new(snapshot, log_store, scan_config)?; + let mut table_provider = DeltaTableProvider::try_new(snapshot.clone(), log_store, scan_config)?; + if !options.metadata_as_data_read && !snapshot.adds().is_empty() { + table_provider = table_provider.with_files(snapshot.adds().to_vec()); + } Ok(Arc::new(table_provider)) } @@ -268,7 +300,16 @@ pub async fn create_delta_source( create_logstore_with_object_store(object_store, table_url.clone(), storage_config)?; // Create a new DeltaTable instance but do not load it yet. - let mut deltalake_table = DeltaTable::new(log_store.clone(), Default::default()); + // For metadata-as-data reads, avoid eagerly loading active file metadata on the driver. + let table_config = if options.metadata_as_data_read { + DeltaTableConfig { + require_files: false, + ..Default::default() + } + } else { + Default::default() + }; + let mut deltalake_table = DeltaTable::new(log_store.clone(), table_config); // Load the table state according to the provided time travel options. 
load_table_by_options(&mut deltalake_table, &options).await?; @@ -286,6 +327,8 @@ pub async fn create_delta_source( }, commit_version_column_name: None, commit_timestamp_column_name: None, + delta_log_replay_strategy: options.delta_log_replay_strategy, + delta_log_replay_hash_threshold: options.delta_log_replay_hash_threshold, }; Ok(Arc::new(DeltaTableSource::try_new( @@ -301,15 +344,12 @@ async fn load_table_by_options(table: &mut DeltaTable, options: &TableDeltaOptio if let Some(version) = options.version_as_of { table.load_version(version).await?; } else if let Some(timestamp_str) = &options.timestamp_as_of { - // This logic is adapted from delta-rs `DeltaTable::load_with_datetime` - let datetime = DateTime::parse_from_rfc3339(timestamp_str) - .map_err(|e| DeltaTableError::generic(format!("Invalid timestamp string: {}", e)))? - .with_timezone(&Utc); + let datetime = parse_timestamp_as_of(timestamp_str)?; let target_version = find_version_for_timestamp(table, datetime) .await .map_err(|e| { - if matches!(e, DeltaTableError::Kernel(KernelError::MissingVersion)) { + if matches!(e, DeltaTableError::MissingVersion) { DeltaTableError::generic(format!( "No version of the Delta table exists at or before timestamp {}", timestamp_str @@ -333,14 +373,25 @@ async fn find_version_for_timestamp( datetime: DateTime, ) -> DeltaResult { let log_store = table.log_store(); - let mut max_version = log_store.get_latest_version(0).await?; - let mut min_version = 0; - - // In case the table is not initialized yet (e.g. state is None), - // get_version_timestamp needs some state to work with. Let's load version 0. 
- if table.version().is_none() { - table.load_version(0).await?; + let latest_version = log_store.get_latest_version(0).await?; + if table.version() != Some(latest_version) { + table.load_version(latest_version).await?; } + let snapshot = table.snapshot()?; + let (mut min_version, mut max_version) = + if let Some((enablement_version, enablement_timestamp)) = + snapshot.in_commit_timestamp_enablement() + { + if datetime.timestamp_millis() >= enablement_timestamp { + (enablement_version, latest_version) + } else if enablement_version == 0 { + return Err(DeltaError::MissingVersion); + } else { + (0, enablement_version - 1) + } + } else { + (0, latest_version) + }; let target_ts = datetime.timestamp_millis(); let mut target_version = -1; @@ -362,8 +413,41 @@ async fn find_version_for_timestamp( if target_version == -1 { // If no version was found, it means the provided timestamp is before the first commit. - Err(KernelError::MissingVersion.into()) + Err(DeltaError::MissingVersion) } else { Ok(target_version) } } + +fn parse_timestamp_as_of(timestamp: &str) -> DeltaResult> { + let rfc3339_result = DateTime::parse_from_rfc3339(timestamp); + if let Ok(datetime) = rfc3339_result { + return Ok(datetime.with_timezone(&Utc)); + } + + let mut last_error = rfc3339_result + .err() + .map(|e| format!("RFC3339 parsing error: {e}")); + + for format in [ + "%Y-%m-%d %H:%M:%S%.f", + "%Y-%m-%dT%H:%M:%S%.f", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%dT%H:%M:%S", + ] { + match NaiveDateTime::parse_from_str(timestamp, format) { + Ok(naive) => return Ok(Utc.from_utc_datetime(&naive)), + Err(e) => { + last_error = Some(format!("Failed to parse with format '{format}': {e}")); + } + } + } + + let detail = last_error + .map(|e| format!(" Details: {e}")) + .unwrap_or_default(); + + Err(DeltaTableError::generic(format!( + "Invalid timestamp string: {timestamp}. Supported formats are: RFC3339 (e.g. 
'2024-01-02T03:04:05Z'), '%Y-%m-%d %H:%M:%S%.f', '%Y-%m-%dT%H:%M:%S%.f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S'.{detail}", + ))) +} diff --git a/crates/sail-delta-lake/src/table/state.rs b/crates/sail-delta-lake/src/table/state.rs deleted file mode 100644 index 8bfd1ee93e..0000000000 --- a/crates/sail-delta-lake/src/table/state.rs +++ /dev/null @@ -1,244 +0,0 @@ -// https://github.com/delta-io/delta-rs/blob/5575ad16bf641420404611d65f4ad7626e9acb16/LICENSE.txt -// -// Copyright (2020) QP Hou and a number of other contributors. -// Portions Copyright (2025) LakeSail, Inc. -// Modified in 2025 by LakeSail, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// [Credit]: - -//! The module for delta table state. 
- -use std::ops::{Deref, DerefMut}; -use std::sync::Arc; - -use chrono::Utc; -use delta_kernel::engine::arrow_conversion::TryIntoKernel; -use delta_kernel::expressions::column_expr_ref; -use delta_kernel::schema::{ColumnMetadataKey, StructField}; -use delta_kernel::table_features::ColumnMappingMode; -use delta_kernel::{EvaluationHandler, Expression}; -use futures::TryStreamExt; - -use crate::kernel::arrow::engine_ext::{ExpressionEvaluatorExt, SnapshotExt}; -use crate::kernel::models::{DataType, Remove}; -use crate::kernel::snapshot::EagerSnapshot; -use crate::kernel::{ - DeltaResult, DeltaTableConfig, DeltaTableError, TablePropertiesExt, ARROW_HANDLER, -}; -use crate::storage::LogStore; - -/// State snapshot currently held by the Delta Table instance. -#[derive(Debug, Clone)] -pub struct DeltaTableState { - pub(crate) snapshot: EagerSnapshot, -} - -impl DeltaTableState { - /// Create a new DeltaTableState - pub async fn try_new( - log_store: &dyn LogStore, - config: DeltaTableConfig, - version: Option, - ) -> DeltaResult { - log_store.refresh().await?; - // TODO: pass through predictae - let snapshot = EagerSnapshot::try_new(log_store, config, version).await?; - Ok(Self { snapshot }) - } - - /// Obtain the eagerly materialized snapshot. - pub fn snapshot(&self) -> &EagerSnapshot { - &self.snapshot - } - - /// Full list of tombstones (remove actions) representing files removed from table state). - pub async fn all_tombstones( - &self, - log_store: &dyn LogStore, - ) -> DeltaResult> { - Ok(self - .snapshot - .snapshot() - .tombstones(log_store) - .try_collect::>() - .await? - .into_iter()) - } - - /// List of unexpired tombstones (remove actions) representing files removed from table state. - /// The retention period is set by `deletedFileRetentionDuration` with default value of 1 week. 
- pub async fn unexpired_tombstones( - &self, - log_store: &dyn LogStore, - ) -> DeltaResult> { - let retention_timestamp = Utc::now().timestamp_millis() - - self - .table_properties() - .deleted_file_retention_duration() - .as_millis() as i64; - let tombstones = self.all_tombstones(log_store).await?.collect::>(); - Ok(tombstones - .into_iter() - .filter(move |t| t.deletion_timestamp.unwrap_or(0) > retention_timestamp)) - } - - /// Determine effective column mapping mode: when explicit mode is None but - /// the schema carries column mapping annotations on any top-level field, - /// treat it as Name. - pub fn effective_column_mapping_mode(&self) -> ColumnMappingMode { - let explicit = self - .snapshot() - .snapshot() - .table_configuration() - .column_mapping_mode(); - if matches!(explicit, ColumnMappingMode::None) { - let kschema = self.snapshot().snapshot().schema().clone(); - let has_annotations = kschema.fields().any(|f| { - f.metadata() - .contains_key(ColumnMetadataKey::ColumnMappingPhysicalName.as_ref()) - && f.metadata() - .contains_key(ColumnMetadataKey::ColumnMappingId.as_ref()) - }); - if has_annotations { - return ColumnMappingMode::Name; - } - } - explicit - } - - /// Update the state of the table to the given version. - pub async fn update( - &mut self, - log_store: &dyn LogStore, - version: Option, - ) -> Result<(), DeltaTableError> { - log_store.refresh().await?; - self.snapshot - .update(log_store, version.map(|v| v as u64)) - .await?; - Ok(()) - } - - /// Get an [arrow::record_batch::RecordBatch] containing add action data. - /// - /// # Arguments - /// - /// * `flatten` - whether to flatten the schema. Partition values columns are - /// given the prefix `partition.`, statistics (null_count, min, and max) are - /// given the prefix `null_count.`, `min.`, and `max.`, and tags the - /// prefix `tags.`. Nested field names are concatenated with `.`. 
- /// - /// # Data schema - /// - /// Each row represents a file that is a part of the selected tables state. - /// - /// * `path` (String): relative or absolute to a file. - /// * `size_bytes` (Int64): size of file in bytes. - /// * `modification_time` (Millisecond Timestamp): time the file was created. - /// * `null_count.{col_name}` (Int64): number of null values for column in - /// this file. - /// * `num_records.{col_name}` (Int64): number of records for column in - /// this file. - /// * `min.{col_name}` (matches column type): minimum value of column in file - /// (if available). - /// * `max.{col_name}` (matches column type): maximum value of column in file - /// (if available). - /// * `partition.{partition column name}` (matches column type): value of - /// partition the file corresponds to. - pub fn add_actions_table( - &self, - flatten: bool, - ) -> Result { - let mut expressions = vec![ - column_expr_ref!("path"), - column_expr_ref!("size"), - column_expr_ref!("modificationTime"), - ]; - let mut fields = vec![ - StructField::not_null("path", DataType::STRING), - StructField::not_null("size_bytes", DataType::LONG), - StructField::not_null("modification_time", DataType::LONG), - ]; - - let stats_schema = self.snapshot.snapshot().inner.stats_schema()?; - let num_records_field = stats_schema - .field("numRecords") - .ok_or_else(|| DeltaTableError::schema("numRecords field not found".to_string()))? 
- .with_name("num_records"); - - expressions.push(column_expr_ref!("stats_parsed.numRecords")); - fields.push(num_records_field); - - if let Some(null_count_field) = stats_schema.field("nullCount") { - let null_count_field = null_count_field.with_name("null_count"); - expressions.push(column_expr_ref!("stats_parsed.nullCount")); - fields.push(null_count_field); - } - - if let Some(min_values_field) = stats_schema.field("minValues") { - let min_values_field = min_values_field.with_name("min"); - expressions.push(column_expr_ref!("stats_parsed.minValues")); - fields.push(min_values_field); - } - - if let Some(max_values_field) = stats_schema.field("maxValues") { - let max_values_field = max_values_field.with_name("max"); - expressions.push(column_expr_ref!("stats_parsed.maxValues")); - fields.push(max_values_field); - } - - if let Some(partition_schema) = self.snapshot.snapshot().inner.partitions_schema()? { - fields.push(StructField::nullable( - "partition", - DataType::try_struct_type(partition_schema.fields().cloned())?, - )); - expressions.push(column_expr_ref!("partitionValues_parsed")); - } - - let expression = Expression::Struct(expressions); - let table_schema = DataType::try_struct_type(fields)?; - - let input_schema = self.snapshot.files.schema(); - let input_schema = Arc::new(input_schema.as_ref().try_into_kernel()?); - let actions = self.snapshot.files.clone(); - - let evaluator = ARROW_HANDLER.new_expression_evaluator( - input_schema, - Arc::new(expression), - table_schema, - )?; - let result = evaluator.evaluate_arrow(actions)?; - - if flatten { - Ok(result.normalize(".", None)?) 
- } else { - Ok(result) - } - } -} - -impl Deref for DeltaTableState { - type Target = EagerSnapshot; - - fn deref(&self) -> &Self::Target { - &self.snapshot - } -} - -impl DerefMut for DeltaTableState { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.snapshot - } -} diff --git a/crates/sail-delta-lake/src/table_format.rs b/crates/sail-delta-lake/src/table_format.rs index 8af6244911..238030f4c5 100644 --- a/crates/sail-delta-lake/src/table_format.rs +++ b/crates/sail-delta-lake/src/table_format.rs @@ -17,12 +17,13 @@ use sail_data_source::options::{ use sail_data_source::resolve_listing_urls; use url::Url; -use crate::options::{ColumnMappingModeOption, TableDeltaOptions}; +use crate::options::{DeltaLogReplayStrategyOption, TableDeltaOptions}; use crate::physical_plan::planner::{ plan_delete, plan_merge, DeltaPhysicalPlanner, DeltaTableConfig, PlannerContext, }; +use crate::spec::{canonicalize_and_validate_table_properties, route_table_property_key}; use crate::table::open_table_with_object_store; -use crate::{create_delta_provider, create_delta_source, DeltaTableError, KernelError}; +use crate::{create_delta_provider, create_delta_source, DeltaTableError}; /// Delta Lake implementation of [`TableFormat`]. 
#[derive(Debug)] @@ -86,13 +87,14 @@ impl TableFormat for DeltaTableFormat { ctx: &dyn Session, info: SinkInfo, ) -> Result> { + let path = info.path(); let SinkInfo { input, - path, mode, partition_by, bucket_by, sort_order, + table_properties, options, } = info; @@ -102,8 +104,17 @@ impl TableFormat for DeltaTableFormat { if bucket_by.is_some() { return not_impl_err!("bucketing for Delta format"); } + if partition_by.iter().any(|field| field.transform.is_some()) { + return not_impl_err!("partition transforms for Delta format"); + } + let partition_by = partition_by + .into_iter() + .map(|field| field.column) + .collect::>(); let table_url = Self::parse_table_url(ctx, vec![path]).await?; + let (options, routed_table_properties) = + split_delta_write_options_and_table_properties(options); let delta_options = resolve_delta_write_options(options)?; let object_store = ctx @@ -116,11 +127,29 @@ impl TableFormat for DeltaTableFormat { .await { Ok(table) => Some(table), - Err(DeltaTableError::Kernel(KernelError::InvalidTableLocation(_))) - | Err(DeltaTableError::Kernel(KernelError::FileNotFound(_))) => None, + Err(DeltaTableError::InvalidTableLocation(_)) + | Err(DeltaTableError::FileNotFound(_)) => None, Err(err) => return Err(DataFusionError::External(Box::new(err))), }; let table_exists = table.is_some(); + let mut metadata_configuration = resolve_delta_metadata_configuration(&table_properties) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + + if table_exists { + if !routed_table_properties.is_empty() { + let mut keys: Vec<_> = routed_table_properties.keys().cloned().collect(); + keys.sort(); + log::warn!( + "ignoring write-time Delta table properties for existing table at {table_url}: {}", + keys.join(", ") + ); + } + } else { + let routed_metadata_configuration = + resolve_delta_metadata_configuration(&routed_table_properties) + .map_err(|e| DataFusionError::External(Box::new(e)))?; + metadata_configuration.extend(routed_metadata_configuration); + } match 
mode { PhysicalSinkMode::ErrorIfExists => { @@ -197,6 +226,7 @@ impl TableFormat for DeltaTableFormat { let table_config = DeltaTableConfig::new( table_url, delta_options, + metadata_configuration, partition_columns, table_schema_for_cond, table_exists, @@ -227,7 +257,14 @@ impl TableFormat for DeltaTableFormat { let delta_options = resolve_delta_write_options(options)?; - let delete_config = DeltaTableConfig::new(table_url, delta_options, Vec::new(), None, true); + let delete_config = DeltaTableConfig::new( + table_url, + delta_options, + HashMap::new(), + Vec::new(), + None, + true, + ); let delete_ctx = PlannerContext::new(ctx, delete_config); let delete_exec = plan_delete(&delete_ctx, condition).await?; @@ -244,6 +281,7 @@ impl TableFormat for DeltaTableFormat { let merge_config = DeltaTableConfig::new( table_url, delta_options, + HashMap::new(), info.target.partition_by.clone(), None, true, @@ -271,6 +309,29 @@ fn apply_delta_read_options(from: DeltaReadOptions, to: &mut TableDeltaOptions) if let Some(version_as_of) = from.version_as_of { to.version_as_of = Some(version_as_of) } + if let Some(metadata_as_data_read) = from.metadata_as_data_read { + to.metadata_as_data_read = metadata_as_data_read; + } + if let Some(ref raw) = from.delta_log_replay_strategy { + to.delta_log_replay_strategy = match raw.to_ascii_lowercase().as_str() { + "auto" => DeltaLogReplayStrategyOption::Auto, + "sort" => DeltaLogReplayStrategyOption::Sort, + "hash" => DeltaLogReplayStrategyOption::Hash, + other => { + return plan_err!( + "invalid value for deltaLogReplayStrategy: {other}, expected auto/sort/hash" + ) + } + }; + } + if let Some(threshold) = from.delta_log_replay_hash_threshold { + if threshold == 0 { + return plan_err!( + "invalid value for deltaLogReplayHashThreshold: expected positive integer" + ); + } + to.delta_log_replay_hash_threshold = threshold; + } Ok(()) } @@ -290,12 +351,25 @@ fn apply_delta_write_options(from: DeltaWriteOptions, to: &mut TableDeltaOptions if let 
Some(write_batch_size) = from.write_batch_size { to.write_batch_size = write_batch_size; } - if let Some(column_mapping_mode) = from.column_mapping_mode { - match column_mapping_mode.to_ascii_lowercase().as_str() { - "name" => to.column_mapping_mode = ColumnMappingModeOption::Name, - "id" => to.column_mapping_mode = ColumnMappingModeOption::Id, - _ => to.column_mapping_mode = ColumnMappingModeOption::None, + if let Some(ref raw) = from.delta_log_replay_strategy { + to.delta_log_replay_strategy = match raw.to_ascii_lowercase().as_str() { + "auto" => DeltaLogReplayStrategyOption::Auto, + "sort" => DeltaLogReplayStrategyOption::Sort, + "hash" => DeltaLogReplayStrategyOption::Hash, + other => { + return plan_err!( + "invalid value for deltaLogReplayStrategy: {other}, expected auto/sort/hash" + ) + } + }; + } + if let Some(threshold) = from.delta_log_replay_hash_threshold { + if threshold == 0 { + return plan_err!( + "invalid value for deltaLogReplayHashThreshold: expected positive integer" + ); } + to.delta_log_replay_hash_threshold = threshold; } Ok(()) } @@ -321,3 +395,74 @@ pub fn resolve_delta_write_options( } Ok(delta_options) } + +fn resolve_delta_metadata_configuration( + table_properties: &HashMap, +) -> crate::spec::DeltaResult> { + canonicalize_and_validate_table_properties( + table_properties + .iter() + .map(|(k, v)| (k.as_str(), v.as_str())), + ) +} + +fn split_delta_write_options_and_table_properties( + options: Vec>, +) -> (Vec>, HashMap) { + let mut clean_options = Vec::with_capacity(options.len()); + let mut table_properties = HashMap::new(); + + for layer in options { + let mut clean_layer = HashMap::with_capacity(layer.len()); + for (key, value) in layer { + if let Some(property_key) = route_table_property_key(&key) { + table_properties.insert(property_key, value); + } else { + clean_layer.insert(key, value); + } + } + clean_options.push(clean_layer); + } + + (clean_options, table_properties) +} + +#[cfg(test)] +mod tests { + use super::*; + + 
#[test] + fn test_split_delta_write_options_and_table_properties() { + let options = vec![ + HashMap::from([ + ("mergeSchema".to_string(), "true".to_string()), + ("column_mapping_mode".to_string(), "name".to_string()), + ]), + HashMap::from([ + ("delta.appendOnly".to_string(), "true".to_string()), + ("targetFileSize".to_string(), "10".to_string()), + ]), + ]; + + let (clean_options, table_properties) = + split_delta_write_options_and_table_properties(options); + + assert_eq!(clean_options.len(), 2); + assert_eq!( + clean_options[0], + HashMap::from([("mergeSchema".to_string(), "true".to_string())]) + ); + assert_eq!( + clean_options[1], + HashMap::from([("targetFileSize".to_string(), "10".to_string())]) + ); + assert_eq!( + table_properties.get("delta.columnMapping.mode"), + Some(&"name".to_string()) + ); + assert_eq!( + table_properties.get("delta.appendOnly"), + Some(&"true".to_string()) + ); + } +} diff --git a/crates/sail-execution/Cargo.toml b/crates/sail-execution/Cargo.toml index 9c281d4123..debd5a19c4 100644 --- a/crates/sail-execution/Cargo.toml +++ b/crates/sail-execution/Cargo.toml @@ -9,6 +9,7 @@ workspace = true [dependencies] sail-common = { path = "../sail-common" } sail-common-datafusion = { path = "../sail-common-datafusion" } +sail-catalog = { path = "../sail-catalog" } sail-data-source = { path = "../sail-data-source" } sail-server = { path = "../sail-server" } sail-python-udf = { path = "../sail-python-udf" } diff --git a/crates/sail-execution/proto/sail/plan/physical.proto b/crates/sail-execution/proto/sail/plan/physical.proto index 63ade1f10c..03d6cd1b87 100644 --- a/crates/sail-execution/proto/sail/plan/physical.proto +++ b/crates/sail-execution/proto/sail/plan/physical.proto @@ -34,23 +34,26 @@ message ExtendedPhysicalPlanNode { DeltaDiscoveryExecNode delta_discovery = 19; DeltaRemoveActionsExecNode delta_remove_actions = 20; DeltaLogReplayExecNode delta_log_replay = 21; - AvroExecNode avro = 22; - ConsoleSinkExecNode console_sink = 23; - 
SocketSourceExecNode socket_source = 24; - RateSourceExecNode rate_source = 25; - TextSinkExecNode text_sink = 26; - BinarySourceExecNode binary_source = 27; - StreamCollectorExecNode stream_collector = 28; - StreamLimitExecNode stream_limit = 29; - StreamFilterExecNode stream_filter = 30; - StreamSourceAdapterExecNode stream_source_adapter = 31; - IcebergWriterExecNode iceberg_writer = 32; - IcebergCommitExecNode iceberg_commit = 33; - MergeCardinalityCheckExecNode merge_cardinality_check = 34; - PythonDataSourceExecNode python_data_source = 35; - MonotonicIdExecNode monotonic_id = 36; - PythonDataSourceWriteExecNode python_data_source_write = 37; - PythonDataSourceWriteCommitExecNode python_data_source_write_commit = 38; + DeltaMetadataStatsExecNode delta_metadata_stats = 22; + AvroExecNode avro = 23; + ConsoleSinkExecNode console_sink = 24; + SocketSourceExecNode socket_source = 25; + RateSourceExecNode rate_source = 26; + TextSinkExecNode text_sink = 27; + BinarySourceExecNode binary_source = 28; + StreamCollectorExecNode stream_collector = 29; + StreamLimitExecNode stream_limit = 30; + StreamFilterExecNode stream_filter = 31; + StreamSourceAdapterExecNode stream_source_adapter = 32; + IcebergWriterExecNode iceberg_writer = 33; + IcebergCommitExecNode iceberg_commit = 34; + MergeCardinalityCheckExecNode merge_cardinality_check = 35; + PythonDataSourceExecNode python_data_source = 36; + MonotonicIdExecNode monotonic_id = 37; + PythonDataSourceWriteExecNode python_data_source_write = 38; + PythonDataSourceWriteCommitExecNode python_data_source_write_commit = 39; + CatalogCommandExecNode catalog_command = 40; + BarrierExecNode barrier = 41; } } @@ -440,6 +443,7 @@ message DeltaWriterExecNode { bool table_exists = 6; PhysicalSinkMode sink_mode = 7; optional string operation_override_json = 8; + string metadata_configuration = 9; } message DeltaCommitExecNode { @@ -455,6 +459,14 @@ message DeltaScanByAddsExecNode { bytes input = 1; string table_url = 2; bytes 
table_schema = 3; + optional bytes output_schema = 4; + string scan_config_json = 5; + optional PhysicalProjection projection = 6; + optional uint64 limit = 7; + optional bytes pushdown_filter = 8; + int64 version = 9; + optional bytes statistics = 10; + // TODO: Consider snapshot hints (partition columns, kernel schema) to avoid worker log reads. } message DeltaDiscoveryExecNode { @@ -467,6 +479,11 @@ message DeltaDiscoveryExecNode { bool input_partition_scan = 7; } +message DeltaMetadataStatsExecNode { + bytes input = 1; + bytes stats_schema = 2; +} + message DeltaRemoveActionsExecNode { bytes input = 1; } @@ -565,10 +582,27 @@ enum CompressionTypeVariant { COMPRESSION_TYPE_VARIANT_UNCOMPRESSED = 4; } +enum PartitionTransformKind { + PARTITION_TRANSFORM_KIND_UNSPECIFIED = 0; + PARTITION_TRANSFORM_KIND_IDENTITY = 1; + PARTITION_TRANSFORM_KIND_YEAR = 2; + PARTITION_TRANSFORM_KIND_MONTH = 3; + PARTITION_TRANSFORM_KIND_DAY = 4; + PARTITION_TRANSFORM_KIND_HOUR = 5; + PARTITION_TRANSFORM_KIND_BUCKET = 6; + PARTITION_TRANSFORM_KIND_TRUNCATE = 7; +} + +message CatalogPartitionFieldNode { + string column = 1; + PartitionTransformKind transform_kind = 2; + uint32 transform_value = 3; +} + message IcebergWriterExecNode { bytes input = 1; string table_url = 2; - repeated string partition_columns = 3; + repeated CatalogPartitionFieldNode partition_columns = 3; PhysicalSinkMode sink_mode = 4; bool table_exists = 5; string options = 6; @@ -580,12 +614,16 @@ message IcebergCommitExecNode { } message DeltaLogReplayExecNode { + // Legacy single-input encoding for sort replay mode. bytes input = 1; string table_url = 2; int64 version = 3; repeated string partition_columns = 4; repeated string checkpoint_files = 5; repeated string commit_files = 6; + // Preferred encoding for hash replay mode. 
+ optional bytes checkpoint_input = 7; + optional bytes commits_input = 8; } message PythonDataSourceInputPartition { @@ -621,3 +659,18 @@ message PythonDataSourceWriteCommitExecNode { // Input plan that yields per-partition write results. bytes input = 3; } + +message CatalogCommandExecNode { + // Schema for the command output. + bytes schema = 1; + // JSON-encoded CatalogCommand. + string command = 2; +} + +message BarrierExecNode { + // Precondition plans to be exhausted before the main plan starts. + // Each element is a DataFusion PhysicalPlanNode encoded as bytes. + repeated bytes preconditions = 1; + // The main plan to execute after all preconditions are exhausted. + bytes plan = 2; +} diff --git a/crates/sail-execution/src/codec.rs b/crates/sail-execution/src/codec.rs index 99750ef107..c8bc45036d 100644 --- a/crates/sail-execution/src/codec.rs +++ b/crates/sail-execution/src/codec.rs @@ -7,6 +7,7 @@ use datafusion::arrow::datatypes::{DataType, Schema, TimeUnit}; use datafusion::common::parsers::CompressionTypeVariant; use datafusion::common::{ plan_datafusion_err, plan_err, Constraint, Constraints, JoinSide, Result, ScalarValue, + Statistics, }; use datafusion::datasource::file_format::file_compression_type::FileCompressionType; use datafusion::datasource::memory::MemorySourceConfig; @@ -42,7 +43,9 @@ use datafusion_proto::physical_plan::to_proto::{ serialize_file_scan_config, serialize_partitioning, serialize_physical_expr, serialize_physical_sort_exprs, }; -use datafusion_proto::physical_plan::{AsExecutionPlan, PhysicalExtensionCodec}; +use datafusion_proto::physical_plan::{ + AsExecutionPlan, DefaultPhysicalProtoConverter, PhysicalExtensionCodec, +}; use datafusion_proto::protobuf::{ JoinType as ProtoJoinType, PhysicalPlanNode, PhysicalSortExprNode, }; @@ -71,6 +74,7 @@ use datafusion_spark::function::url::url_encode::UrlEncode; use prost::Message; use sail_catalog_system::physical_plan::SystemTableExec; use 
sail_common_datafusion::array::record_batch::{read_record_batches, write_record_batches}; +use sail_common_datafusion::catalog::{CatalogPartitionField, PartitionTransform}; use sail_common_datafusion::datasource::PhysicalSinkMode; use sail_common_datafusion::system::catalog::SystemTable; use sail_common_datafusion::udf::StreamUDF; @@ -86,8 +90,9 @@ use sail_data_source::formats::text::source::TextSource; use sail_data_source::formats::text::writer::{TextSink, TextWriterOptions}; use sail_delta_lake::physical_plan::{ DeltaCastColumnExpr, DeltaCommitExec, DeltaDiscoveryExec, DeltaLogReplayExec, - DeltaRemoveActionsExec, DeltaScanByAddsExec, DeltaWriterExec, + DeltaMetadataStatsExec, DeltaRemoveActionsExec, DeltaScanByAddsExec, DeltaWriterExec, }; +use sail_delta_lake::spec::DeltaOperation; use sail_function::aggregate::histogram_numeric::HistogramNumericFunction; use sail_function::aggregate::kurtosis::KurtosisFunction; use sail_function::aggregate::max_min_by::{MaxByFunction, MinByFunction}; @@ -111,9 +116,12 @@ use sail_function::scalar::datetime::spark_interval::{ SparkCalendarInterval, SparkDayTimeInterval, SparkYearMonthInterval, }; use sail_function::scalar::datetime::spark_last_day::SparkLastDay; +use sail_function::scalar::datetime::spark_make_time::SparkMakeTime; use sail_function::scalar::datetime::spark_make_timestamp::SparkMakeTimestampNtz; use sail_function::scalar::datetime::spark_make_ym_interval::SparkMakeYmInterval; use sail_function::scalar::datetime::spark_next_day::SparkNextDay; +use sail_function::scalar::datetime::spark_time_diff::SparkTimeDiff; +use sail_function::scalar::datetime::spark_time_trunc::SparkTimeTrunc; use sail_function::scalar::datetime::spark_timestamp::SparkTimestamp; use sail_function::scalar::datetime::spark_to_chrono_fmt::SparkToChronoFmt; use sail_function::scalar::datetime::spark_try_make_timestamp_ntz::SparkTryMakeTimestampNtz; @@ -122,6 +130,9 @@ use 
sail_function::scalar::datetime::spark_unix_timestamp::SparkUnixTimestamp; use sail_function::scalar::datetime::timestamp_now::TimestampNow; use sail_function::scalar::drop_struct_field::DropStructField; use sail_function::scalar::explode::{explode_name_to_kind, Explode}; +use sail_function::scalar::geo::st_asbinary::StAsBinary; +use sail_function::scalar::geo::st_geogfromwkb::StGeogFromWKB; +use sail_function::scalar::geo::st_geomfromwkb::StGeomFromWKB; use sail_function::scalar::hash::spark_murmur3_hash::SparkMurmur3Hash; use sail_function::scalar::hash::spark_xxhash64::SparkXxhash64; use sail_function::scalar::json::SparkToJson; @@ -170,6 +181,8 @@ use sail_iceberg::physical_plan::{IcebergCommitExec, IcebergWriterExec}; use sail_iceberg::TableIcebergOptions; use sail_logical_plan::range::Range; use sail_logical_plan::show_string::{ShowStringFormat, ShowStringStyle}; +use sail_physical_plan::barrier::BarrierExec; +use sail_physical_plan::catalog_command::CatalogCommandExec; use sail_physical_plan::map_partitions::MapPartitionsExec; use sail_physical_plan::merge_cardinality_check::MergeCardinalityCheckExec; use sail_physical_plan::monotonic_id::MonotonicIdExec; @@ -280,12 +293,12 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { requires_infinite_memory: false, } }; - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( eq_properties, partitioning, EmissionType::Both, boundedness, - ); + )); let node = StageInputExec::new(input as usize, properties); Ok(Arc::new(node)) } @@ -375,6 +388,7 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { &proto, ctx, self, + &DefaultPhysicalProtoConverter {}, Arc::new(JsonSource::new(table_schema)), )?; let source = FileScanConfigBuilder::from(source) @@ -389,6 +403,7 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { &proto, ctx, self, + &DefaultPhysicalProtoConverter {}, Arc::new(ArrowSource::new_file_source(table_schema)), )?; 
Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) @@ -417,6 +432,7 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { &proto, ctx, self, + &DefaultPhysicalProtoConverter {}, Arc::new(TextSource::new(table_schema, whole_text, line_sep)), )?; let source = FileScanConfigBuilder::from(source) @@ -434,6 +450,7 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { &proto, ctx, self, + &DefaultPhysicalProtoConverter {}, Arc::new(BinarySource::new(table_schema, path_glob_filter)), )?; let source = FileScanConfigBuilder::from(source).build(); @@ -446,6 +463,7 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { &proto, ctx, self, + &DefaultPhysicalProtoConverter {}, Arc::new(AvroSource::new(table_schema)), )?; Ok(Arc::new(DataSourceExec::new(Arc::new(source)))) @@ -563,6 +581,7 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { table_exists, sink_mode, operation_override_json, + metadata_configuration, }) => { let input = self.try_decode_plan(&input, ctx)?; let sink_schema = self.try_decode_schema(&sink_schema)?; @@ -577,9 +596,14 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { .map_err(|e| plan_datafusion_err!("failed to parse table URL: {e}"))?; let options = serde_json::from_str(&options).map_err(|e| plan_datafusion_err!("{e}"))?; + let metadata_configuration = serde_json::from_str(&metadata_configuration) + .map_err(|e| plan_datafusion_err!("{e}"))?; let operation_override = if let Some(s) = operation_override_json.as_ref() { - Some(serde_json::from_str(s).map_err(|e| plan_datafusion_err!("{e}"))?) 
+ Some( + serde_json::from_str::(s) + .map_err(|e| plan_datafusion_err!("{e}"))?, + ) } else { None }; @@ -587,6 +611,7 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { input, table_url, options, + metadata_configuration, partition_columns, sink_mode, table_exists, @@ -626,16 +651,70 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { input, table_url, table_schema, + output_schema, + scan_config_json, + projection, + limit, + pushdown_filter, + version, + statistics, }) => { let input = self.try_decode_plan(&input, ctx)?; let table_url = Url::parse(&table_url) .map_err(|e| plan_datafusion_err!("failed to parse table URL: {e}"))?; let table_schema = Arc::new(self.try_decode_schema(&table_schema)?); - Ok(Arc::new(DeltaScanByAddsExec::new( - input, - table_url, - table_schema, - ))) + let output_schema = if let Some(schema_bytes) = output_schema { + Arc::new(self.try_decode_schema(&schema_bytes)?) + } else { + Arc::clone(&table_schema) + }; + let scan_config: sail_delta_lake::datasource::DeltaScanConfig = + if scan_config_json.is_empty() { + sail_delta_lake::datasource::DeltaScanConfig::default() + } else { + serde_json::from_str(&scan_config_json).map_err(|e| { + plan_datafusion_err!("failed to decode Delta scan config: {e}") + })? 
+ }; + let projection = projection + .map(|p| self.try_decode_projection(&p.columns)) + .transpose() + .map_err(|_| { + plan_datafusion_err!("invalid projection for DeltaScanByAddsExec") + })?; + let limit = limit + .map(usize::try_from) + .transpose() + .map_err(|_| plan_datafusion_err!("invalid limit for DeltaScanByAddsExec"))?; + let pushdown_filter = if let Some(pred_bytes) = pushdown_filter { + let predicate = parse_physical_expr( + &self.try_decode_message(&pred_bytes)?, + ctx, + &output_schema, + self, + )?; + Some(predicate) + } else { + None + }; + let statistics = statistics + .as_ref() + .map(|bytes| self.try_decode_statistics(bytes)) + .transpose()?; + Ok(Arc::new( + DeltaScanByAddsExec::new( + input, + table_url, + version, + table_schema, + output_schema, + scan_config, + projection, + limit, + pushdown_filter, + ) + .with_output_statistics(statistics), + )) } NodeKind::DeltaDiscovery(gen::DeltaDiscoveryExecNode { table_url, @@ -678,6 +757,14 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { input_partition_scan, )?)) } + NodeKind::DeltaMetadataStats(gen::DeltaMetadataStatsExecNode { + input, + stats_schema, + }) => { + let input = self.try_decode_plan(&input, ctx)?; + let stats_schema = Arc::new(self.try_decode_schema(&stats_schema)?); + Ok(Arc::new(DeltaMetadataStatsExec::new(input, stats_schema))) + } NodeKind::DeltaRemoveActions(gen::DeltaRemoveActionsExecNode { input }) => { let input = self.try_decode_plan(&input, ctx)?; Ok(Arc::new(DeltaRemoveActionsExec::new(input)?)) @@ -689,18 +776,40 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { partition_columns, checkpoint_files, commit_files, + checkpoint_input, + commits_input, }) => { - let input = self.try_decode_plan(&input, ctx)?; let table_url = Url::parse(&table_url) .map_err(|e| plan_datafusion_err!("failed to parse table URL: {e}"))?; - Ok(Arc::new(DeltaLogReplayExec::new( - input, - table_url, - version, - partition_columns, - checkpoint_files, - commit_files, - ))) + match 
(checkpoint_input.as_ref(), commits_input.as_ref()) { + (Some(checkpoint_input), Some(commits_input)) => { + let checkpoint_input = self.try_decode_plan(checkpoint_input, ctx)?; + let commits_input = self.try_decode_plan(commits_input, ctx)?; + Ok(Arc::new(DeltaLogReplayExec::new_hash( + checkpoint_input, + commits_input, + table_url, + version, + partition_columns, + checkpoint_files, + commit_files, + ))) + } + (None, None) => { + let input = self.try_decode_plan(&input, ctx)?; + Ok(Arc::new(DeltaLogReplayExec::new( + input, + table_url, + version, + partition_columns, + checkpoint_files, + commit_files, + ))) + } + _ => plan_err!( + "DeltaLogReplayExec requires both checkpoint_input and commits_input when hash replay is encoded" + ), + } } NodeKind::ConsoleSink(gen::ConsoleSinkExecNode { input }) => { let input = self.try_decode_plan(&input, ctx)?; @@ -790,11 +899,16 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { let sort_order = physical_sort_expr_nodes .as_ref() .map(|physical_sort_expr_nodes| { - parse_physical_sort_exprs(physical_sort_expr_nodes, ctx, &schema, self).map( - |sort_exprs| { - LexRequirement::new(sort_exprs.into_iter().map(Into::into)) - }, + parse_physical_sort_exprs( + physical_sort_expr_nodes, + ctx, + &schema, + self, + &DefaultPhysicalProtoConverter {}, ) + .map(|sort_exprs| { + LexRequirement::new(sort_exprs.into_iter().map(Into::into)) + }) }) .transpose()? 
.flatten(); @@ -872,6 +986,10 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { self.try_decode_physical_sink_mode(sink_mode, &input.schema(), ctx)?; let table_url = Url::parse(&table_url) .map_err(|e| plan_datafusion_err!("failed to parse table URL: {e}"))?; + let partition_columns = partition_columns + .into_iter() + .map(|field| self.try_decode_catalog_partition_field(field)) + .collect::>>()?; let options = if options.is_empty() { TableIcebergOptions::default() } else { @@ -947,6 +1065,23 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { expected_partitions as usize, ))) } + NodeKind::CatalogCommand(gen::CatalogCommandExecNode { schema, command }) => { + let schema = Arc::new(self.try_decode_schema(&schema)?); + let command: sail_catalog::command::CatalogCommand = serde_json::from_str(&command) + .map_err(|e| plan_datafusion_err!("failed to decode CatalogCommand: {e}"))?; + Ok(Arc::new(CatalogCommandExec::new(command, schema))) + } + NodeKind::Barrier(gen::BarrierExecNode { + preconditions, + plan, + }) => { + let preconditions = preconditions + .into_iter() + .map(|i| self.try_decode_plan(&i, ctx)) + .collect::>()?; + let plan = self.try_decode_plan(&plan, ctx)?; + Ok(Arc::new(BarrierExec::new(preconditions, plan))) + } _ => plan_err!("unsupported physical plan node: {node_kind:?}"), } } @@ -1117,8 +1252,11 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { if let Some(file_scan) = source.as_any().downcast_ref::() { let file_source = file_scan.file_source(); if let Some(text_source) = file_source.as_any().downcast_ref::() { - let base_config = - self.try_encode_message(serialize_file_scan_config(file_scan, self)?)?; + let base_config = self.try_encode_message(serialize_file_scan_config( + file_scan, + self, + &DefaultPhysicalProtoConverter {}, + )?)?; let file_compression_type = self.try_encode_file_compression_type(file_scan.file_compression_type)?; NodeKind::Text(gen::TextExecNode { @@ -1130,16 +1268,22 @@ impl PhysicalExtensionCodec 
for RemoteExecutionCodec { } else if let Some(binary_source) = file_source.as_any().downcast_ref::() { - let base_config = - self.try_encode_message(serialize_file_scan_config(file_scan, self)?)?; + let base_config = self.try_encode_message(serialize_file_scan_config( + file_scan, + self, + &DefaultPhysicalProtoConverter {}, + )?)?; NodeKind::BinarySource(gen::BinarySourceExecNode { base_config, path_glob_filter: binary_source.path_glob_filter().cloned(), }) } else if file_source.as_any().is::() { // TODO: Check if we still need to have JsonSource: https://github.com/apache/datafusion/pull/14224 - let base_config = - self.try_encode_message(serialize_file_scan_config(file_scan, self)?)?; + let base_config = self.try_encode_message(serialize_file_scan_config( + file_scan, + self, + &DefaultPhysicalProtoConverter {}, + )?)?; let file_compression_type = self.try_encode_file_compression_type(file_scan.file_compression_type)?; NodeKind::NdJson(gen::NdJsonExecNode { @@ -1148,12 +1292,18 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { }) } else if file_source.as_any().is::() { // TODO: Check if we still need to have ArrowSource: https://github.com/apache/datafusion/pull/14224 - let base_config = - self.try_encode_message(serialize_file_scan_config(file_scan, self)?)?; + let base_config = self.try_encode_message(serialize_file_scan_config( + file_scan, + self, + &DefaultPhysicalProtoConverter {}, + )?)?; NodeKind::Arrow(gen::ArrowExecNode { base_config }) } else if file_source.as_any().is::() { - let base_config = - self.try_encode_message(serialize_file_scan_config(file_scan, self)?)?; + let base_config = self.try_encode_message(serialize_file_scan_config( + file_scan, + self, + &DefaultPhysicalProtoConverter {}, + )?)?; NodeKind::Avro(gen::AvroExecNode { base_config }) } else { return plan_err!("unsupported data source node: {data_source:?}"); @@ -1205,6 +1355,10 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { table_exists: 
delta_writer_exec.table_exists(), sink_mode: Some(sink_mode), operation_override_json, + metadata_configuration: serde_json::to_string( + delta_writer_exec.metadata_configuration(), + ) + .map_err(|e| plan_datafusion_err!("{e}"))?, }) } else if let Some(delta_commit_exec) = node.as_any().downcast_ref::() { let input = self.try_encode_plan(delta_commit_exec.input().clone())?; @@ -1222,10 +1376,41 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { { let input = self.try_encode_plan(delta_scan_by_adds_exec.input().clone())?; let table_schema = self.try_encode_schema(delta_scan_by_adds_exec.table_schema())?; + let output_schema = self.try_encode_schema(delta_scan_by_adds_exec.output_schema())?; + let scan_config_json = serde_json::to_string(delta_scan_by_adds_exec.scan_config()) + .map_err(|e| plan_datafusion_err!("failed to encode Delta scan config: {e}"))?; + let projection = delta_scan_by_adds_exec + .projection() + .map(|p| { + self.try_encode_projection(p) + .map(|columns| gen::PhysicalProjection { columns }) + }) + .transpose() + .map_err(|_| plan_datafusion_err!("invalid projection for DeltaScanByAddsExec"))?; + let limit = delta_scan_by_adds_exec + .limit() + .map(u64::try_from) + .transpose() + .map_err(|_| plan_datafusion_err!("invalid limit for DeltaScanByAddsExec"))?; + let pushdown_filter = if let Some(pred) = delta_scan_by_adds_exec.pushdown_filter() { + let predicate_node = serialize_physical_expr(pred, self)?; + Some(self.try_encode_message(predicate_node)?) 
+ } else { + None + }; + let statistics = + Some(self.try_encode_statistics(delta_scan_by_adds_exec.statistics())?); NodeKind::DeltaScanByAdds(gen::DeltaScanByAddsExecNode { input, table_url: delta_scan_by_adds_exec.table_url().to_string(), table_schema, + output_schema: Some(output_schema), + scan_config_json, + projection, + limit, + pushdown_filter, + version: delta_scan_by_adds_exec.version(), + statistics, }) } else if let Some(delta_discovery_exec) = node.as_any().downcast_ref::() @@ -1251,6 +1436,13 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { input_partition_columns: delta_discovery_exec.input_partition_columns().to_vec(), input_partition_scan: delta_discovery_exec.input_partition_scan(), }) + } else if let Some(delta_metadata_stats_exec) = + node.as_any().downcast_ref::() + { + NodeKind::DeltaMetadataStats(gen::DeltaMetadataStatsExecNode { + input: self.try_encode_plan(delta_metadata_stats_exec.input().clone())?, + stats_schema: self.try_encode_schema(delta_metadata_stats_exec.stats_schema())?, + }) } else if let Some(delta_remove_actions_exec) = node.as_any().downcast_ref::() { @@ -1259,7 +1451,20 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { } else if let Some(delta_log_replay_exec) = node.as_any().downcast_ref::() { - let input = self.try_encode_plan(delta_log_replay_exec.children()[0].clone())?; + let children = delta_log_replay_exec.children(); + let (input, checkpoint_input, commits_input) = match children.as_slice() { + [input] => (self.try_encode_plan((*input).clone())?, None, None), + [checkpoint_input, commits_input] => ( + Vec::new(), + Some(self.try_encode_plan((*checkpoint_input).clone())?), + Some(self.try_encode_plan((*commits_input).clone())?), + ), + _ => { + return plan_err!( + "DeltaLogReplayExec expects one child for sort replay or two children for hash replay" + ) + } + }; NodeKind::DeltaLogReplay(gen::DeltaLogReplayExecNode { input, table_url: delta_log_replay_exec.table_url().to_string(), @@ -1267,6 +1472,8 
@@ impl PhysicalExtensionCodec for RemoteExecutionCodec { partition_columns: delta_log_replay_exec.partition_columns().to_vec(), checkpoint_files: delta_log_replay_exec.checkpoint_files().to_vec(), commit_files: delta_log_replay_exec.commit_files().to_vec(), + checkpoint_input, + commits_input, }) } else if let Some(console_sink) = node.as_any().downcast_ref::() { let input = self.try_encode_plan(console_sink.input().clone())?; @@ -1410,7 +1617,11 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { NodeKind::IcebergWriter(gen::IcebergWriterExecNode { input, table_url: iceberg_writer_exec.table_url().to_string(), - partition_columns: iceberg_writer_exec.partition_columns().to_vec(), + partition_columns: iceberg_writer_exec + .partition_columns() + .iter() + .map(Self::try_encode_catalog_partition_field) + .collect::>>()?, sink_mode: Some(sink_mode), table_exists: iceberg_writer_exec.table_exists(), options, @@ -1458,6 +1669,24 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { expected_partitions: python_commit_exec.expected_partitions() as u64, input, }) + } else if let Some(catalog_command_exec) = + node.as_any().downcast_ref::() + { + let schema = self.try_encode_schema(catalog_command_exec.schema().as_ref())?; + let command = serde_json::to_string(catalog_command_exec.command()) + .map_err(|e| plan_datafusion_err!("failed to encode CatalogCommand: {e}"))?; + NodeKind::CatalogCommand(gen::CatalogCommandExecNode { schema, command }) + } else if let Some(barrier_exec) = node.as_any().downcast_ref::() { + let preconditions = barrier_exec + .preconditions() + .iter() + .map(|child| self.try_encode_plan(child.clone())) + .collect::>()?; + let plan = self.try_encode_plan(barrier_exec.plan().clone())?; + NodeKind::Barrier(gen::BarrierExecNode { + preconditions, + plan, + }) } else { return plan_err!("unsupported physical plan node: {node:?}"); }; @@ -1615,6 +1844,9 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { "randstr" => 
Ok(Arc::new(ScalarUDF::from(Randstr::new()))), "format_number" => Ok(Arc::new(ScalarUDF::from(FormatNumber::new()))), "soundex" => Ok(Arc::new(ScalarUDF::from(Soundex::new()))), + "st_asbinary" => Ok(Arc::new(ScalarUDF::from(StAsBinary::new()))), + "st_geomfromwkb" => Ok(Arc::new(ScalarUDF::from(StGeomFromWKB::new()))), + "st_geogfromwkb" => Ok(Arc::new(ScalarUDF::from(StGeogFromWKB::new()))), "spark_array" | "spark_make_array" | "array" => { Ok(Arc::new(ScalarUDF::from(SparkArray::new()))) } @@ -1690,6 +1922,11 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { "spark_try_make_timestamp_ntz" | "try_make_timestamp_ntz" => { Ok(Arc::new(ScalarUDF::from(SparkTryMakeTimestampNtz::new()))) } + "spark_make_time" | "make_time" => Ok(Arc::new(ScalarUDF::from(SparkMakeTime::new()))), + "spark_time_diff" | "time_diff" => Ok(Arc::new(ScalarUDF::from(SparkTimeDiff::new()))), + "spark_time_trunc" | "time_trunc" => { + Ok(Arc::new(ScalarUDF::from(SparkTimeTrunc::new()))) + } "spark_mask" | "mask" => Ok(Arc::new(ScalarUDF::from(SparkMask::new()))), "spark_concat_ws" | "concat_ws" => Ok(Arc::new(ScalarUDF::from(SparkConcatWs::new()))), "spark_sequence" | "sequence" => Ok(Arc::new(ScalarUDF::from(SparkSequence::new()))), @@ -1758,6 +1995,9 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { || node_inner.is::() || node_inner.is::() || node_inner.is::() + || node_inner.is::() + || node_inner.is::() + || node_inner.is::() || node_inner.is::() || node_inner.is::() || node_inner.is::() @@ -1799,6 +2039,9 @@ impl PhysicalExtensionCodec for RemoteExecutionCodec { || node_inner.is::() || node_inner.is::() || node_inner.is::() + || node_inner.is::() + || node_inner.is::() + || node_inner.is::() || node_inner.is::() || node_inner.is::() || node_inner.is::() @@ -2227,6 +2470,55 @@ impl RemoteExecutionCodec { Ok(gen::PhysicalSinkMode { mode: Some(mode) }) } + fn try_decode_catalog_partition_field( + &self, + field: gen::CatalogPartitionFieldNode, + ) -> Result { + let 
transform_kind = gen::PartitionTransformKind::try_from(field.transform_kind) + .map_err(|_| plan_datafusion_err!("invalid partition transform kind"))?; + let transform = match transform_kind { + gen::PartitionTransformKind::Unspecified | gen::PartitionTransformKind::Identity => { + None + } + gen::PartitionTransformKind::Year => Some(PartitionTransform::Year), + gen::PartitionTransformKind::Month => Some(PartitionTransform::Month), + gen::PartitionTransformKind::Day => Some(PartitionTransform::Day), + gen::PartitionTransformKind::Hour => Some(PartitionTransform::Hour), + gen::PartitionTransformKind::Bucket => { + Some(PartitionTransform::Bucket(field.transform_value)) + } + gen::PartitionTransformKind::Truncate => { + Some(PartitionTransform::Truncate(field.transform_value)) + } + }; + Ok(CatalogPartitionField { + column: field.column, + transform, + }) + } + + fn try_encode_catalog_partition_field( + field: &CatalogPartitionField, + ) -> Result { + let (transform_kind, transform_value) = match field.transform { + None => (gen::PartitionTransformKind::Unspecified as i32, 0), + Some(PartitionTransform::Identity) => (gen::PartitionTransformKind::Identity as i32, 0), + Some(PartitionTransform::Year) => (gen::PartitionTransformKind::Year as i32, 0), + Some(PartitionTransform::Month) => (gen::PartitionTransformKind::Month as i32, 0), + Some(PartitionTransform::Day) => (gen::PartitionTransformKind::Day as i32, 0), + Some(PartitionTransform::Hour) => (gen::PartitionTransformKind::Hour as i32, 0), + Some(PartitionTransform::Bucket(n)) => (gen::PartitionTransformKind::Bucket as i32, n), + Some(PartitionTransform::Truncate(w)) => { + (gen::PartitionTransformKind::Truncate as i32, w) + } + }; + Ok(gen::CatalogPartitionFieldNode { + column: field.column.clone(), + transform_kind, + transform_value, + }) + } + fn try_decode_stream_udf(&self, udf: ExtendedStreamUdf) -> Result> { let ExtendedStreamUdf { stream_udf_kind } = udf; let stream_udf_kind = match stream_udf_kind { @@ 
-2384,8 +2676,14 @@ impl RemoteExecutionCodec { .map(|x| self.try_decode_message(x)) .collect::>()?; let lex_ordering = LexOrdering::new( - parse_physical_sort_exprs(&lex_ordering, ctx, schema, self) - .map_err(|e| plan_datafusion_err!("failed to decode lex ordering: {e}"))?, + parse_physical_sort_exprs( + &lex_ordering, + ctx, + schema, + self, + &DefaultPhysicalProtoConverter {}, + ) + .map_err(|e| plan_datafusion_err!("failed to decode lex ordering: {e}"))?, ); match lex_ordering { Some(lex_ordering) => Ok(lex_ordering), @@ -2394,7 +2692,11 @@ impl RemoteExecutionCodec { } fn try_encode_lex_ordering(&self, lex_ordering: &LexOrdering) -> Result { - let lex_ordering = serialize_physical_sort_exprs(lex_ordering.to_vec(), self)?; + let lex_ordering = serialize_physical_sort_exprs( + lex_ordering.to_vec(), + self, + &DefaultPhysicalProtoConverter {}, + )?; let lex_ordering = lex_ordering .into_iter() .map(|x| self.try_encode_message(x)) @@ -2805,12 +3107,19 @@ impl RemoteExecutionCodec { ctx: &TaskContext, ) -> Result { let partitioning = self.try_decode_message(buf)?; - parse_protobuf_partitioning(Some(&partitioning), ctx, schema, self)? - .ok_or_else(|| plan_datafusion_err!("no partitioning found")) + parse_protobuf_partitioning( + Some(&partitioning), + ctx, + schema, + self, + &DefaultPhysicalProtoConverter {}, + )? + .ok_or_else(|| plan_datafusion_err!("no partitioning found")) } fn try_encode_partitioning(&self, partitioning: &Partitioning) -> Result> { - let partitioning = serialize_partitioning(partitioning, self)?; + let partitioning = + serialize_partitioning(partitioning, self, &DefaultPhysicalProtoConverter {})?; self.try_encode_message(partitioning) } @@ -2841,6 +3150,15 @@ impl RemoteExecutionCodec { self.try_encode_message::(schema.try_into()?) 
} + fn try_decode_statistics(&self, buf: &[u8]) -> Result { + let statistics = self.try_decode_message::(buf)?; + (&statistics).try_into() + } + + fn try_encode_statistics(&self, statistics: &Statistics) -> Result> { + self.try_encode_message::(statistics.into()) + } + fn try_decode_message(&self, buf: &[u8]) -> Result where M: Message + Default, diff --git a/crates/sail-execution/src/driver/actor/handler.rs b/crates/sail-execution/src/driver/actor/handler.rs index f721af56b8..52c8089c4d 100644 --- a/crates/sail-execution/src/driver/actor/handler.rs +++ b/crates/sail-execution/src/driver/actor/handler.rs @@ -509,6 +509,11 @@ impl DriverActor { } } + /// Assigns pending tasks to available workers and dispatches them for execution. + /// + /// Gets task assignments from the task assigner, builds task definitions from the job + /// scheduler, and dispatches each task to either the driver or a remote worker via gRPC. + /// Tasks that fail to build a definition are reported as failed. fn run_tasks(&mut self, ctx: &mut ActorContext) { let assignments = self.task_assigner.assign_tasks(); self.task_assigner.track_streams(&assignments); diff --git a/crates/sail-execution/src/driver/job_scheduler/core.rs b/crates/sail-execution/src/driver/job_scheduler/core.rs index 12768547a6..f25022ff5b 100644 --- a/crates/sail-execution/src/driver/job_scheduler/core.rs +++ b/crates/sail-execution/src/driver/job_scheduler/core.rs @@ -503,6 +503,7 @@ impl JobScheduler { actions } + /// Builds the serialized task definition and context for the given task key. 
pub fn get_task_definition( &self, key: &TaskKey, diff --git a/crates/sail-execution/src/driver/job_scheduler/state.rs b/crates/sail-execution/src/driver/job_scheduler/state.rs index c36927b14d..f744a3ad12 100644 --- a/crates/sail-execution/src/driver/job_scheduler/state.rs +++ b/crates/sail-execution/src/driver/job_scheduler/state.rs @@ -12,6 +12,7 @@ use crate::error::ExecutionResult; use crate::id::JobId; use crate::job_graph::JobGraph; +/// Tracks graph/topology and runtime state for a single job. pub struct JobDescriptor { pub graph: JobGraph, pub topology: JobTopology, diff --git a/crates/sail-execution/src/driver/job_scheduler/topology.rs b/crates/sail-execution/src/driver/job_scheduler/topology.rs index 333af8ad48..802b046773 100644 --- a/crates/sail-execution/src/driver/job_scheduler/topology.rs +++ b/crates/sail-execution/src/driver/job_scheduler/topology.rs @@ -36,6 +36,7 @@ pub struct StageTopology { } impl JobTopology { + /// Groups pipelined stages into components and builds the topology of task regions and stages. pub fn try_new(graph: &JobGraph) -> ExecutionResult { let mut stages = (0..graph.stages().len()) .map(|_| StageTopology { consumers: vec![] }) @@ -88,7 +89,6 @@ impl JobTopology { } } - // generate task region topology let mut regions = vec![]; for component in components { diff --git a/crates/sail-execution/src/driver/task_assigner/core.rs b/crates/sail-execution/src/driver/task_assigner/core.rs index 47f451dd35..c5aa72aa4e 100644 --- a/crates/sail-execution/src/driver/task_assigner/core.rs +++ b/crates/sail-execution/src/driver/task_assigner/core.rs @@ -163,6 +163,7 @@ impl TaskAssigner { Some(assignment.clone()) } + /// Records local and remote stream ownership for each resource based on the given task assignments. 
pub fn track_streams(&mut self, assignments: &[TaskSetAssignment]) { for assignment in assignments { self.driver.track_remote_streams(&assignment.set); @@ -236,6 +237,7 @@ impl TaskAssigner { } } + /// Builds a snapshot of available task slots across the driver and active workers for assignment. fn build_worker_task_slot_assigner(&self) -> TaskSlotAssigner { let slots = self .workers @@ -268,6 +270,7 @@ impl TaskAssignmentGetter for TaskAssigner { } } +/// Assigns task regions to driver or worker slots, consuming available slots as tasks are placed. struct TaskSlotAssigner { /// The available task slots on workers. slots: Vec<(WorkerId, Vec)>, diff --git a/crates/sail-execution/src/driver/task_assigner/mod.rs b/crates/sail-execution/src/driver/task_assigner/mod.rs index 1d186c06dc..8425afe9d8 100644 --- a/crates/sail-execution/src/driver/task_assigner/mod.rs +++ b/crates/sail-execution/src/driver/task_assigner/mod.rs @@ -11,6 +11,7 @@ use crate::driver::task_assigner::state::{DriverResource, WorkerResource}; use crate::id::{TaskKey, WorkerId}; use crate::task::scheduling::{TaskAssignment, TaskRegion}; +/// Manages task queuing and assignment across the driver and worker slots. pub struct TaskAssigner { options: TaskAssignerOptions, driver: DriverResource, @@ -18,11 +19,14 @@ pub struct TaskAssigner { requested_worker_count: usize, /// A lookup table from task attempts to the place they are assigned to. /// This is more convenient than finding the task attempt in the task slots. + /// /// Each task attempt can only be assigned once throughout its lifetime. + /// /// This lookup table is updated when the task attempt is assigned, /// but there is no need to remove the task attempt when it is completed, as /// the mapping is still valid for historical purposes. task_assignments: IndexMap, + /// Pending task regions waiting to be assigned to available driver or worker slots. 
task_queue: VecDeque, } diff --git a/crates/sail-execution/src/driver/task_assigner/state.rs b/crates/sail-execution/src/driver/task_assigner/state.rs index 0926e11d0b..c08e9b6a44 100644 --- a/crates/sail-execution/src/driver/task_assigner/state.rs +++ b/crates/sail-execution/src/driver/task_assigner/state.rs @@ -18,6 +18,7 @@ pub struct DriverResource { } impl DriverResource { + /// Assigns a task set to the first available driver slot. pub fn add_task_set(&mut self, set: TaskSet) { for slot in &mut self.task_slots { if slot.is_vacant() { @@ -71,6 +72,7 @@ impl DriverResource { } #[derive(Debug)] +/// Represents the current state of a worker's resources as seen by the task assigner. pub enum WorkerResource { Active { /// The task slots on the worker. @@ -94,6 +96,7 @@ pub enum WorkerResource { } impl WorkerResource { + /// Assigns a task set to the specified slot on this worker. pub fn add_task_set(&mut self, slot: usize, set: TaskSet) { match self { WorkerResource::Active { task_slots, .. } => { diff --git a/crates/sail-execution/src/driver/worker_pool/core.rs b/crates/sail-execution/src/driver/worker_pool/core.rs index 958625e945..c64c49278e 100644 --- a/crates/sail-execution/src/driver/worker_pool/core.rs +++ b/crates/sail-execution/src/driver/worker_pool/core.rs @@ -255,6 +255,7 @@ impl WorkerPool { } } + /// Dispatches a task to a specific worker by sending the task definition over gRPC. pub fn run_task( &mut self, ctx: &mut ActorContext, diff --git a/crates/sail-execution/src/id.rs b/crates/sail-execution/src/id.rs index 2da9b7b645..ea967e6d55 100644 --- a/crates/sail-execution/src/id.rs +++ b/crates/sail-execution/src/id.rs @@ -90,6 +90,7 @@ where } } +/// Uniquely identifies a task attempt within a job by stage, partition, and attempt number. 
#[derive(Debug, Clone, Eq, Hash, PartialEq)] pub struct TaskKey { pub job_id: JobId, diff --git a/crates/sail-execution/src/job_graph/mod.rs b/crates/sail-execution/src/job_graph/mod.rs index 0ba9e923d6..f40dfde011 100644 --- a/crates/sail-execution/src/job_graph/mod.rs +++ b/crates/sail-execution/src/job_graph/mod.rs @@ -96,6 +96,7 @@ pub struct Stage { pub placement: TaskPlacement, } +/// Specifies whether a task must run on the driver or on any available worker node. #[derive(Debug, Clone, Copy, Eq, PartialEq, Hash)] pub enum TaskPlacement { Driver, diff --git a/crates/sail-execution/src/job_graph/planner.rs b/crates/sail-execution/src/job_graph/planner.rs index 8b8122f784..934b075383 100644 --- a/crates/sail-execution/src/job_graph/planner.rs +++ b/crates/sail-execution/src/job_graph/planner.rs @@ -16,6 +16,7 @@ use datafusion::physical_plan::{ }; use sail_catalog_system::physical_plan::SystemTableExec; use sail_common_datafusion::utils::items::ItemTaker; +use sail_physical_plan::catalog_command::CatalogCommandExec; use crate::error::{ExecutionError, ExecutionResult}; use crate::job_graph::{ @@ -137,9 +138,10 @@ fn ensure_partitioned_hash_join_if_build_side_emits_unmatched_rows( join.on.clone(), join.filter.clone(), &join.join_type, - join.projection.clone(), + join.projection.as_deref().map(|p| p.to_vec()), PartitionMode::Partitioned, join.null_equality, + false, )?))) })?; @@ -164,6 +166,7 @@ enum PartitionUsage { Shared, } +/// Recursively splits an execution plan into stages at shuffle boundaries and adds them to the job graph. 
fn build_job_graph( plan: Arc, usage: PartitionUsage, @@ -201,10 +204,12 @@ fn build_job_graph( build_job_graph(left.clone(), PartitionUsage::Shared, graph)?, build_job_graph(right.clone(), usage, graph)?, ] - } else if plan.as_any().is::() || plan.as_any().is::() + } else if plan.as_any().is::() + || plan.as_any().is::() + || plan.as_any().is::() { let child = plan.children().one()?; - // At the shuffle boundary, we only expect to use the child partition once + // At the stage boundary, we only expect to use the child partition once // since the shuffle writer can materialize the data for multiple consumption. vec![build_job_graph(child.clone(), PartitionUsage::Once, graph)?] } else { @@ -233,7 +238,12 @@ fn build_job_graph( match &properties.partitioning { Partitioning::UnknownPartitioning(n) => { let n = *n; - let properties = properties.with_partitioning(Partitioning::RoundRobinBatch(n)); + let properties = Arc::new( + properties + .as_ref() + .clone() + .with_partitioning(Partitioning::RoundRobinBatch(n)), + ); create_shuffle(child, graph, properties, consumption)? } Partitioning::RoundRobinBatch(_) | Partitioning::Hash(_, _) => { @@ -254,7 +264,7 @@ fn build_job_graph( let child = plan.children().one()?; plan.clone() .with_new_children(vec![create_merge_input(child, graph)?])? - } else if plan.as_any().is::() { + } else if plan.as_any().is::() || plan.as_any().is::() { plan.children().zero()?; create_driver_stage(&plan, graph)? } else { @@ -293,7 +303,7 @@ fn create_shuffle( graph: &mut JobGraph, // These are the properties after repartition/coalesce, // which are different from the properties of the input plan. 
- properties: PlanProperties, + properties: Arc, consumption: ShuffleConsumption, ) -> ExecutionResult> { let distribution = match properties.partitioning.clone() { diff --git a/crates/sail-execution/src/job_runner.rs b/crates/sail-execution/src/job_runner.rs index a86557d015..17255b54cb 100644 --- a/crates/sail-execution/src/job_runner.rs +++ b/crates/sail-execution/src/job_runner.rs @@ -8,7 +8,7 @@ use datafusion::prelude::SessionContext; use sail_common_datafusion::session::job::{JobRunner, JobRunnerHistory}; use sail_common_datafusion::system::observable::{JobRunnerObserver, Observer, StateObservable}; use sail_server::actor::{ActorHandle, ActorSystem}; -use sail_telemetry::telemetry::global_metric_registry; +use sail_telemetry::telemetry::global_metrics; use sail_telemetry::{trace_execution_plan, TracingExecOptions}; use tokio::sync::mpsc::error::SendError; use tokio::sync::oneshot; @@ -54,7 +54,7 @@ impl JobRunner for LocalJobRunner { } let job_id = self.next_job_id.fetch_add(1, Ordering::Relaxed); let options = TracingExecOptions { - metric_registry: global_metric_registry(), + metrics: global_metrics(), job_id: Some(job_id), stage: None, attempt: None, diff --git a/crates/sail-execution/src/plan/shuffle_read.rs b/crates/sail-execution/src/plan/shuffle_read.rs index b634f51e4a..0a49ab5d27 100644 --- a/crates/sail-execution/src/plan/shuffle_read.rs +++ b/crates/sail-execution/src/plan/shuffle_read.rs @@ -19,7 +19,7 @@ use crate::stream::reader::{TaskReadLocation, TaskStreamReader}; pub struct ShuffleReadExec { /// For each output partition, a list of locations to read from. 
locations: Vec>, - properties: PlanProperties, + properties: Arc, reader: Arc, } @@ -27,7 +27,7 @@ impl ShuffleReadExec { pub fn new( locations: Vec>, reader: Arc, - properties: PlanProperties, + properties: Arc, ) -> Self { Self { locations, @@ -57,7 +57,7 @@ impl ExecutionPlan for ShuffleReadExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-execution/src/plan/shuffle_write.rs b/crates/sail-execution/src/plan/shuffle_write.rs index 3be3046dcc..c12964bdf7 100644 --- a/crates/sail-execution/src/plan/shuffle_write.rs +++ b/crates/sail-execution/src/plan/shuffle_write.rs @@ -30,7 +30,7 @@ pub struct ShuffleWriteExec { shuffle_partitioning: Partitioning, /// For each input partition, a list of locations to write to. locations: Vec>, - properties: PlanProperties, + properties: Arc, writer: Arc, } @@ -54,7 +54,7 @@ impl ShuffleWriteExec { } _ => partitioning, }; - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(Arc::new(Schema::empty())), // The shuffle write plan has the same number of partitions as the input plan. 
// For each partition that are executed, the data is further partitioned according to @@ -66,7 +66,7 @@ impl ShuffleWriteExec { Boundedness::Unbounded { requires_infinite_memory: false, }, - ); + )); Self { plan, shuffle_partitioning: partitioning, @@ -97,7 +97,7 @@ impl ExecutionPlan for ShuffleWriteExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-execution/src/plan/stage_input.rs b/crates/sail-execution/src/plan/stage_input.rs index ff6d41e454..1a6dbe23be 100644 --- a/crates/sail-execution/src/plan/stage_input.rs +++ b/crates/sail-execution/src/plan/stage_input.rs @@ -11,11 +11,11 @@ use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, Pla #[derive(Debug, Clone)] pub struct StageInputExec { input: I, - properties: PlanProperties, + properties: Arc, } impl StageInputExec { - pub fn new(input: I, properties: PlanProperties) -> Self { + pub fn new(input: I, properties: Arc) -> Self { Self { input, properties } } @@ -50,7 +50,7 @@ where self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-execution/src/task/scheduling.rs b/crates/sail-execution/src/task/scheduling.rs index 5282f34c84..c11598a8c1 100644 --- a/crates/sail-execution/src/task/scheduling.rs +++ b/crates/sail-execution/src/task/scheduling.rs @@ -30,12 +30,14 @@ pub struct TaskSet { pub entries: Vec, } +/// A single task within a task set, pairing a task key with its output stream kind. #[derive(Debug, Clone)] pub struct TaskSetEntry { pub key: TaskKey, pub output: TaskOutputKind, } +/// Whether a task's output stream is stored locally on the executing node or written to a remote location. #[derive(Debug, Clone)] pub enum TaskOutputKind { Local, @@ -72,6 +74,7 @@ impl TaskSet { } } +/// Pairs a TaskSet with an execution location. 
#[derive(Debug, Clone)] pub struct TaskSetAssignment { pub set: TaskSet, diff --git a/crates/sail-execution/src/task_runner/core.rs b/crates/sail-execution/src/task_runner/core.rs index ec77746256..e137c19d80 100644 --- a/crates/sail-execution/src/task_runner/core.rs +++ b/crates/sail-execution/src/task_runner/core.rs @@ -16,7 +16,7 @@ use sail_common_datafusion::error::CommonErrorCause; use sail_delta_lake::physical_plan::DeltaPhysicalExprAdapterFactory; use sail_python_udf::error::PyErrExtractor; use sail_server::actor::{Actor, ActorContext}; -use sail_telemetry::telemetry::global_metric_registry; +use sail_telemetry::telemetry::global_metrics; use sail_telemetry::{trace_execution_plan, TracingExecOptions}; use tokio::sync::oneshot; @@ -73,6 +73,7 @@ impl TaskRunner { } } + /// Deserializes and prepares a physical plan for execution on this node. fn execute_plan( &mut self, ctx: &mut ActorContext, @@ -100,7 +101,7 @@ impl TaskRunner { DisplayableExecutionPlan::new(plan.as_ref()).indent(true) ); let options = TracingExecOptions { - metric_registry: global_metric_registry(), + metrics: global_metrics(), job_id: Some(key.job_id.into()), stage: Some(key.stage), attempt: Some(key.attempt), diff --git a/crates/sail-execution/src/task_runner/monitor.rs b/crates/sail-execution/src/task_runner/monitor.rs index ee0d003a20..65fe5a4ef7 100644 --- a/crates/sail-execution/src/task_runner/monitor.rs +++ b/crates/sail-execution/src/task_runner/monitor.rs @@ -36,6 +36,7 @@ impl TaskMonitor where T::Message: TaskRunnerMessage, { + /// Runs the task monitor, reporting running and terminal status updates. pub async fn run(self) { let Self { handle, @@ -52,10 +53,12 @@ where let _ = handle.send(event).await; } + /// Builds a "task is running" status message. fn running(key: TaskKey) -> T::Message { T::Message::report_task_status(key, TaskStatus::Running, None, None) } + /// Waits for a cancellation signal and builds a canceled status message. 
async fn cancel(key: TaskKey, signal: oneshot::Receiver<()>) -> T::Message { let _ = signal.await; T::Message::report_task_status( @@ -66,6 +69,7 @@ where ) } + /// Drains the output stream and builds a succeeded or failed status message. async fn execute(key: TaskKey, mut stream: SendableRecordBatchStream) -> T::Message { let event = loop { let Some(batch) = stream.next().await else { diff --git a/crates/sail-execution/src/worker/client.rs b/crates/sail-execution/src/worker/client.rs index 33a0ffa1c3..04cc5df0ef 100644 --- a/crates/sail-execution/src/worker/client.rs +++ b/crates/sail-execution/src/worker/client.rs @@ -41,6 +41,7 @@ impl WorkerClient { } impl WorkerClient { + /// Sends a task to a remote worker for execution via gRPC. pub async fn run_task( &self, key: TaskKey, diff --git a/crates/sail-execution/src/worker_manager/kubernetes.rs b/crates/sail-execution/src/worker_manager/kubernetes.rs index 00ea2dda84..20b3b0939b 100644 --- a/crates/sail-execution/src/worker_manager/kubernetes.rs +++ b/crates/sail-execution/src/worker_manager/kubernetes.rs @@ -9,7 +9,7 @@ use k8s_openapi::apimachinery::pkg::apis::meta::v1::{ObjectMeta, OwnerReference} use k8s_openapi::{DeepMerge, Resource}; use kube::Api; use rand::distr::Uniform; -use rand::Rng; +use rand::RngExt; use sail_common::config::ClusterConfigEnv; use sail_server::RetryStrategy; use sail_telemetry::common::ContextPropagationEnv; diff --git a/crates/sail-function/Cargo.toml b/crates/sail-function/Cargo.toml index 5f9506637f..8a526b6d47 100644 --- a/crates/sail-function/Cargo.toml +++ b/crates/sail-function/Cargo.toml @@ -36,4 +36,5 @@ half = { workspace = true } url = { workspace = true } percent-encoding = { workspace = true } jiter = { workspace = true } +thiserror = { workspace = true } serde_json = { workspace = true } diff --git a/crates/sail-function/src/aggregate/max_min_by.rs b/crates/sail-function/src/aggregate/max_min_by.rs index 5bd3f202ec..8f0b5df7fa 100644 --- 
a/crates/sail-function/src/aggregate/max_min_by.rs +++ b/crates/sail-function/src/aggregate/max_min_by.rs @@ -11,7 +11,7 @@ use datafusion::error::DataFusionError; use datafusion::functions_aggregate::first_last::last_value_udaf; use datafusion::logical_expr::expr::{AggregateFunction, Sort}; use datafusion::logical_expr::function::{AccumulatorArgs, StateFieldsArgs}; -use datafusion::logical_expr::simplify::SimplifyInfo; +use datafusion::logical_expr::simplify::SimplifyContext; use datafusion::logical_expr::utils::format_state_name; use datafusion::logical_expr::{function, Accumulator, AggregateUDFImpl, Signature, Volatility}; use datafusion::prelude::Expr; @@ -164,7 +164,7 @@ impl AggregateUDFImpl for MaxByFunction { } fn simplify(&self) -> Option { - let simplify = |mut aggr_func: AggregateFunction, _: &dyn SimplifyInfo| { + let simplify = |mut aggr_func: AggregateFunction, _: &SimplifyContext| { let mut order_by = aggr_func.params.order_by; let (second_arg, first_arg) = ( aggr_func.params.args.remove(1), @@ -270,7 +270,7 @@ impl AggregateUDFImpl for MinByFunction { } fn simplify(&self) -> Option { - let simplify = |mut aggr_func: AggregateFunction, _: &dyn SimplifyInfo| { + let simplify = |mut aggr_func: AggregateFunction, _: &SimplifyContext| { let mut order_by = aggr_func.params.order_by; let (second_arg, first_arg) = ( aggr_func.params.args.remove(1), diff --git a/crates/sail-function/src/functions_nested_utils.rs b/crates/sail-function/src/functions_nested_utils.rs index 700f29d56b..a08ea2c78f 100644 --- a/crates/sail-function/src/functions_nested_utils.rs +++ b/crates/sail-function/src/functions_nested_utils.rs @@ -16,7 +16,8 @@ macro_rules! opt_downcast_arg { }}; } -pub(crate) use {downcast_arg, opt_downcast_arg}; +pub(crate) use downcast_arg; +pub(crate) use opt_downcast_arg; /// array function wrapper that differentiates between scalar (length 1) and array. 
pub(crate) fn make_scalar_function( diff --git a/crates/sail-function/src/scalar/array/spark_array.rs b/crates/sail-function/src/scalar/array/spark_array.rs index 95bd477040..127586ec63 100644 --- a/crates/sail-function/src/scalar/array/spark_array.rs +++ b/crates/sail-function/src/scalar/array/spark_array.rs @@ -111,6 +111,16 @@ impl ScalarUDFImpl for SparkArray { plan_err!("Coercion from {acc:?} to {x:?} failed.") } })?; + // When any input is a floating-point type (Double/Float), keep it as Double + // instead of promoting to Decimal128. Floats support NaN/Infinity which + // Decimal128 cannot represent, causing runtime overflow errors. + let new_type = if matches!(new_type, DataType::Decimal128(_, _)) + && arg_types.iter().any(|dt| dt.is_floating()) + { + DataType::Float64 + } else { + new_type + }; Ok(vec![new_type; arg_types.len()]) } } diff --git a/crates/sail-function/src/scalar/array/spark_array_min_max.rs b/crates/sail-function/src/scalar/array/spark_array_min_max.rs index 7877fa1072..b734eba8ef 100644 --- a/crates/sail-function/src/scalar/array/spark_array_min_max.rs +++ b/crates/sail-function/src/scalar/array/spark_array_min_max.rs @@ -51,15 +51,17 @@ impl ScalarUDFImpl for ArrayMin { fn return_type(&self, arg_types: &[DataType]) -> Result { match &arg_types[0] { - DataType::List(field) - | DataType::LargeList(field) - | DataType::FixedSizeList(field, _) => Ok(field.data_type().clone()), - _ => plan_err!("ArrayMin can only accept List, LargeList or FixedSizeList."), + DataType::List(field) | DataType::LargeList(field) => Ok(field.data_type().clone()), + DataType::Null => Ok(DataType::Null), + _ => plan_err!("ArrayMin can only accept List or LargeList."), } } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { let ScalarFunctionArgs { args, .. 
} = args; + if args[0].data_type() == DataType::Null { + return Ok(ColumnarValue::Scalar(ScalarValue::Null)); + } make_scalar_function(array_min_inner)(&args) } } @@ -98,15 +100,17 @@ impl ScalarUDFImpl for ArrayMax { fn return_type(&self, arg_types: &[DataType]) -> Result { match &arg_types[0] { - DataType::List(field) - | DataType::LargeList(field) - | DataType::FixedSizeList(field, _) => Ok(field.data_type().clone()), - _ => plan_err!("ArrayMax can only accept List, LargeList or FixedSizeList."), + DataType::List(field) | DataType::LargeList(field) => Ok(field.data_type().clone()), + DataType::Null => Ok(DataType::Null), + _ => plan_err!("ArrayMax can only accept List or LargeList."), } } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { let ScalarFunctionArgs { args, .. } = args; + if args[0].data_type() == DataType::Null { + return Ok(ColumnarValue::Scalar(ScalarValue::Null)); + } make_scalar_function(array_max_inner)(&args) } } diff --git a/crates/sail-function/src/scalar/array/spark_sequence.rs b/crates/sail-function/src/scalar/array/spark_sequence.rs index f8b6c1e0d8..651e4c8b34 100644 --- a/crates/sail-function/src/scalar/array/spark_sequence.rs +++ b/crates/sail-function/src/scalar/array/spark_sequence.rs @@ -324,12 +324,16 @@ fn gen_sequence_date(args: &[ArrayRef]) -> Result { } let mut new_date = start; + let mut overflow = false; let values = from_fn(|| { - if (negative && new_date < stop) || (!negative && new_date > stop) { + if overflow || (negative && new_date < stop) || (!negative && new_date > stop) { None } else { let current_date = new_date; - new_date = Date32Type::add_month_day_nano(new_date, step); + match Date32Type::add_month_day_nano_opt(new_date, step) { + Some(next) => new_date = next, + None => overflow = true, + } Some(Some(current_date)) } }); diff --git a/crates/sail-function/src/scalar/datetime/mod.rs b/crates/sail-function/src/scalar/datetime/mod.rs index 09e3a61d16..f2f72ecbc1 100644 --- 
a/crates/sail-function/src/scalar/datetime/mod.rs +++ b/crates/sail-function/src/scalar/datetime/mod.rs @@ -4,9 +4,12 @@ pub mod spark_date; pub mod spark_date_part; pub mod spark_interval; pub mod spark_last_day; +pub mod spark_make_time; pub mod spark_make_timestamp; pub mod spark_make_ym_interval; pub mod spark_next_day; +pub mod spark_time_diff; +pub mod spark_time_trunc; pub mod spark_timestamp; pub mod spark_to_chrono_fmt; pub mod spark_try_make_timestamp_ntz; diff --git a/crates/sail-function/src/scalar/datetime/spark_last_day.rs b/crates/sail-function/src/scalar/datetime/spark_last_day.rs index fa37d2aed3..6b8f0bb66b 100644 --- a/crates/sail-function/src/scalar/datetime/spark_last_day.rs +++ b/crates/sail-function/src/scalar/datetime/spark_last_day.rs @@ -104,7 +104,9 @@ impl ScalarUDFImpl for SparkLastDay { } fn spark_last_day(days: i32) -> Result { - let date = Date32Type::to_naive_date(days); + let date = Date32Type::to_naive_date_opt(days).ok_or_else(|| { + exec_datafusion_err!("Spark `last_day`: Unable to parse date from days: {days}") + })?; let (year, month) = (date.year(), date.month()); let (next_year, next_month) = if month == 12 { diff --git a/crates/sail-function/src/scalar/datetime/spark_make_time.rs b/crates/sail-function/src/scalar/datetime/spark_make_time.rs new file mode 100644 index 0000000000..f8c1feec24 --- /dev/null +++ b/crates/sail-function/src/scalar/datetime/spark_make_time.rs @@ -0,0 +1,175 @@ +use std::any::Any; +use std::sync::Arc; + +use datafusion::arrow::array::{Array, PrimitiveBuilder}; +use datafusion::arrow::datatypes::{DataType, Time64MicrosecondType, TimeUnit}; +use datafusion_common::types::NativeType; +use datafusion_common::{exec_err, plan_err, Result, ScalarValue}; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; + +use crate::scalar::datetime::utils::{to_decimal128_array, to_int32_array}; + +// Seconds argument is coerced to Decimal128(16, 6) — matching Spark's 
DecimalType(16, 6). +// The unscaled i128 value is therefore already in microseconds (value * 10^-6 gives seconds, +// so unscaled directly equals whole_seconds * 1_000_000 + microsecond_fraction). +const SECONDS_PRECISION: u8 = 16; +const SECONDS_SCALE: i8 = 6; + +// Scale factor: 10^6, the number of microseconds per whole second. +const SCALE_FACTOR: i128 = 1_000_000; + +const MICROS_PER_MINUTE: i64 = 60 * 1_000_000; +const MICROS_PER_HOUR: i64 = 60 * MICROS_PER_MINUTE; + +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkMakeTime { + signature: Signature, +} + +impl Default for SparkMakeTime { + fn default() -> Self { + Self::new() + } +} + +impl SparkMakeTime { + pub fn new() -> Self { + Self { + signature: Signature::user_defined(Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkMakeTime { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "spark_make_time" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Time64(TimeUnit::Microsecond)) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let ScalarFunctionArgs { + args, number_rows, .. 
+ } = args; + + if args.len() != 3 { + return exec_err!( + "Spark `make_time` function requires 3 arguments, got {}", + args.len() + ); + } + + // Handle NULL propagation for scalar inputs + let contains_scalar_null = args.iter().any(|arg| { + matches!( + arg, + ColumnarValue::Scalar(ScalarValue::Int32(None)) + | ColumnarValue::Scalar(ScalarValue::Decimal128(None, _, _)) + | ColumnarValue::Scalar(ScalarValue::Null) + ) + }); + + if contains_scalar_null { + return Ok(ColumnarValue::Scalar(ScalarValue::Time64Microsecond(None))); + } + + let hours = to_int32_array(&args[0], "hour", "make_time", number_rows)?; + let minutes = to_int32_array(&args[1], "minute", "make_time", number_rows)?; + let seconds = to_decimal128_array(&args[2], "second", "make_time", number_rows)?; + + let mut builder = PrimitiveBuilder::::with_capacity(number_rows); + + for i in 0..number_rows { + if hours.is_null(i) || minutes.is_null(i) || seconds.is_null(i) { + builder.append_null(); + continue; + } + + let h = hours.value(i); + let m = minutes.value(i); + let sec_unscaled = seconds.value(i); + + builder.append_value(make_time(h, m, sec_unscaled)?); + } + + Ok(ColumnarValue::Array(Arc::new(builder.finish()))) + } + + fn coerce_types(&self, arg_types: &[DataType]) -> Result> { + if arg_types.len() != 3 { + return exec_err!( + "Spark `make_time` function requires 3 arguments, got {}", + arg_types.len() + ); + } + + let hour: NativeType = (&arg_types[0]).into(); + let minute: NativeType = (&arg_types[1]).into(); + let second: NativeType = (&arg_types[2]).into(); + + if (hour.is_integer() || matches!(hour, NativeType::String | NativeType::Null)) + && (minute.is_integer() || matches!(minute, NativeType::String | NativeType::Null)) + && (second.is_numeric() || matches!(second, NativeType::String | NativeType::Null)) + { + // Use Decimal128(16, 6) for seconds, matching Spark's DecimalType(16, 6). + // This avoids f64 rounding errors (e.g. 59.9999999 rounding up to 60 seconds). 
+ Ok(vec![ + DataType::Int32, + DataType::Int32, + DataType::Decimal128(SECONDS_PRECISION, SECONDS_SCALE), + ]) + } else { + plan_err!("The arguments of Spark `make_time` must be (integer, integer, numeric)") + } + } +} + +/// Create a TIME value (microseconds since midnight) from hour, minute, and second components. +/// +/// Matches Spark 4.1 `MakeTime` / `DateTimeUtils.makeTime` semantics: +/// - hour: 0 to 23 +/// - minute: 0 to 59 +/// - sec_unscaled: the unscaled i128 of a `Decimal128(16, 6)` value, +/// where actual_seconds = sec_unscaled / 1_000_000. +/// Valid range: [0, 60_000_000) — i.e., [0.000000, 59.999999]. +/// +/// `sec_unscaled` is the raw integer from `Decimal128(16, 6)`, matching Spark's +/// `DecimalType(16, 6)`. Using the unscaled integer directly avoids the f64 rounding +/// trap where values like 59.9999999 would round up to 60_000_000 µs (= 60 s) and +/// silently overflow. Instead, 59.9999999 as Decimal(16,6) has unscaled = 59_999_999, +/// which is within range; an input of exactly 60.0 (unscaled = 60_000_000) correctly errors. +/// +/// Errors on invalid inputs, matching Spark's `DATETIME_FIELD_OUT_OF_BOUNDS` behaviour. +fn make_time(hour: i32, minute: i32, sec_unscaled: i128) -> Result { + if !(0..=23).contains(&hour) { + return exec_err!("make_time: Invalid value for HourOfDay (valid values 0 - 23): {hour}"); + } + if !(0..=59).contains(&minute) { + return exec_err!( + "make_time: Invalid value for MinuteOfHour (valid values 0 - 59): {minute}" + ); + } + // Unscaled range for valid seconds [0.000000, 59.999999] is [0, 59_999_999]. + if !(0..60 * SCALE_FACTOR).contains(&sec_unscaled) { + return exec_err!( + "make_time: Invalid value for SecondOfMinute (valid values 0 - 59): {}", + ScalarValue::Decimal128(Some(sec_unscaled), SECONDS_PRECISION, SECONDS_SCALE) + ); + } + + // sec_unscaled is already in microseconds because Decimal(16,6) has scale 6, + // so no further conversion is needed — just cast to i64. 
+ let sec_micros = sec_unscaled as i64; + + Ok(hour as i64 * MICROS_PER_HOUR + minute as i64 * MICROS_PER_MINUTE + sec_micros) +} diff --git a/crates/sail-function/src/scalar/datetime/spark_next_day.rs b/crates/sail-function/src/scalar/datetime/spark_next_day.rs index 14a2c87d76..0af6f42713 100644 --- a/crates/sail-function/src/scalar/datetime/spark_next_day.rs +++ b/crates/sail-function/src/scalar/datetime/spark_next_day.rs @@ -189,7 +189,7 @@ where } fn spark_next_day(days: i32, day_of_week: &str) -> Option { - let date = Date32Type::to_naive_date(days); + let date = Date32Type::to_naive_date_opt(days)?; let day_of_week = day_of_week.trim().to_uppercase(); let day_of_week = match day_of_week.as_str() { diff --git a/crates/sail-function/src/scalar/datetime/spark_time_diff.rs b/crates/sail-function/src/scalar/datetime/spark_time_diff.rs new file mode 100644 index 0000000000..0a51325958 --- /dev/null +++ b/crates/sail-function/src/scalar/datetime/spark_time_diff.rs @@ -0,0 +1,219 @@ +use std::any::Any; +use std::sync::Arc; + +use datafusion::arrow::array::types::Time64MicrosecondType; +use datafusion::arrow::array::{ + new_null_array, Array, ArrayRef, AsArray, Int64Builder, PrimitiveArray, StringArrayType, +}; +use datafusion::arrow::datatypes::{DataType, TimeUnit}; +use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; + +use crate::scalar::datetime::utils::to_time64_array; + +/// Returns the divisor in microseconds for a given time_diff unit string. +/// Returns `None` for unsupported units. 
+fn unit_divisor(unit: &str) -> Option { + match unit.to_uppercase().as_str() { + "MICROSECOND" => Some(1), + "MILLISECOND" => Some(1_000), + "SECOND" => Some(1_000_000), + "MINUTE" => Some(60_000_000), + "HOUR" => Some(3_600_000_000), + _ => None, + } +} + +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkTimeDiff { + signature: Signature, +} + +impl Default for SparkTimeDiff { + fn default() -> Self { + Self::new() + } +} + +impl SparkTimeDiff { + pub fn new() -> Self { + Self { + signature: Signature::user_defined(Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkTimeDiff { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "spark_time_diff" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Int64) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let ScalarFunctionArgs { + args, number_rows, .. + } = args; + + let [unit_arg, start_arg, end_arg] = args.as_slice() else { + return exec_err!( + "Spark `time_diff` function requires 3 arguments, got {}", + args.len() + ); + }; + + // All-scalar fast path — return a scalar value directly. 
+ if let ( + ColumnarValue::Scalar(unit_sv), + ColumnarValue::Scalar(start_sv), + ColumnarValue::Scalar(end_sv), + ) = (unit_arg, start_arg, end_arg) + { + let unit_opt = match unit_sv { + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => Some(s.as_str()), + ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) | ScalarValue::Null => None, + _ => return exec_err!("time_diff: unit must be a string"), + }; + let start_opt = match start_sv { + ScalarValue::Time64Microsecond(v) => *v, + _ => return exec_err!("time_diff: start must be TIME"), + }; + let end_opt = match end_sv { + ScalarValue::Time64Microsecond(v) => *v, + _ => return exec_err!("time_diff: end must be TIME"), + }; + let result = match (unit_opt, start_opt, end_opt) { + (Some(unit), Some(start), Some(end)) => { + let divisor = match unit_divisor(unit) { + Some(d) => d, + None => return exec_err!( + "time_diff: unsupported unit '{}'. Supported: HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND", + unit + ), + }; + Some((end - start) / divisor) + } + _ => None, + }; + return Ok(ColumnarValue::Scalar(ScalarValue::Int64(result))); + } + + // At least one array arg. + // Null scalar time args → all-null result without touching the converter. + if matches!( + start_arg, + ColumnarValue::Scalar(ScalarValue::Time64Microsecond(None)) + ) || matches!( + end_arg, + ColumnarValue::Scalar(ScalarValue::Time64Microsecond(None)) + ) { + return Ok(ColumnarValue::Array(new_null_array( + &DataType::Int64, + number_rows, + ))); + } + + // Broadcast scalar time args to arrays of `number_rows`. + let starts = to_time64_array(start_arg, "start", "time_diff", number_rows)?; + let ends = to_time64_array(end_arg, "end", "time_diff", number_rows)?; + + let result: ArrayRef = match unit_arg { + // Scalar unit — resolve divisor once and apply to all rows. 
+ ColumnarValue::Scalar(ScalarValue::Utf8(Some(unit))) + | ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(unit))) => { + let divisor = match unit_divisor(unit.as_str()) { + Some(d) => d, + None => return exec_err!( + "time_diff: unsupported unit '{}'. Supported: HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND", + unit + ), + }; + let mut builder = Int64Builder::with_capacity(number_rows); + for i in 0..number_rows { + if starts.is_null(i) || ends.is_null(i) { + builder.append_null(); + } else { + // Rust integer division truncates toward zero, matching Spark behavior. + builder.append_value((ends.value(i) - starts.value(i)) / divisor); + } + } + Arc::new(builder.finish()) + } + // Null scalar unit → all-null result. + ColumnarValue::Scalar(ScalarValue::Utf8(None)) + | ColumnarValue::Scalar(ScalarValue::LargeUtf8(None)) + | ColumnarValue::Scalar(ScalarValue::Null) => { + new_null_array(&DataType::Int64, number_rows) + } + // Array unit — resolve divisor per row. + ColumnarValue::Array(unit_array) => match unit_array.data_type() { + DataType::Utf8 => { + time_diff_rows(unit_array.as_string::(), &starts, &ends, number_rows)? + } + DataType::LargeUtf8 => { + time_diff_rows(unit_array.as_string::(), &starts, &ends, number_rows)? + } + DataType::Utf8View => { + time_diff_rows(unit_array.as_string_view(), &starts, &ends, number_rows)? + } + _ => return exec_err!("time_diff: unit must be a string"), + }, + _ => return exec_err!("time_diff: unit must be a string"), + }; + + Ok(ColumnarValue::Array(result)) + } + + fn coerce_types(&self, arg_types: &[DataType]) -> Result> { + if arg_types.len() != 3 { + return exec_err!( + "Spark `time_diff` function requires 3 arguments, got {}", + arg_types.len() + ); + } + Ok(vec![ + DataType::Utf8, + DataType::Time64(TimeUnit::Microsecond), + DataType::Time64(TimeUnit::Microsecond), + ]) + } +} + +/// Per-row difference computation when unit comes from a string array column. 
+fn time_diff_rows<'a, S>( + unit_array: &'a S, + starts: &PrimitiveArray, + ends: &PrimitiveArray, + number_rows: usize, +) -> Result +where + &'a S: StringArrayType<'a>, +{ + let mut builder = Int64Builder::with_capacity(number_rows); + for ((unit_opt, start_opt), end_opt) in unit_array.iter().zip(starts.iter()).zip(ends.iter()) { + match (unit_opt, start_opt, end_opt) { + (None, _, _) | (_, None, _) | (_, _, None) => builder.append_null(), + (Some(unit), Some(start), Some(end)) => match unit_divisor(unit) { + // Rust integer division truncates toward zero, matching Spark behavior. + Some(divisor) => builder.append_value((end - start) / divisor), + None => { + return exec_err!( + "time_diff: unsupported unit '{}'. Supported: HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND", + unit + ) + } + }, + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) +} diff --git a/crates/sail-function/src/scalar/datetime/spark_time_trunc.rs b/crates/sail-function/src/scalar/datetime/spark_time_trunc.rs new file mode 100644 index 0000000000..a09ba5f81d --- /dev/null +++ b/crates/sail-function/src/scalar/datetime/spark_time_trunc.rs @@ -0,0 +1,209 @@ +use std::any::Any; +use std::sync::Arc; + +use datafusion::arrow::array::{ + new_null_array, Array, ArrayRef, AsArray, PrimitiveArray, PrimitiveBuilder, StringArrayType, +}; +use datafusion::arrow::datatypes::{DataType, Time64MicrosecondType, TimeUnit}; +use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; + +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkTimeTrunc { + signature: Signature, +} + +impl Default for SparkTimeTrunc { + fn default() -> Self { + Self::new() + } +} + +impl SparkTimeTrunc { + pub fn new() -> Self { + Self { + signature: Signature::user_defined(Volatility::Immutable), + } + } +} + +/// Returns the truncation divisor in microseconds for a given unit string. +/// Returns `None` for unrecognized units. 
+fn truncation_divisor(unit: &str) -> Option { + match unit.to_uppercase().as_str() { + "MICROSECOND" => Some(1), + "MILLISECOND" => Some(1_000), + "SECOND" => Some(1_000_000), + "MINUTE" => Some(60_000_000), + "HOUR" => Some(3_600_000_000), + _ => None, + } +} + +impl ScalarUDFImpl for SparkTimeTrunc { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "spark_time_trunc" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Time64(TimeUnit::Microsecond)) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let ScalarFunctionArgs { + args, number_rows, .. + } = args; + + let [unit_arg, time_arg] = args.as_slice() else { + return exec_err!( + "Spark `time_trunc` function requires 2 arguments, got {}", + args.len() + ); + }; + + match (unit_arg, time_arg) { + // (Scalar unit, Scalar time) — return a scalar + (ColumnarValue::Scalar(unit_sv), ColumnarValue::Scalar(time_sv)) => { + let unit_opt = match unit_sv { + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => { + Some(s.as_str()) + } + ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) | ScalarValue::Null => { + None + } + _ => return exec_err!("time_trunc: unit must be a string"), + }; + let time_opt = match time_sv { + ScalarValue::Time64Microsecond(v) => *v, + _ => return exec_err!("time_trunc: time must be TIME"), + }; + let result = match (unit_opt, time_opt) { + (Some(unit), Some(time)) => { + let divisor = match truncation_divisor(unit) { + Some(d) => d, + None => return exec_err!( + "time_trunc: unsupported unit '{}'. 
Supported: HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND", + unit + ), + }; + Some(time - (time % divisor)) + } + _ => None, + }; + Ok(ColumnarValue::Scalar(ScalarValue::Time64Microsecond( + result, + ))) + } + // (Scalar unit, Array time) — constant divisor across all rows + (ColumnarValue::Scalar(unit_sv), ColumnarValue::Array(time_array)) => { + let unit_opt = match unit_sv { + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => { + Some(s.as_str()) + } + ScalarValue::Utf8(None) | ScalarValue::LargeUtf8(None) | ScalarValue::Null => { + None + } + _ => return exec_err!("time_trunc: unit must be a string"), + }; + match unit_opt { + None => Ok(ColumnarValue::Array(new_null_array( + &DataType::Time64(TimeUnit::Microsecond), + number_rows, + ))), + Some(unit) => { + let divisor = match truncation_divisor(unit) { + Some(d) => d, + None => return exec_err!( + "time_trunc: unsupported unit '{}'. Supported: HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND", + unit + ), + }; + let times = time_array.as_primitive::(); + let result = times + .iter() + .map(|t| t.map(|v| v - (v % divisor))) + .collect::>(); + Ok(ColumnarValue::Array(Arc::new(result) as ArrayRef)) + } + } + } + // (Array unit, Scalar or Array time) — per-row unit lookup + (ColumnarValue::Array(unit_array), time_arg) => { + let times = match time_arg { + ColumnarValue::Array(a) => a.as_primitive::().to_owned(), + ColumnarValue::Scalar(ScalarValue::Time64Microsecond(Some(v))) => { + PrimitiveArray::::from_value(*v, number_rows) + } + ColumnarValue::Scalar(ScalarValue::Time64Microsecond(None)) => { + return Ok(ColumnarValue::Array(new_null_array( + &DataType::Time64(TimeUnit::Microsecond), + number_rows, + ))); + } + _ => return exec_err!("time_trunc: time must be TIME"), + }; + let result = match unit_array.data_type() { + DataType::Utf8 => { + trunc_time_rows(unit_array.as_string::(), ×, number_rows) + } + DataType::LargeUtf8 => { + trunc_time_rows(unit_array.as_string::(), ×, number_rows) + } + 
DataType::Utf8View => { + trunc_time_rows(unit_array.as_string_view(), ×, number_rows) + } + _ => exec_err!("time_trunc: unit must be a string"), + }?; + Ok(ColumnarValue::Array(result)) + } + } + } + + fn coerce_types(&self, arg_types: &[DataType]) -> Result> { + if arg_types.len() != 2 { + return exec_err!( + "Spark `time_trunc` function requires 2 arguments, got {}", + arg_types.len() + ); + } + Ok(vec![ + DataType::Utf8, + DataType::Time64(TimeUnit::Microsecond), + ]) + } +} + +/// Per-row truncation when unit comes from a string array column. +fn trunc_time_rows<'a, S>( + unit_array: &'a S, + times: &PrimitiveArray, + number_rows: usize, +) -> Result +where + &'a S: StringArrayType<'a>, +{ + let mut builder = PrimitiveBuilder::::with_capacity(number_rows); + for (unit_opt, time_opt) in unit_array.iter().zip(times.iter()) { + match (unit_opt, time_opt) { + (None, _) | (_, None) => builder.append_null(), + (Some(unit), Some(val)) => match truncation_divisor(unit) { + Some(divisor) => builder.append_value(val - (val % divisor)), + None => { + return exec_err!( + "time_trunc: unsupported unit '{}'. 
Supported: HOUR, MINUTE, SECOND, MILLISECOND, MICROSECOND", + unit + ) + } + }, + } + } + Ok(Arc::new(builder.finish()) as ArrayRef) +} diff --git a/crates/sail-function/src/scalar/datetime/timestamp_now.rs b/crates/sail-function/src/scalar/datetime/timestamp_now.rs index ee03271a21..3eb77dc21a 100644 --- a/crates/sail-function/src/scalar/datetime/timestamp_now.rs +++ b/crates/sail-function/src/scalar/datetime/timestamp_now.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use datafusion::arrow::datatypes::{DataType, TimeUnit}; use datafusion_common::{internal_err, Result, ScalarValue}; -use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; use datafusion_expr::{ ColumnarValue, Expr, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, }; @@ -57,8 +57,8 @@ impl ScalarUDFImpl for TimestampNow { internal_err!("invoke should not be called on a simplified timestamp_now() function") } - fn simplify(&self, _args: Vec, info: &dyn SimplifyInfo) -> Result { - let now = info.execution_props().query_execution_start_time; + fn simplify(&self, _args: Vec, info: &SimplifyContext) -> Result { + let now = info.query_execution_start_time().unwrap_or_default(); let now = match self.time_unit() { TimeUnit::Second => Some(now.timestamp()), TimeUnit::Millisecond => Some(now.timestamp_millis()), diff --git a/crates/sail-function/src/scalar/datetime/utils.rs b/crates/sail-function/src/scalar/datetime/utils.rs index 91d25f2238..2fd0e421b1 100644 --- a/crates/sail-function/src/scalar/datetime/utils.rs +++ b/crates/sail-function/src/scalar/datetime/utils.rs @@ -1,5 +1,5 @@ use datafusion::arrow::array::types::{ - Date32Type, Float64Type, Int32Type, Time64MicrosecondType, UInt32Type, + Date32Type, Decimal128Type, Float64Type, Int32Type, Time64MicrosecondType, UInt32Type, }; use datafusion::arrow::array::{AsArray, Float64Array, Int32Array, PrimitiveArray, UInt32Array}; use datafusion::arrow::datatypes::DataType; @@ 
-166,6 +166,27 @@ pub(crate) fn to_time64_array( } } +/// Reads a `Decimal128` column as its raw unscaled `i128` values. +pub(crate) fn to_decimal128_array( + col: &ColumnarValue, + arg_name: &str, + fn_name: &str, + number_rows: usize, +) -> Result> { + match col { + ColumnarValue::Array(array) => Ok(array.as_primitive::().to_owned()), + ColumnarValue::Scalar(ScalarValue::Decimal128(Some(value), _, _)) => { + Ok(PrimitiveArray::::from_value( + *value, + number_rows, + )) + } + other => { + exec_err!("Unsupported {arg_name} arg {other:?} for Spark function `{fn_name}`") + } + } +} + pub(crate) fn to_int32_array( col: &ColumnarValue, arg_name: &str, diff --git a/crates/sail-function/src/scalar/geo/mod.rs b/crates/sail-function/src/scalar/geo/mod.rs new file mode 100644 index 0000000000..2c213dd5d7 --- /dev/null +++ b/crates/sail-function/src/scalar/geo/mod.rs @@ -0,0 +1,4 @@ +pub mod st_asbinary; +pub mod st_geogfromwkb; +pub mod st_geomfromwkb; +pub mod wkb_reader; diff --git a/crates/sail-function/src/scalar/geo/st_asbinary.rs b/crates/sail-function/src/scalar/geo/st_asbinary.rs new file mode 100644 index 0000000000..74da56cefe --- /dev/null +++ b/crates/sail-function/src/scalar/geo/st_asbinary.rs @@ -0,0 +1,76 @@ +use std::any::Any; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, Field}; +use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_expr::{ + ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; + +/// ST_AsBinary - Convert Geometry/Geography to WKB (Binary) +/// +/// Input: Binary containing WKB (with geoarrow metadata) +/// Output: Binary (WKB without metadata) +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct StAsBinary { + signature: Signature, +} + +impl Default for StAsBinary { + fn default() -> Self { + Self::new() + } +} + +impl StAsBinary { + pub fn new() -> Self { + Self { + signature: Signature::exact(vec![DataType::Binary], Volatility::Immutable), + } + } +} + 
+impl ScalarUDFImpl for StAsBinary { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_asbinary" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Binary) + } + + fn return_field_from_args(&self, _args: ReturnFieldArgs) -> Result> { + // st_asbinary strips metadata, so return a plain Binary field + Ok(Arc::new(Field::new(self.name(), DataType::Binary, true))) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let args = args.args; + + if args.len() != 1 { + return exec_err!( + "st_asbinary requires exactly 1 argument, got {}", + args.len() + ); + } + + match &args[0] { + ColumnarValue::Scalar(ScalarValue::Binary(Some(b))) => { + Ok(ColumnarValue::Scalar(ScalarValue::Binary(Some(b.to_vec())))) + } + ColumnarValue::Scalar(ScalarValue::Binary(None)) => { + Ok(ColumnarValue::Scalar(ScalarValue::Binary(None))) + } + ColumnarValue::Array(array) => Ok(ColumnarValue::Array(array.clone())), + other => exec_err!("Unsupported argument type for st_asbinary: {:?}", other), + } + } +} diff --git a/crates/sail-function/src/scalar/geo/st_geogfromwkb.rs b/crates/sail-function/src/scalar/geo/st_geogfromwkb.rs new file mode 100644 index 0000000000..38433db7a2 --- /dev/null +++ b/crates/sail-function/src/scalar/geo/st_geogfromwkb.rs @@ -0,0 +1,103 @@ +use std::any::Any; +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, Field}; +use datafusion::common::cast::as_binary_array; +use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_expr::{ + ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; + +use super::wkb_reader::validate_geography; + +/// ST_GeoGFromWKB - Convert WKB to Geography(4326) +/// +/// Input: Binary containing WKB +/// Output: Binary with type Geography(4326) +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct 
StGeogFromWKB { + signature: Signature, +} + +impl Default for StGeogFromWKB { + fn default() -> Self { + Self::new() + } +} + +impl StGeogFromWKB { + pub fn new() -> Self { + Self { + signature: Signature::exact(vec![DataType::Binary], Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for StGeogFromWKB { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_geogfromwkb" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Binary) + } + + fn return_field_from_args(&self, _args: ReturnFieldArgs) -> Result> { + let mut metadata = HashMap::new(); + metadata.insert( + "ARROW:extension:name".to_string(), + "geoarrow.wkb".to_string(), + ); + metadata.insert( + "ARROW:extension:metadata".to_string(), + r#"{"crs":"OGC:CRS84","edges":"spherical"}"#.to_string(), + ); + Ok(Arc::new( + Field::new(self.name(), DataType::Binary, true).with_metadata(metadata), + )) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let args = args.args; + + if args.len() != 1 { + return exec_err!( + "st_geogfromwkb requires exactly 1 argument, got {}", + args.len() + ); + } + + match &args[0] { + ColumnarValue::Scalar(ScalarValue::Binary(Some(b))) => { + if let Err(e) = validate_geography(b) { + return exec_err!("Invalid WKB: {}", e); + } + Ok(ColumnarValue::Scalar(ScalarValue::Binary(Some(b.to_vec())))) + } + ColumnarValue::Scalar(ScalarValue::Binary(None)) => { + Ok(ColumnarValue::Scalar(ScalarValue::Binary(None))) + } + ColumnarValue::Array(array) => { + let binary_array = as_binary_array(array)?; + for (i, opt) in binary_array.iter().enumerate() { + if let Some(b) = opt { + if let Err(e) = validate_geography(b) { + return exec_err!("Invalid WKB at index {}: {}", i, e); + } + } + } + Ok(ColumnarValue::Array(array.clone())) + } + other => exec_err!("Unsupported argument type for st_geogfromwkb: {:?}", other), + } + } +} diff --git 
a/crates/sail-function/src/scalar/geo/st_geomfromwkb.rs b/crates/sail-function/src/scalar/geo/st_geomfromwkb.rs new file mode 100644 index 0000000000..308a6be5b1 --- /dev/null +++ b/crates/sail-function/src/scalar/geo/st_geomfromwkb.rs @@ -0,0 +1,103 @@ +use std::any::Any; +use std::collections::HashMap; +use std::sync::Arc; + +use datafusion::arrow::datatypes::{DataType, Field}; +use datafusion::common::cast::as_binary_array; +use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_expr::{ + ColumnarValue, ReturnFieldArgs, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, +}; + +use super::wkb_reader::validate_geometry; + +/// ST_GeomFromWKB - Convert WKB to Geometry(0) +/// +/// Input: Binary containing WKB +/// Output: Binary with type Geometry(0) +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct StGeomFromWKB { + signature: Signature, +} + +impl Default for StGeomFromWKB { + fn default() -> Self { + Self::new() + } +} + +impl StGeomFromWKB { + pub fn new() -> Self { + Self { + signature: Signature::exact(vec![DataType::Binary], Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for StGeomFromWKB { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_geomfromwkb" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> Result { + Ok(DataType::Binary) + } + + fn return_field_from_args(&self, _args: ReturnFieldArgs) -> Result> { + let mut metadata = HashMap::new(); + metadata.insert( + "ARROW:extension:name".to_string(), + "geoarrow.wkb".to_string(), + ); + metadata.insert( + "ARROW:extension:metadata".to_string(), + r#"{"crs":"SRID:0"}"#.to_string(), + ); + Ok(Arc::new( + Field::new(self.name(), DataType::Binary, true).with_metadata(metadata), + )) + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + let args = args.args; + + if args.len() != 1 { + return exec_err!( + "st_geomfromwkb requires exactly 1 argument, got {}", + 
args.len() + ); + } + + match &args[0] { + ColumnarValue::Scalar(ScalarValue::Binary(Some(b))) => { + if let Err(e) = validate_geometry(b) { + return exec_err!("Invalid WKB: {}", e); + } + Ok(ColumnarValue::Scalar(ScalarValue::Binary(Some(b.to_vec())))) + } + ColumnarValue::Scalar(ScalarValue::Binary(None)) => { + Ok(ColumnarValue::Scalar(ScalarValue::Binary(None))) + } + ColumnarValue::Array(array) => { + let binary_array = as_binary_array(array)?; + for (i, opt) in binary_array.iter().enumerate() { + if let Some(b) = opt { + if let Err(e) = validate_geometry(b) { + return exec_err!("Invalid WKB at index {}: {}", i, e); + } + } + } + Ok(ColumnarValue::Array(array.clone())) + } + other => exec_err!("Unsupported argument type for st_geomfromwkb: {:?}", other), + } + } +} diff --git a/crates/sail-function/src/scalar/geo/wkb_reader.rs b/crates/sail-function/src/scalar/geo/wkb_reader.rs new file mode 100644 index 0000000000..7accf36f9c --- /dev/null +++ b/crates/sail-function/src/scalar/geo/wkb_reader.rs @@ -0,0 +1,1137 @@ +const BYTE_SIZE: usize = 1; +const INT_SIZE: usize = 4; +const DOUBLE_SIZE: usize = 8; + +const BIG_ENDIAN: u8 = 0; +const LITTLE_ENDIAN: u8 = 1; + +const DIM_OFFSET_2D: i32 = 0; +const DIM_OFFSET_Z: i32 = 1000; +const DIM_OFFSET_M: i32 = 2000; +const DIM_OFFSET_ZM: i32 = 3000; + +const MIN_LONGITUDE: f64 = -180.0; +const MAX_LONGITUDE: f64 = 180.0; +const MIN_LATITUDE: f64 = -90.0; +const MAX_LATITUDE: f64 = 90.0; + +#[derive(Debug, Clone, PartialEq, thiserror::Error)] +pub enum WkbError { + #[error("WKB data is empty or null")] + EmptyInput, + #[error("WKB data too short")] + InputTooShort, + #[error("Invalid byte order: {0}")] + InvalidByteOrder(u8), + #[error("Invalid or unsupported type: {0}")] + InvalidType(i32), + #[error("Unsupported dimension: {0}")] + UnsupportedDimension(i32), + #[error("Unexpected end of WKB buffer: expected {expected} bytes at position {position}, but only {remaining} remaining")] + UnexpectedEndOfBuffer { + expected: 
usize, + remaining: usize, + position: usize, + }, + #[error("Non-finite coordinate value ({value}) found at position {position}")] + NonFiniteCoordinate { value: f64, position: usize }, + #[error("Invalid coordinate value at position {position}: {message}")] + InvalidCoordinateValue { message: String, position: usize }, + #[error("Too few points in linestring at position {position}: expected at least {expected_min}, got {actual}")] + TooFewPointsInLineString { + expected_min: i32, + actual: i32, + position: usize, + }, + #[error("Ring is not closed at position {position}")] + RingNotClosed { position: usize }, + #[error("Too few points in ring at position {position}: expected at least {expected_min}, got {actual}")] + TooFewPointsInRing { + expected_min: i32, + actual: i32, + position: usize, + }, + #[error("Expected Point in MultiPoint at position {position}")] + ExpectedPointInMultiPoint { position: usize }, + #[error("Expected LineString in MultiLineString at position {position}")] + ExpectedLineStringInMultiLineString { position: usize }, + #[error("Expected Polygon in MultiPolygon at position {position}")] + ExpectedPolygonInMultiPolygon { position: usize }, + #[error("Dimension mismatch at position {position}: expected Z={expected_has_z}, M={expected_has_m}, got Z={actual_has_z}, M={actual_has_m}")] + DimensionMismatch { + expected_has_z: bool, + actual_has_z: bool, + expected_has_m: bool, + actual_has_m: bool, + position: usize, + }, + #[error("Invalid or unsupported geometry type: {type} at position {position}")] + InvalidGeometryType { r#type: i32, position: usize }, + #[error("Geography bounds violation at position {position}: {message}")] + GeographyBoundsViolation { message: String, position: usize }, +} + +pub struct WkbReader { + is_geography: bool, + buffer: Vec, + position: usize, + byte_order: u8, + expected_has_z: bool, + expected_has_m: bool, +} + +impl WkbReader { + pub fn new(is_geography: bool) -> Self { + Self { + is_geography, + buffer: 
Vec::new(), + position: 0, + byte_order: BIG_ENDIAN, + expected_has_z: false, + expected_has_m: false, + } + } + + pub fn validate(&mut self, wkb: &[u8]) -> Result<(), WkbError> { + if wkb.is_empty() { + return Err(WkbError::EmptyInput); + } + + if wkb.len() < BYTE_SIZE + INT_SIZE { + return Err(WkbError::InputTooShort); + } + + self.buffer = wkb.to_vec(); + self.position = 0; + + self.read_byte_order()?; + self.read_geometry(0) + } + + fn read_byte_order(&mut self) -> Result<(), WkbError> { + let byte_order = self.read_byte()?; + if byte_order != BIG_ENDIAN && byte_order != LITTLE_ENDIAN { + return Err(WkbError::InvalidByteOrder(byte_order)); + } + self.byte_order = byte_order; + Ok(()) + } + + fn read_byte(&mut self) -> Result { + if self.position >= self.buffer.len() { + return Err(WkbError::UnexpectedEndOfBuffer { + expected: 1, + remaining: 0, + position: self.position, + }); + } + let b = self.buffer[self.position]; + self.position += 1; + Ok(b) + } + + fn read_int(&mut self) -> Result { + let remaining = self.buffer.len() - self.position; + if remaining < INT_SIZE { + return Err(WkbError::UnexpectedEndOfBuffer { + expected: INT_SIZE, + remaining, + position: self.position, + }); + } + let bytes = &self.buffer[self.position..self.position + INT_SIZE]; + self.position += INT_SIZE; + let val = if self.byte_order == LITTLE_ENDIAN { + i32::from_le_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) + } else { + i32::from_be_bytes([bytes[0], bytes[1], bytes[2], bytes[3]]) + }; + Ok(val) + } + + fn read_double(&mut self) -> Result { + let remaining = self.buffer.len() - self.position; + if remaining < DOUBLE_SIZE { + return Err(WkbError::UnexpectedEndOfBuffer { + expected: DOUBLE_SIZE, + remaining, + position: self.position, + }); + } + let bytes = &self.buffer[self.position..self.position + DOUBLE_SIZE]; + self.position += DOUBLE_SIZE; + let val = if self.byte_order == LITTLE_ENDIAN { + f64::from_le_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], 
bytes[6], bytes[7], + ]) + } else { + f64::from_be_bytes([ + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], + ]) + }; + Ok(val) + } + + fn get_base_type(&self, wkb_type: i32) -> i32 { + let base = wkb_type; + for offset in [DIM_OFFSET_ZM, DIM_OFFSET_M, DIM_OFFSET_Z] { + if base > offset && base <= offset + 7 { + return base - offset; + } + } + base + } + + fn get_dimension_count(&self, wkb_type: i32) -> usize { + let base = wkb_type; + for (offset, dims) in [(DIM_OFFSET_ZM, 4), (DIM_OFFSET_M, 3), (DIM_OFFSET_Z, 3)] { + if base > offset && base <= offset + 7 { + return dims; + } + } + 2 + } + + fn has_z(&self, wkb_type: i32) -> bool { + let base = self.get_base_type(wkb_type); + let offset = wkb_type - base; + offset == DIM_OFFSET_Z || offset == DIM_OFFSET_ZM + } + + fn has_m(&self, wkb_type: i32) -> bool { + let base = self.get_base_type(wkb_type); + let offset = wkb_type - base; + offset == DIM_OFFSET_M || offset == DIM_OFFSET_ZM + } + + fn is_valid_wkb_type(&self, wkb_type: i32) -> bool { + let base = self.get_base_type(wkb_type); + if !(1..=7).contains(&base) { + return false; + } + let offset = wkb_type - base; + offset == DIM_OFFSET_2D + || offset == DIM_OFFSET_Z + || offset == DIM_OFFSET_M + || offset == DIM_OFFSET_ZM + } + + fn read_geometry(&mut self, depth: usize) -> Result<(), WkbError> { + if self.buffer.len() - self.position < INT_SIZE { + return Err(WkbError::InputTooShort); + } + + let type_start_pos = self.position; + let type_and_dim = self.read_int()?; + + if !self.is_valid_wkb_type(type_and_dim) { + return Err(WkbError::InvalidGeometryType { + r#type: type_and_dim, + position: type_start_pos, + }); + } + + let geo_type = self.get_base_type(type_and_dim); + let dimension_count = self.get_dimension_count(type_and_dim); + let has_z = self.has_z(type_and_dim); + let has_m = self.has_m(type_and_dim); + + if depth > 0 && (has_z != self.expected_has_z || has_m != self.expected_has_m) { + return 
Err(WkbError::DimensionMismatch { + expected_has_z: self.expected_has_z, + actual_has_z: has_z, + expected_has_m: self.expected_has_m, + actual_has_m: has_m, + position: type_start_pos, + }); + } + + if depth == 0 { + self.expected_has_z = has_z; + self.expected_has_m = has_m; + } + + match geo_type { + 1 => self.read_point(dimension_count, has_z, has_m, true), + 2 => self.read_linestring(dimension_count, has_z, has_m), + 3 => self.read_polygon(dimension_count, has_z, has_m), + 4 => self.read_multipoint(has_z, has_m), + 5 => self.read_multilinestring(has_z, has_m), + 6 => self.read_multipolygon(has_z, has_m), + 7 => self.read_geometry_collection(has_z, has_m), + _ => Err(WkbError::InvalidType(geo_type)), + } + } + + fn read_point( + &mut self, + dimension_count: usize, + _has_z: bool, + _has_m: bool, + allow_empty: bool, + ) -> Result<(), WkbError> { + let coords_start_pos = self.position; + let mut coords = Vec::with_capacity(dimension_count); + + for _ in 0..dimension_count { + let value = self.read_double()?; + if !allow_empty && !value.is_finite() { + return Err(WkbError::NonFiniteCoordinate { + value, + position: coords_start_pos, + }); + } + if value.is_finite() { + coords.push(value); + } + } + + if self.is_geography && coords.len() >= 2 { + let lon = coords[0]; + let lat = coords[1]; + + if !(MIN_LONGITUDE..=MAX_LONGITUDE).contains(&lon) { + return Err(WkbError::GeographyBoundsViolation { + message: format!("longitude {} is out of range [-180, 180]", lon), + position: coords_start_pos, + }); + } + + if !(MIN_LATITUDE..=MAX_LATITUDE).contains(&lat) { + return Err(WkbError::GeographyBoundsViolation { + message: format!("latitude {} is out of range [-90, 90]", lat), + position: coords_start_pos, + }); + } + } + + Ok(()) + } + + fn read_linestring( + &mut self, + dimension_count: usize, + has_z: bool, + has_m: bool, + ) -> Result<(), WkbError> { + let num_points_pos = self.position; + let num_points = self.read_int()?; + + if num_points < 2 { + return 
Err(WkbError::TooFewPointsInLineString { + expected_min: 2, + actual: num_points, + position: num_points_pos, + }); + } + + for _ in 0..num_points { + self.read_internal_point(dimension_count, has_z, has_m)?; + } + + Ok(()) + } + + fn read_internal_point( + &mut self, + dimension_count: usize, + _has_z: bool, + _has_m: bool, + ) -> Result<(), WkbError> { + let coords_start_pos = self.position; + + let mut coords = Vec::with_capacity(dimension_count); + for _ in 0..dimension_count { + let value = self.read_double()?; + if !value.is_finite() { + return Err(WkbError::NonFiniteCoordinate { + value, + position: coords_start_pos, + }); + } + coords.push(value); + } + + if self.is_geography && dimension_count >= 2 { + let lon = coords[0]; + let lat = coords[1]; + + if !(MIN_LONGITUDE..=MAX_LONGITUDE).contains(&lon) { + return Err(WkbError::GeographyBoundsViolation { + message: format!("longitude {} is out of range [-180, 180]", lon), + position: coords_start_pos, + }); + } + + if !(MIN_LATITUDE..=MAX_LATITUDE).contains(&lat) { + return Err(WkbError::GeographyBoundsViolation { + message: format!("latitude {} is out of range [-90, 90]", lat), + position: coords_start_pos, + }); + } + } + + Ok(()) + } + + fn read_polygon( + &mut self, + dimension_count: usize, + has_z: bool, + has_m: bool, + ) -> Result<(), WkbError> { + let num_rings = self.read_int()?; + if num_rings < 0 { + return Err(WkbError::UnexpectedEndOfBuffer { + expected: 4, + remaining: 0, + position: self.position - 4, + }); + } + + for _ in 0..num_rings { + self.read_ring(dimension_count, has_z, has_m)?; + } + + Ok(()) + } + + fn read_ring( + &mut self, + dimension_count: usize, + _has_z: bool, + _has_m: bool, + ) -> Result<(), WkbError> { + let num_points_pos = self.position; + let num_points = self.read_int()?; + + if num_points < 4 { + return Err(WkbError::TooFewPointsInRing { + expected_min: 4, + actual: num_points, + position: num_points_pos, + }); + } + + let mut first_coords: Option> = None; + let mut 
last_coords: Option> = None; + + for i in 0..num_points { + let coords_start_pos = self.position; + let mut coords = Vec::with_capacity(dimension_count); + + for _ in 0..dimension_count { + let value = self.read_double()?; + if !value.is_finite() { + return Err(WkbError::NonFiniteCoordinate { + value, + position: coords_start_pos, + }); + } + coords.push(value); + } + + if self.is_geography && dimension_count >= 2 { + let lon = coords[0]; + let lat = coords[1]; + if !(MIN_LONGITUDE..=MAX_LONGITUDE).contains(&lon) + || !(MIN_LATITUDE..=MAX_LATITUDE).contains(&lat) + { + return Err(WkbError::GeographyBoundsViolation { + message: format!( + "coordinate ({}, {}) is out of geography bounds [-180, 180] x [-90, 90]", + lon, lat + ), + position: coords_start_pos, + }); + } + } + + if i == 0 { + first_coords = Some(coords); + } else if i == num_points - 1 { + last_coords = Some(coords); + } + } + + if let (Some(first), Some(last)) = (first_coords, last_coords) { + if first != last { + return Err(WkbError::RingNotClosed { + position: num_points_pos, + }); + } + } + + Ok(()) + } + + fn read_multipoint( + &mut self, + expected_has_z: bool, + expected_has_m: bool, + ) -> Result<(), WkbError> { + let num_geometries = self.read_int()?; + if num_geometries < 0 { + return Err(WkbError::UnexpectedEndOfBuffer { + expected: 4, + remaining: 0, + position: self.position - 4, + }); + } + + for _ in 0..num_geometries { + let pos = self.position; + self.read_nested_geometry(expected_has_z, expected_has_m, |geo_type| { + if geo_type != 1 { + Err(WkbError::ExpectedPointInMultiPoint { position: pos }) + } else { + Ok(()) + } + })?; + } + + Ok(()) + } + + fn read_multilinestring( + &mut self, + expected_has_z: bool, + expected_has_m: bool, + ) -> Result<(), WkbError> { + let num_geometries = self.read_int()?; + if num_geometries < 0 { + return Err(WkbError::UnexpectedEndOfBuffer { + expected: 4, + remaining: 0, + position: self.position - 4, + }); + } + + for _ in 0..num_geometries { + let pos 
= self.position; + self.read_nested_geometry(expected_has_z, expected_has_m, |geo_type| { + if geo_type != 2 { + Err(WkbError::ExpectedLineStringInMultiLineString { position: pos }) + } else { + Ok(()) + } + })?; + } + + Ok(()) + } + + fn read_multipolygon( + &mut self, + expected_has_z: bool, + expected_has_m: bool, + ) -> Result<(), WkbError> { + let num_geometries = self.read_int()?; + if num_geometries < 0 { + return Err(WkbError::UnexpectedEndOfBuffer { + expected: 4, + remaining: 0, + position: self.position - 4, + }); + } + + for _ in 0..num_geometries { + let pos = self.position; + self.read_nested_geometry(expected_has_z, expected_has_m, |geo_type| { + if geo_type != 3 { + Err(WkbError::ExpectedPolygonInMultiPolygon { position: pos }) + } else { + Ok(()) + } + })?; + } + + Ok(()) + } + + fn read_geometry_collection( + &mut self, + expected_has_z: bool, + expected_has_m: bool, + ) -> Result<(), WkbError> { + let num_geometries = self.read_int()?; + if num_geometries < 0 { + return Err(WkbError::UnexpectedEndOfBuffer { + expected: 4, + remaining: 0, + position: self.position - 4, + }); + } + + for _ in 0..num_geometries { + self.read_nested_geometry(expected_has_z, expected_has_m, |_| Ok(()))?; + } + + Ok(()) + } + + fn read_nested_geometry( + &mut self, + expected_has_z: bool, + expected_has_m: bool, + type_check: F, + ) -> Result<(), WkbError> + where + F: FnOnce(i32) -> Result<(), WkbError>, + { + let byte_order = self.read_byte()?; + if byte_order != BIG_ENDIAN && byte_order != LITTLE_ENDIAN { + return Err(WkbError::InvalidByteOrder(byte_order)); + } + let saved_byte_order = self.byte_order; + self.byte_order = byte_order; + + let type_start_pos = self.position; + let type_and_dim = self.read_int()?; + + if !self.is_valid_wkb_type(type_and_dim) { + self.byte_order = saved_byte_order; + return Err(WkbError::InvalidGeometryType { + r#type: type_and_dim, + position: type_start_pos, + }); + } + + let geo_type = self.get_base_type(type_and_dim); + if let 
Err(e) = type_check(geo_type) { + self.byte_order = saved_byte_order; + return Err(e); + } + + let has_z = self.has_z(type_and_dim); + let has_m = self.has_m(type_and_dim); + + if has_z != expected_has_z || has_m != expected_has_m { + let err = self.position; + self.byte_order = saved_byte_order; + return Err(WkbError::DimensionMismatch { + expected_has_z, + actual_has_z: has_z, + expected_has_m, + actual_has_m: has_m, + position: err, + }); + } + + let dimension_count = self.get_dimension_count(type_and_dim); + + match geo_type { + 1 => self.read_point(dimension_count, has_z, has_m, false)?, + 2 => self.read_linestring(dimension_count, has_z, has_m)?, + 3 => self.read_polygon(dimension_count, has_z, has_m)?, + 4 => self.read_multipoint(has_z, has_m)?, + 5 => self.read_multilinestring(has_z, has_m)?, + 6 => self.read_multipolygon(has_z, has_m)?, + 7 => self.read_geometry_collection(has_z, has_m)?, + _ => { + self.byte_order = saved_byte_order; + return Err(WkbError::InvalidType(geo_type)); + } + } + + self.byte_order = saved_byte_order; + Ok(()) + } +} + +pub fn validate_geometry(wkb: &[u8]) -> Result<(), WkbError> { + let mut reader = WkbReader::new(false); + reader.validate(wkb) +} + +pub fn validate_geography(wkb: &[u8]) -> Result<(), WkbError> { + let mut reader = WkbReader::new(true); + reader.validate(wkb) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_valid_point() { + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x00, 0x01, // type (Point) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn test_valid_linestring() { + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x00, 0x02, // type (LineString) + 0x00, 0x00, 0x00, 0x02, // num points = 2 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x1 = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x40, // y1 = 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // x2 = 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y2 = 2.0 + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn test_too_few_points_linestring() { + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x00, 0x02, // type (LineString) + 0x00, 0x00, 0x00, 0x01, // num points = 1 (invalid!) + ]; + let result = validate_geometry(&wkb); + assert!(matches!( + result, + Err(WkbError::TooFewPointsInLineString { .. }) + )); + } + + #[test] + fn test_empty_input() { + let wkb: Vec = vec![]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::EmptyInput))); + } + + #[test] + fn test_invalid_byte_order() { + let wkb: Vec = vec![ + 0x02, // invalid byte order + 0x00, 0x00, 0x00, 0x01, + ]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::InvalidByteOrder(2)))); + } + + #[test] + fn test_invalid_type() { + let wkb: Vec = vec![ + 0x00, // byte order (big endian) + 0x00, 0x00, 0x00, 0x99, // invalid type + ]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::InvalidGeometryType { .. }))); + } + + #[test] + fn test_input_too_short() { + // Not enough bytes for header + let wkb: Vec = vec![0x00, 0x00, 0x00]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::InputTooShort))); + } + + #[test] + fn test_nested_dimension_mismatch() { + // MultiPoint containing a PointZ when parent expects 2D + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x04, // type (MultiPoint) - 2D + 0x00, 0x00, 0x00, 0x01, // num points = 1 + // Point Z (nested) - 3D! 
+ 0x00, // byte order + 0x00, 0x00, 0x03, 0xe9, // type (PointZ = 1001) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // z + ]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::DimensionMismatch { .. }))); + } + + #[test] + fn test_valid_multilinestring() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x05, // type (MultiLineString) + 0x00, 0x00, 0x00, 0x01, // num line strings = 1 + // LineString (nested) + 0x00, // byte order + 0x00, 0x00, 0x00, 0x02, // type (LineString) + 0x00, 0x00, 0x00, 0x02, // num points = 2 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // (0, 0) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // y + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // (1, 0) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // y + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn test_valid_multipolygon() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x06, // type (MultiPolygon) + 0x00, 0x00, 0x00, 0x01, // num polygons = 1 + // Polygon (nested) + 0x00, // byte order + 0x00, 0x00, 0x00, 0x03, // type (Polygon) + 0x00, 0x00, 0x00, 0x01, // num rings = 1 + 0x00, 0x00, 0x00, 0x04, // num points = 4 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // (0, 0) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xf0, 0x3f, // (1, 0) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xf0, 0x3f, // (1, 1) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, // (0, 0) - closed + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn test_valid_polygon() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x03, // type (Polygon) + 0x00, 0x00, 0x00, 0x01, // num 
rings = 1 + 0x00, 0x00, 0x00, 0x04, // num points = 4 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // (0, 0) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xf0, 0x3f, // (1, 0) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xf0, 0x3f, // (1, 1) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, // (0, 0) - closed + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn test_too_few_points_in_ring() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x03, // type (Polygon) + 0x00, 0x00, 0x00, 0x01, // num rings = 1 + 0x00, 0x00, 0x00, 0x03, // num points = 3 (invalid - needs 4) + ]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::TooFewPointsInRing { .. }))); + } + + #[test] + fn test_ring_not_closed() { + // Ring with 4 points where last != first + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x03, // type (Polygon) + 0x00, 0x00, 0x00, 0x01, // num rings = 1 + 0x00, 0x00, 0x00, 0x04, // num points = 4 + // Point 1: (0, 0) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, // Point 2: (1, 0) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, // Point 3: (1, 1) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xf0, 0x3f, // Point 4: (0, 1) - NOT equal to point 1 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xf0, 0x3f, + ]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::RingNotClosed { .. 
}))); + } + + #[test] + fn test_valid_geometry_collection() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x07, // type (GeometryCollection) + 0x00, 0x00, 0x00, 0x01, // num geometries = 1 + // Point (nested) + 0x00, // byte order + 0x00, 0x00, 0x00, 0x01, // type (Point) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn test_empty_geometry_collection() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x07, // type (GeometryCollection) + 0x00, 0x00, 0x00, 0x00, // num geometries = 0 + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn test_point_with_z() { + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x03, 0xe9, // type (PointZ = 1001) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // z = 3.0 + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn test_point_with_m() { + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x07, 0xd1, // type (PointM = 2001) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // m = 3.0 + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn test_point_with_zm() { + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x0b, 0xb9, // type (PointZM = 3001) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // z = 3.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x40, // m = 4.0 + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn 
test_geography_longitude_out_of_range() { + // 200.0 in big endian = 0x4079000000000000 -> bytes: 40 79 00 00 00 00 00 00 + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x00, 0x01, // type (Point) + 0x40, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // x = 200.0 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x40, // y = 2.0 + ]; + let result = validate_geography(&wkb); + assert!(matches!( + result, + Err(WkbError::GeographyBoundsViolation { .. }) + )); + } + + #[test] + fn test_geography_latitude_out_of_range() { + // 100.0 in big endian = 0x4059000000000000 -> bytes: 40 59 00 00 00 00 00 00 + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x00, 0x01, // type (Point) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, // x = 1.0 + 0x40, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // y = 100.0 + ]; + let result = validate_geography(&wkb); + assert!(matches!( + result, + Err(WkbError::GeographyBoundsViolation { .. }) + )); + } + + #[test] + fn test_geography_valid_bounds() { + // 120.0 = 0x405e000000000000, 45.0 = 0x4046800000000000 + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x00, 0x01, // type (Point) + 0x40, 0x5e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // x = 120.0 + 0x40, 0x46, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, // y = 45.0 + ]; + assert!(validate_geography(&wkb).is_ok()); + } + + #[test] + fn test_negative_count_multipoint() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x04, // type (MultiPoint) + 0xff, 0xff, 0xff, 0xff, // num points = -1 (invalid!) + ]; + let result = validate_geometry(&wkb); + assert!(matches!( + result, + Err(WkbError::UnexpectedEndOfBuffer { .. }) + )); + } + + #[test] + fn test_negative_count_multilinestring() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x05, // type (MultiLineString) + 0xff, 0xff, 0xff, 0xff, // num line strings = -1 (invalid!) 
+ ]; + let result = validate_geometry(&wkb); + assert!(matches!( + result, + Err(WkbError::UnexpectedEndOfBuffer { .. }) + )); + } + + #[test] + fn test_negative_count_multipolygon() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x06, // type (MultiPolygon) + 0xff, 0xff, 0xff, 0xff, // num polygons = -1 (invalid!) + ]; + let result = validate_geometry(&wkb); + assert!(matches!( + result, + Err(WkbError::UnexpectedEndOfBuffer { .. }) + )); + } + + #[test] + fn test_negative_count_geometry_collection() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x07, // type (GeometryCollection) + 0xff, 0xff, 0xff, 0xff, // num geometries = -1 (invalid!) + ]; + let result = validate_geometry(&wkb); + assert!(matches!( + result, + Err(WkbError::UnexpectedEndOfBuffer { .. }) + )); + } + + #[test] + fn test_nested_invalid_type_in_multipoint() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x04, // type (MultiPoint) + 0x00, 0x00, 0x00, 0x01, // num points = 1 + 0x00, // byte order + 0x00, 0x00, 0x00, 0x02, // type (LineString) - wrong type! + 0x00, 0x00, 0x00, 0x02, // num points = 2 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, + ]; + let result = validate_geometry(&wkb); + assert!(matches!( + result, + Err(WkbError::ExpectedPointInMultiPoint { .. }) + )); + } + + #[test] + fn test_nested_invalid_type_in_multilinestring() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x05, // type (MultiLineString) + 0x00, 0x00, 0x00, 0x01, // num line strings = 1 + 0x00, // byte order + 0x00, 0x00, 0x00, 0x01, // type (Point) - wrong type! + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x40, + ]; + let result = validate_geometry(&wkb); + assert!(matches!( + result, + Err(WkbError::ExpectedLineStringInMultiLineString { .. 
}) + )); + } + + #[test] + fn test_nested_invalid_type_in_multipolygon() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x06, // type (MultiPolygon) + 0x00, 0x00, 0x00, 0x01, // num polygons = 1 + 0x00, // byte order + 0x00, 0x00, 0x00, 0x01, // type (Point) - wrong type! + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x40, + ]; + let result = validate_geometry(&wkb); + assert!(matches!( + result, + Err(WkbError::ExpectedPolygonInMultiPolygon { .. }) + )); + } + + #[test] + fn test_nested_invalid_type_in_geometry_collection() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x07, // type (GeometryCollection) + 0x00, 0x00, 0x00, 0x01, // num geometries = 1 + 0x00, // byte order + 0x00, 0x00, 0x00, 0x63, // type = 99 (invalid) + ]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::InvalidGeometryType { .. }))); + } + + #[test] + fn test_nested_invalid_byte_order() { + let wkb: Vec = vec![ + 0x00, // big endian outer + 0x00, 0x00, 0x00, 0x07, // type (GeometryCollection) + 0x00, 0x00, 0x00, 0x01, // num geometries = 1 + 0x02, // byte order = 2 (invalid!) 
+ ]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::InvalidByteOrder(2)))); + } + + #[test] + fn test_polygon_with_hole() { + let wkb: Vec = vec![ + 0x00, // big endian + 0x00, 0x00, 0x00, 0x03, // type (Polygon) + 0x00, 0x00, 0x00, 0x02, // num rings = 2 (exterior + 1 hole) + // Exterior ring + 0x00, 0x00, 0x00, 0x05, // num points = 5 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // (0, 0) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x10, 0x40, // (4, 0) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x10, 0x40, // (4, 4) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, // (0, 4) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, // (0, 0) - closed + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, // Hole ring + 0x00, 0x00, 0x00, 0x05, // num points = 5 + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0x40, // (1, 1) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xe8, 0x40, // (3, 1) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xe8, 0x40, // (3, 3) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xe8, 0x40, // (1, 3) + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0xe8, 0x40, // (1, 1) - closed + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xe8, 0x40, + ]; + assert!(validate_geometry(&wkb).is_ok()); + } + + #[test] + fn test_invalid_type_zero() { + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x00, 0x00, // type = 0 (invalid!) + ]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::InvalidGeometryType { .. 
}))); + } + + #[test] + fn test_type_8_and_above_invalid() { + let wkb: Vec = vec![ + 0x00, // byte order (big endian = 0) + 0x00, 0x00, 0x00, 0x08, // type = 8 (invalid!) + ]; + let result = validate_geometry(&wkb); + assert!(matches!(result, Err(WkbError::InvalidGeometryType { .. }))); + } +} diff --git a/crates/sail-function/src/scalar/hash/spark_murmur3_hash.rs b/crates/sail-function/src/scalar/hash/spark_murmur3_hash.rs index 9005fa3098..c4cc379273 100644 --- a/crates/sail-function/src/scalar/hash/spark_murmur3_hash.rs +++ b/crates/sail-function/src/scalar/hash/spark_murmur3_hash.rs @@ -51,16 +51,10 @@ impl ScalarUDFImpl for SparkMurmur3Hash { let length = args.len(); if length < 1 { return Err(DataFusionError::Internal( - "spark_hash requires at least one argument".to_string(), + "spark_murmur3_hash (hash) requires at least one argument".to_string(), )); } - let seed = &args[length - 1]; - match seed { - ColumnarValue::Scalar(ScalarValue::Int32(_)) => {} - _ => { - args.push(ColumnarValue::Scalar(ScalarValue::Int32(Some(42)))); - } - } + args.push(ColumnarValue::Scalar(ScalarValue::Int32(Some(42)))); spark_murmur3_hash(&args[..]) } } diff --git a/crates/sail-function/src/scalar/hash/spark_xxhash64.rs b/crates/sail-function/src/scalar/hash/spark_xxhash64.rs index d990d96234..5679128812 100644 --- a/crates/sail-function/src/scalar/hash/spark_xxhash64.rs +++ b/crates/sail-function/src/scalar/hash/spark_xxhash64.rs @@ -54,17 +54,7 @@ impl ScalarUDFImpl for SparkXxhash64 { "spark_xxhash64 requires at least one argument".to_string(), )); } - let seed = &args[length - 1]; - match seed { - ColumnarValue::Scalar(ScalarValue::Int32(Some(seed))) => { - let new_scalar = ScalarValue::Int64(Some(*seed as i64)); - args[length - 1] = ColumnarValue::Scalar(new_scalar); - } - ColumnarValue::Scalar(ScalarValue::Int64(_)) => {} - _ => { - args.push(ColumnarValue::Scalar(ScalarValue::Int64(Some(42)))); - } - } + args.push(ColumnarValue::Scalar(ScalarValue::Int64(Some(42)))); 
spark_xxhash64(&args[..]) } } diff --git a/crates/sail-function/src/scalar/json/to_json.rs b/crates/sail-function/src/scalar/json/to_json.rs index 356422bd86..9a9ad03091 100644 --- a/crates/sail-function/src/scalar/json/to_json.rs +++ b/crates/sail-function/src/scalar/json/to_json.rs @@ -459,6 +459,16 @@ fn decimal_to_json_number(value: i128, scale: i8) -> Value { } fn number_from_f64(value: f64) -> Value { + if value.is_nan() { + return Value::String("NaN".to_string()); + } + if value.is_infinite() { + return if value.is_sign_positive() { + Value::String("Infinity".to_string()) + } else { + Value::String("-Infinity".to_string()) + }; + } serde_json::Number::from_f64(value) .map(Value::Number) .unwrap_or_else(|| Value::String(value.to_string())) diff --git a/crates/sail-function/src/scalar/math/random.rs b/crates/sail-function/src/scalar/math/random.rs index 0876c4b66e..f69734c086 100644 --- a/crates/sail-function/src/scalar/math/random.rs +++ b/crates/sail-function/src/scalar/math/random.rs @@ -5,7 +5,7 @@ use datafusion::arrow::array::Float64Array; use datafusion::arrow::datatypes::DataType; use datafusion_common::{exec_err, Result, ScalarValue}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; -use rand::{rng, Rng}; +use rand::{rng, RngExt}; use super::xorshift::SparkXorShiftRandom; use crate::error::{invalid_arg_count_exec_err, unsupported_data_types_exec_err}; diff --git a/crates/sail-function/src/scalar/math/spark_abs.rs b/crates/sail-function/src/scalar/math/spark_abs.rs index d5c89f643a..0554f6cc6b 100644 --- a/crates/sail-function/src/scalar/math/spark_abs.rs +++ b/crates/sail-function/src/scalar/math/spark_abs.rs @@ -13,7 +13,7 @@ use datafusion::arrow::datatypes::{ use datafusion::functions::math::expr_fn::abs; use datafusion_common::{exec_err, internal_err, Result, ScalarValue}; use datafusion_expr::interval_arithmetic::Interval; -use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; +use 
datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_expr::{ ColumnarValue, Expr, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility, @@ -167,7 +167,7 @@ impl ScalarUDFImpl for SparkAbs { } } - fn simplify(&self, args: Vec, info: &dyn SimplifyInfo) -> Result { + fn simplify(&self, args: Vec, info: &SimplifyContext) -> Result { match info.get_data_type(&args[0])? { DataType::Interval(_) | DataType::Duration(_) => Ok(ExprSimplifyResult::Original(args)), _ => Ok(ExprSimplifyResult::Simplified(abs(args.one()?))), diff --git a/crates/sail-function/src/scalar/mod.rs b/crates/sail-function/src/scalar/mod.rs index 493a5afaee..4dc4556b9a 100644 --- a/crates/sail-function/src/scalar/mod.rs +++ b/crates/sail-function/src/scalar/mod.rs @@ -4,6 +4,7 @@ pub mod csv; pub mod datetime; pub mod drop_struct_field; pub mod explode; +pub mod geo; pub mod hash; pub mod json; pub mod map; diff --git a/crates/sail-function/src/scalar/string/soundex.rs b/crates/sail-function/src/scalar/string/soundex.rs index 71ba95aef6..a8697b145d 100644 --- a/crates/sail-function/src/scalar/string/soundex.rs +++ b/crates/sail-function/src/scalar/string/soundex.rs @@ -1,9 +1,9 @@ use std::any::Any; use std::sync::Arc; -use datafusion::arrow::array::{ArrayRef, OffsetSizeTrait, StringArray}; +use datafusion::arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringArray}; use datafusion::arrow::datatypes::DataType; -use datafusion_common::cast::as_generic_string_array; +use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; use datafusion_common::{exec_err, Result}; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; @@ -41,8 +41,11 @@ impl ScalarUDFImpl for Soundex { &self.signature } - fn return_type(&self, _arg_types: &[DataType]) -> Result { - Ok(DataType::Utf8) + fn return_type(&self, arg_types: &[DataType]) -> Result { 
+ match &arg_types[0] { + DataType::LargeUtf8 => Ok(DataType::LargeUtf8), + _ => Ok(DataType::Utf8), + } } fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { @@ -51,9 +54,8 @@ impl ScalarUDFImpl for Soundex { return exec_err!("`soundex` function requires 1 argument, got {}", args.len()); } match args[0].data_type() { - DataType::Utf8 | DataType::Utf8View => { - make_scalar_function(soundex::, vec![])(&args) - } + DataType::Utf8 => make_scalar_function(soundex::, vec![])(&args), + DataType::Utf8View => make_scalar_function(soundex_view, vec![])(&args), DataType::LargeUtf8 => make_scalar_function(soundex::, vec![])(&args), other => { exec_err!("unsupported data type {other:?} for function `soundex`") @@ -83,13 +85,16 @@ fn classify_char(c: char) -> SoundexChar { } } -/// Computes the 4-character Soundex code for a string. +/// Computes the Soundex code for a string (Spark-compatible). +/// +/// Spark returns the input unchanged if the first character is not ASCII alphabetic. fn compute_soundex(s: &str) -> String { - let mut chars = s.chars().filter(|c| c.is_ascii_alphabetic()); + let mut chars = s.chars(); + // Spark: if the first character is not a letter, return the input as-is. let first_char = match chars.next() { - Some(c) => c.to_ascii_uppercase(), - None => return "".to_string(), + Some(c) if c.is_ascii_alphabetic() => c.to_ascii_uppercase(), + _ => return s.to_string(), }; let mut result = String::with_capacity(4); @@ -126,14 +131,22 @@ fn compute_soundex(s: &str) -> String { result } -/// Applies Soundex to each element in a string array. +/// Applies Soundex to each element in a Utf8/LargeUtf8 string array. fn soundex(args: &[ArrayRef]) -> Result { let str_array = as_generic_string_array::(&args[0])?; + let result = str_array + .iter() + .map(|opt_str| opt_str.map(compute_soundex)) + .collect::>(); + Ok(Arc::new(result) as ArrayRef) +} +/// Applies Soundex to each element in a Utf8View string array. 
+fn soundex_view(args: &[ArrayRef]) -> Result { + let str_array = as_string_view_array(&args[0])?; let result = str_array .iter() .map(|opt_str| opt_str.map(compute_soundex)) .collect::(); - Ok(Arc::new(result) as ArrayRef) } diff --git a/crates/sail-function/src/scalar/string/spark_to_binary.rs b/crates/sail-function/src/scalar/string/spark_to_binary.rs index 2e125526a6..be706f255a 100644 --- a/crates/sail-function/src/scalar/string/spark_to_binary.rs +++ b/crates/sail-function/src/scalar/string/spark_to_binary.rs @@ -5,7 +5,7 @@ use datafusion::arrow::datatypes::DataType; use datafusion::functions::encoding::expr_fn::decode; use datafusion::functions::encoding::inner::DecodeFunc; use datafusion_common::{exec_err, Result, ScalarValue}; -use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; +use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext}; use datafusion_expr::{expr, Expr, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl}; use datafusion_expr_common::columnar_value::ColumnarValue; use datafusion_expr_common::signature::{Signature, TypeSignature, Volatility}; @@ -119,7 +119,7 @@ impl ScalarUDFImpl for SparkToBinary { } } - fn simplify(&self, args: Vec, _info: &dyn SimplifyInfo) -> Result { + fn simplify(&self, args: Vec, _info: &SimplifyContext) -> Result { if args.len() != 1 && args.len() != 2 { return exec_err!( "Spark `to_binary` function requires 1 or 2 arguments, got {}", diff --git a/crates/sail-function/src/scalar/url/parse_url.rs b/crates/sail-function/src/scalar/url/parse_url.rs index 33ed9320d6..e9b5d528a0 100644 --- a/crates/sail-function/src/scalar/url/parse_url.rs +++ b/crates/sail-function/src/scalar/url/parse_url.rs @@ -57,13 +57,12 @@ impl ParseUrl { /// * `Err(DataFusionError)` - If the URL is malformed and cannot be parsed /// fn parse(value: &str, part: &str, key: Option<&str>) -> Result> { - Url::parse(value) - .map_err(|e| exec_datafusion_err!("{e:?}")) - .map(|url| match part { + match Url::parse(value) { + Ok(url) 
=> Ok(match part { "HOST" => url.host_str().map(String::from), "PATH" => { - let path: String = url.path().to_string(); - let path: String = if path == "/" { "".to_string() } else { path }; + let path = url.path().to_string(); + let path = if path == "/" { "".to_string() } else { path }; Some(path) } "QUERY" => match key { @@ -86,7 +85,7 @@ impl ParseUrl { "USERINFO" => { let username = url.username(); if username.is_empty() { - return None; + return Ok(None); } match url.password() { Some(password) => Some(format!("{username}:{password}")), @@ -94,7 +93,42 @@ impl ParseUrl { } } _ => None, - }) + }), + Err(url::ParseError::RelativeUrlWithoutBase) => { + // Spark's java.net.URI treats schemeless strings as relative URIs. + // Parse the components manually: path?query#fragment + let (without_fragment, fragment) = match value.find('#') { + Some(i) => (&value[..i], Some(&value[i + 1..])), + None => (value, None), + }; + let (path, query) = match without_fragment.find('?') { + Some(i) => (&without_fragment[..i], Some(&without_fragment[i + 1..])), + None => (without_fragment, None), + }; + Ok(match part { + "PATH" => Some(path.to_string()), + "QUERY" => match key { + None => query.map(String::from), + Some(key) => query.and_then(|q| { + q.split('&') + .filter_map(|pair| pair.split_once('=')) + .find(|(k, _)| *k == key) + .map(|(_, v)| v.to_string()) + }), + }, + "REF" => fragment.map(String::from), + "FILE" => { + let file = match query { + Some(q) => format!("{path}?{q}"), + None => path.to_string(), + }; + Some(file) + } + _ => None, + }) + } + Err(e) => Err(exec_datafusion_err!("{e:?}")), + } } } diff --git a/crates/sail-iceberg/src/datasource/expr_adapter.rs b/crates/sail-iceberg/src/datasource/expr_adapter.rs index 290cbf73b1..0514cf37a7 100644 --- a/crates/sail-iceberg/src/datasource/expr_adapter.rs +++ b/crates/sail-iceberg/src/datasource/expr_adapter.rs @@ -30,16 +30,16 @@ impl PhysicalExprAdapterFactory for IcebergPhysicalExprAdapterFactory { &self, 
logical_file_schema: SchemaRef, physical_file_schema: SchemaRef, - ) -> Arc { + ) -> Result> { let (column_mapping, default_values) = create_column_mapping(&logical_file_schema, &physical_file_schema); - Arc::new(IcebergPhysicalExprAdapter { + Ok(Arc::new(IcebergPhysicalExprAdapter { logical_file_schema, physical_file_schema, column_mapping, default_values, - }) + })) } } diff --git a/crates/sail-iceberg/src/datasource/expressions.rs b/crates/sail-iceberg/src/datasource/expressions.rs index b3a18e3fc4..baaec6457e 100644 --- a/crates/sail-iceberg/src/datasource/expressions.rs +++ b/crates/sail-iceberg/src/datasource/expressions.rs @@ -14,7 +14,6 @@ use std::sync::Arc; use datafusion::catalog::Session; use datafusion::common::{DFSchema, Result}; -use datafusion::logical_expr::execution_props::ExecutionProps; use datafusion::logical_expr::simplify::SimplifyContext; use datafusion::logical_expr::{Expr, TableProviderFilterPushDown}; use datafusion::optimizer::simplify_expressions::ExprSimplifier; @@ -25,8 +24,7 @@ pub fn simplify_expr( df_schema: &DFSchema, expr: Expr, ) -> Result> { - let props = ExecutionProps::new(); - let simplify_context = SimplifyContext::new(&props).with_schema(df_schema.clone().into()); + let simplify_context = SimplifyContext::default().with_schema(df_schema.clone().into()); let simplifier = ExprSimplifier::new(simplify_context).with_max_cycles(10); let simplified = simplifier.simplify(expr)?; session.create_physical_expr(simplified, df_schema) diff --git a/crates/sail-iceberg/src/datasource/provider.rs b/crates/sail-iceberg/src/datasource/provider.rs index 01ea260f52..946e495be8 100644 --- a/crates/sail-iceberg/src/datasource/provider.rs +++ b/crates/sail-iceberg/src/datasource/provider.rs @@ -45,6 +45,7 @@ use crate::io::{ load_manifest as io_load_manifest, load_manifest_list as io_load_manifest_list, StoreContext, }; use crate::spec::manifest::DataContentType; +use crate::spec::transform::Transform; use crate::spec::types::values::Literal; 
use crate::spec::{ DataFile, ManifestContentType, ManifestList, ManifestStatus, PartitionSpec, Schema, Snapshot, @@ -71,6 +72,9 @@ pub struct IcebergTableProvider { snapshot: Snapshot, /// All partition specs referenced by the table partition_specs: Vec, + /// Default partition spec id (for schema ordering / partition metadata) + #[expect(unused)] + default_spec_id: i32, /// Arrow schema for DataFusion arrow_schema: Arc, } @@ -82,14 +86,21 @@ impl IcebergTableProvider { schema: Schema, snapshot: Snapshot, partition_specs: Vec, + default_spec_id: i32, ) -> Result { let table_uri_str = table_uri.to_string(); log::trace!("Creating table provider for: {}", table_uri_str); - let arrow_schema = Arc::new(iceberg_schema_to_arrow(&schema).map_err(|e| { + let arrow_schema = iceberg_schema_to_arrow(&schema).map_err(|e| { log::trace!("Failed to convert schema to Arrow: {:?}", e); e - })?); + })?; + let arrow_schema = Arc::new(Self::reorder_arrow_schema_for_identity_partitions( + &schema, + &partition_specs, + default_spec_id, + &arrow_schema, + )); log::trace!( "Converted schema to Arrow with {} fields", @@ -101,10 +112,66 @@ impl IcebergTableProvider { schema, snapshot, partition_specs, + default_spec_id, arrow_schema, }) } + fn reorder_arrow_schema_for_identity_partitions( + schema: &Schema, + partition_specs: &[PartitionSpec], + default_spec_id: i32, + arrow_schema: &ArrowSchema, + ) -> ArrowSchema { + // BDD scenarios expect "data columns" first and identity-partition columns last (in spec order), + // but only for identity-only partition specs. For mixed transform specs (e.g. `years(x), y`) + // we keep the original schema order. 
+ let Some(spec) = partition_specs + .iter() + .find(|s| s.spec_id() == default_spec_id) + else { + return arrow_schema.clone(); + }; + if spec + .fields() + .iter() + .any(|pf| !matches!(pf.transform, Transform::Identity)) + { + return arrow_schema.clone(); + } + + let mut identity_cols: Vec = Vec::new(); + for pf in spec.fields().iter() { + if matches!(pf.transform, Transform::Identity) { + if let Some(field) = schema.field_by_id(pf.source_id) { + identity_cols.push(field.name.clone()); + } + } + } + if identity_cols.is_empty() { + return arrow_schema.clone(); + } + + let identity_set: std::collections::HashSet<&str> = + identity_cols.iter().map(|s| s.as_str()).collect(); + let mut out_fields: Vec = Vec::new(); + + // Keep non-partition columns in original order. + for f in arrow_schema.fields().iter() { + if !identity_set.contains(f.name().as_str()) { + out_fields.push(Arc::new((**f).clone())); + } + } + // Append identity partition columns in spec order. + for name in identity_cols { + if let Ok(idx) = arrow_schema.index_of(&name) { + out_fields.push(Arc::new(arrow_schema.field(idx).clone())); + } + } + + ArrowSchema::new(out_fields) + } + /// Get the table URI pub fn table_uri(&self) -> &str { &self.table_uri @@ -296,6 +363,7 @@ impl IcebergTableProvider { partition_values, range: None, statistics: Some(Arc::new(self.create_file_statistics(&data_file))), + ordering: None, extensions, metadata_size_hint: None, }; @@ -690,6 +758,10 @@ impl IcebergTableProvider { match self.classify_pushdown_for_expr(f) { TableProviderFilterPushDown::Exact => { pruning_filters.push(f.clone()); + // Even if partition pruning is "exact", we still must apply the filter at scan + // time. Pruning is an optimization and can be conservative when stats are + // missing; correctness requires retaining the predicate. 
+ parquet_pushdown_filters.push(f.clone()); } TableProviderFilterPushDown::Inexact => { pruning_filters.push(f.clone()); diff --git a/crates/sail-iceberg/src/datasource/pruning.rs b/crates/sail-iceberg/src/datasource/pruning.rs index b5c2731a57..ac8d6e3d78 100644 --- a/crates/sail-iceberg/src/datasource/pruning.rs +++ b/crates/sail-iceberg/src/datasource/pruning.rs @@ -188,7 +188,8 @@ impl PruningStatistics for IcebergPruningStats { } result.push(any_match); } else { - result.push(false); + // If stats are missing, we cannot safely prune the file. + result.push(true); } } Some(BooleanArray::from(result)) diff --git a/crates/sail-iceberg/src/io/mod.rs b/crates/sail-iceberg/src/io/mod.rs index adb7d02e79..bca1b7d6c6 100644 --- a/crates/sail-iceberg/src/io/mod.rs +++ b/crates/sail-iceberg/src/io/mod.rs @@ -14,6 +14,7 @@ use std::sync::Arc; use datafusion::common::DataFusionError; use object_store::path::Path as ObjectPath; +use object_store::ObjectStoreExt; use url::Url; use crate::spec::{FormatVersion, Manifest, ManifestList}; @@ -30,8 +31,7 @@ impl StoreContext { base: Arc, table_url: &Url, ) -> Result { - let base_path = ObjectPath::parse(table_url.path()) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let base_path = crate::utils::url_to_object_path(table_url)?; let prefixed: Arc = Arc::new( object_store::prefix::PrefixStore::new(base.clone(), base_path.clone()), ); @@ -47,13 +47,7 @@ impl StoreContext { raw: &str, ) -> Result<(&'a Arc, ObjectPath), DataFusionError> { if let Ok(url) = Url::parse(raw) { - let p = url.path(); - let no_leading = p.strip_prefix('/').unwrap_or(p); - return Ok(( - &self.base, - ObjectPath::parse(no_leading) - .map_err(|e| DataFusionError::External(Box::new(e)))?, - )); + return Ok((&self.base, crate::utils::url_to_object_path(&url)?)); } if raw.starts_with(object_store::path::DELIMITER) { let no_leading = raw.strip_prefix('/').unwrap_or(raw); @@ -67,21 +61,17 @@ impl StoreContext { pub fn resolve_to_absolute_path(&self, 
raw_path: &str) -> Result { if let Ok(url) = Url::parse(raw_path) { - let encoded_path = url.path(); - let path_no_leading = encoded_path.strip_prefix('/').unwrap_or(encoded_path); - return ObjectPath::parse(path_no_leading) - .map_err(|e| DataFusionError::External(Box::new(e))); + return crate::utils::url_to_object_path(&url); } if raw_path.starts_with(object_store::path::DELIMITER) { let no_leading = raw_path.strip_prefix('/').unwrap_or(raw_path); - return ObjectPath::parse(no_leading) - .map_err(|e| DataFusionError::External(Box::new(e))); + return Ok(ObjectPath::from(no_leading)); } let mut full = self.prefix_path.clone(); for comp in raw_path.split('/').filter(|s| !s.is_empty()) { - full = full.child(comp); + full = full.join(comp); } Ok(full) } diff --git a/crates/sail-iceberg/src/operations/bootstrap.rs b/crates/sail-iceberg/src/operations/bootstrap.rs index eec3245c40..89aa49a717 100644 --- a/crates/sail-iceberg/src/operations/bootstrap.rs +++ b/crates/sail-iceberg/src/operations/bootstrap.rs @@ -19,6 +19,7 @@ use std::sync::Arc; use bytes::Bytes; use datafusion_common::{DataFusionError, Result}; +use object_store::ObjectStoreExt; use url::Url; use crate::io::StoreContext; diff --git a/crates/sail-iceberg/src/operations/snapshot.rs b/crates/sail-iceberg/src/operations/snapshot.rs index 53885a8b3f..6d435c461d 100644 --- a/crates/sail-iceberg/src/operations/snapshot.rs +++ b/crates/sail-iceberg/src/operations/snapshot.rs @@ -11,6 +11,7 @@ // limitations under the License. 
use bytes::Bytes; +use object_store::ObjectStoreExt; use super::{ActionCommit, Transaction}; use crate::io::StoreContext; diff --git a/crates/sail-iceberg/src/operations/write/config.rs b/crates/sail-iceberg/src/operations/write/config.rs index b30aed72eb..1135d0b07f 100644 --- a/crates/sail-iceberg/src/operations/write/config.rs +++ b/crates/sail-iceberg/src/operations/write/config.rs @@ -14,6 +14,7 @@ use std::sync::Arc; use datafusion::arrow::datatypes::SchemaRef as ArrowSchemaRef; use parquet::file::properties::WriterProperties; +use sail_common_datafusion::catalog::CatalogPartitionField; use crate::spec::partition::UnboundPartitionSpec; use crate::spec::Schema as IcebergSchema; @@ -21,7 +22,7 @@ use crate::spec::Schema as IcebergSchema; #[derive(Debug, Clone)] pub struct WriterConfig { pub table_schema: ArrowSchemaRef, - pub partition_columns: Vec, + pub partition_columns: Vec, pub writer_properties: WriterProperties, pub target_file_size: u64, pub write_batch_size: usize, diff --git a/crates/sail-iceberg/src/operations/write/file_writer/location_generator.rs b/crates/sail-iceberg/src/operations/write/file_writer/location_generator.rs index b7094fa625..06800df383 100644 --- a/crates/sail-iceberg/src/operations/write/file_writer/location_generator.rs +++ b/crates/sail-iceberg/src/operations/write/file_writer/location_generator.rs @@ -52,7 +52,7 @@ impl LocationGenerator for DefaultLocationGenerator { fn with_partition_dir(&self, partition_dir: Option<&str>) -> (String, ObjectPath) { let id = self.counter.fetch_add(1, Ordering::Relaxed); let file = format!("part-{}-{:020}.parquet", Uuid::new_v4(), id); - let rel = match partition_dir { + let rel_unencoded = match partition_dir { Some(dir) if !dir.is_empty() => { format!("{}/{}/{}", self.data_dir, dir.trim_matches('/'), file) } @@ -60,9 +60,16 @@ impl LocationGenerator for DefaultLocationGenerator { }; // Join each component to avoid encoding '/' into '%2F' let mut full = self.base.clone(); - for comp in 
rel.split('/').filter(|s| !s.is_empty()) { - full = full.child(comp); + for comp in rel_unencoded.split('/').filter(|s| !s.is_empty()) { + full = full.join(comp); } + // Derive relative path from encoded ObjectPath so manifest file_path matches actual object keys. + let rel = full + .as_ref() + .strip_prefix(self.base.as_ref()) + .unwrap_or(full.as_ref()) + .trim_start_matches('/') + .to_string(); (rel, full) } } diff --git a/crates/sail-iceberg/src/operations/write/partition.rs b/crates/sail-iceberg/src/operations/write/partition.rs index 7893a76c54..d6a7936014 100644 --- a/crates/sail-iceberg/src/operations/write/partition.rs +++ b/crates/sail-iceberg/src/operations/write/partition.rs @@ -10,9 +10,11 @@ // See the License for the specific language governing permissions and // limitations under the License. +use chrono::{Datelike, NaiveDate, NaiveDateTime, Timelike}; use datafusion::arrow::array::{ArrayRef, UInt32Array}; use datafusion::arrow::compute; use datafusion::arrow::record_batch::RecordBatch; +use sail_common_datafusion::catalog::CatalogPartitionField; use crate::spec::partition::UnboundPartitionSpec as PartitionSpec; use crate::spec::schema::Schema as IcebergSchema; @@ -41,19 +43,25 @@ pub fn field_name_from_id(schema: &IcebergSchema, field_id: i32) -> Option], ) -> Result { if spec.fields.is_empty() { return Ok(String::new()); } + #[expect(clippy::unwrap_used)] + let epoch = NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(); let mut segs = Vec::new(); for (i, f) in spec.fields.iter().enumerate() { + let field_type = iceberg_schema + .field_by_id(f.source_id) + .map(|nf| nf.field_type.as_ref()) + .unwrap_or(&Type::Primitive(PrimitiveType::String)); // Use already-transformed values to build partition directories. // This ensures bucket paths are simple integers like `number_bucket=4` // instead of verbose strings like `bucket[8](4)`. 
let val = values.get(i).cloned().flatten(); - let human = match val { + let base_human = match val.as_ref() { None => "null".to_string(), Some(Literal::Primitive(p)) => match p { PrimitiveLiteral::Boolean(v) => v.to_string(), @@ -62,27 +70,82 @@ pub fn build_partition_dir( PrimitiveLiteral::Float(v) => v.0.to_string(), PrimitiveLiteral::Double(v) => v.0.to_string(), PrimitiveLiteral::Int128(v) => v.to_string(), - PrimitiveLiteral::String(s) => s, + PrimitiveLiteral::String(s) => s.clone(), PrimitiveLiteral::UInt128(v) => v.to_string(), PrimitiveLiteral::Binary(b) => { // hex-encode binary values for stability let mut s = String::with_capacity(b.len() * 2 + 2); s.push_str("0x"); - for byte in &b { + for byte in b.iter() { use std::fmt::Write as _; let _ = write!(&mut s, "{:02x}", byte); } s } }, - Some(Literal::Struct(_)) | Some(Literal::List(_)) | Some(Literal::Map(_)) => { + Some(l @ (Literal::Struct(_) | Literal::List(_) | Literal::Map(_))) => { // Fallback debug formatting for complex types + format!("{l:?}") + } + }; + + // Human-readable partition path formatting for temporal transforms: + // - years(date) => YYYY + // - months(date) => YYYY-MM + // - days(date) => YYYY-MM-DD + // - hours(ts) => YYYY-MM-DD-HH + let human = match (f.transform, field_type, val.as_ref()) { + ( + crate::spec::transform::Transform::Year, + _, + Some(Literal::Primitive(PrimitiveLiteral::Int(v))), + ) => { + // current apply_transform returns years since 1970; format actual year. 
+ (1970 + *v).to_string() + } + ( + crate::spec::transform::Transform::Month, + _, + Some(Literal::Primitive(PrimitiveLiteral::Int(v))), + ) => { + // months since 1970-01 (0-based) + let y = 1970 + v.div_euclid(12); + let m0 = v.rem_euclid(12); + format!("{:04}-{:02}", y, m0 + 1) + } + ( + crate::spec::transform::Transform::Day, + _, + Some(Literal::Primitive(PrimitiveLiteral::Int(v))), + ) => { + // days since epoch + let date = epoch + chrono::Duration::days(i64::from(*v)); + format!("{:04}-{:02}-{:02}", date.year(), date.month(), date.day()) + } + ( + crate::spec::transform::Transform::Hour, + _, + Some(Literal::Primitive(PrimitiveLiteral::Int(v))), + ) => { + // hours since epoch + let secs = i64::from(*v) * 3600; + let dt = chrono::DateTime::from_timestamp(secs, 0) + .map(|dt| dt.naive_utc()) + .unwrap_or_else(|| { + #[expect(clippy::unwrap_used)] + NaiveDateTime::new(epoch, chrono::NaiveTime::from_hms_opt(0, 0, 0).unwrap()) + }); format!( - "{:?}", - val.ok_or_else(|| "Invalid partition literal".to_string())? 
+ "{:04}-{:02}-{:02}-{:02}", + dt.year(), + dt.month(), + dt.day(), + dt.hour() ) } + _ => base_human, }; + segs.push(format!("{}={}", f.name, human)); } Ok(segs.join("/")) @@ -92,7 +155,7 @@ pub fn compute_partition_values( batch: &RecordBatch, spec: &PartitionSpec, iceberg_schema: &IcebergSchema, - partition_columns: &[String], + partition_columns: &[CatalogPartitionField], ) -> Result<(Vec>, String), String> { let _ = partition_columns; // not used in single-group fallback let mut values = Vec::with_capacity(spec.fields.len()); diff --git a/crates/sail-iceberg/src/operations/write/table_writer.rs b/crates/sail-iceberg/src/operations/write/table_writer.rs index 06c316d641..4b8c34e574 100644 --- a/crates/sail-iceberg/src/operations/write/table_writer.rs +++ b/crates/sail-iceberg/src/operations/write/table_writer.rs @@ -18,6 +18,7 @@ use datafusion::arrow::datatypes::{FieldRef, Schema, SchemaRef}; use datafusion::arrow::record_batch::RecordBatch; use datafusion_common::{DataFusionError, Result}; use object_store::path::Path as ObjectPath; +use object_store::ObjectStoreExt; use parquet::arrow::PARQUET_FIELD_ID_META_KEY; use sail_common_datafusion::array::record_batch::cast_record_batch_relaxed_tz; use url::Url; diff --git a/crates/sail-iceberg/src/physical_plan/action_schema.rs b/crates/sail-iceberg/src/physical_plan/action_schema.rs index 36df74481c..39125a47e1 100644 --- a/crates/sail-iceberg/src/physical_plan/action_schema.rs +++ b/crates/sail-iceberg/src/physical_plan/action_schema.rs @@ -1,7 +1,9 @@ use std::collections::BTreeMap; use std::sync::{Arc, LazyLock}; -use datafusion::arrow::datatypes::{DataType, Field, FieldRef, Schema, SchemaRef}; +use datafusion::arrow::datatypes::{ + DataType, Field, FieldRef, Schema, SchemaRef, UnionFields, UnionMode, +}; use datafusion::arrow::record_batch::RecordBatch; use datafusion_common::{DataFusionError, Result}; use serde::{Deserialize, Serialize}; @@ -100,6 +102,35 @@ pub struct ActionRow { pub action: ExecAction, } +fn 
partition_value_union_type() -> DataType { + // serde_arrow 0.14 always creates non-null Union arrays (UnionBuilder::is_nullable = false). + // The union variants match the PartitionValue enum definition order. + #[expect( + clippy::unwrap_used, + reason = "partition_value_union_type is a process-global constant." + )] + let union_fields = UnionFields::try_new( + [0i8, 1, 2, 3, 4, 5, 6, 7, 8], + [ + Arc::new(Field::new("Boolean", DataType::Boolean, false)), + Arc::new(Field::new("Int", DataType::Int32, false)), + Arc::new(Field::new("Long", DataType::Int64, false)), + Arc::new(Field::new("Float", DataType::Float32, false)), + Arc::new(Field::new("Double", DataType::Float64, false)), + Arc::new(Field::new("String", DataType::Utf8, false)), + Arc::new(Field::new("Int128", DataType::Utf8, false)), + Arc::new(Field::new("UInt128", DataType::Utf8, false)), + Arc::new(Field::new( + "Binary", + DataType::List(Arc::new(Field::new("element", DataType::UInt8, false))), + false, + )), + ], + ) + .unwrap(); + DataType::Union(union_fields, UnionMode::Dense) +} + fn map_type_i32_u64() -> DataType { // Arrow Map is represented as `Map>`. let entries_struct = DataType::Struct( @@ -137,6 +168,18 @@ fn iceberg_action_tracing_options( Field::new("null_value_counts", map_type_i32_u64(), false), ) }) + .and_then(|opts| { + // serde_arrow 0.14 always produces non-null Union arrays (UnionBuilder::is_nullable + // is hardcoded to false). Override the partition field so that the static schema + // matches the actual serialized data, preventing Arrow 58's RecordBatch::try_new + // from rejecting the nullability mismatch. 
+ let partition_item = + Arc::new(Field::new("element", partition_value_union_type(), false)); + opts.overwrite( + "action.add.partition", + Field::new("partition", DataType::List(partition_item), false), + ) + }) .map_err(|e| format!("failed to build serde_arrow tracing options: {e}")) } diff --git a/crates/sail-iceberg/src/physical_plan/commit/commit_exec.rs b/crates/sail-iceberg/src/physical_plan/commit/commit_exec.rs index 919de41f35..ffbdf8e083 100644 --- a/crates/sail-iceberg/src/physical_plan/commit/commit_exec.rs +++ b/crates/sail-iceberg/src/physical_plan/commit/commit_exec.rs @@ -29,6 +29,7 @@ use datafusion::physical_plan::{ use datafusion_common::{internal_err, DataFusionError, Result}; use futures::stream::once; use futures::StreamExt; +use object_store::ObjectStoreExt; use url::Url; use crate::io::StoreContext; @@ -43,14 +44,13 @@ use crate::spec::metadata::table_metadata::SnapshotLog; use crate::spec::snapshots::MAIN_BRANCH; use crate::spec::{PartitionSpec, Schema as IcebergSchema, TableMetadata, TableRequirement}; use crate::utils::get_object_store_from_context; - const MAX_COMMIT_RETRIES: usize = 5; #[derive(Debug)] pub struct IcebergCommitExec { input: Arc, table_url: Url, - cache: PlanProperties, + cache: Arc, } impl IcebergCommitExec { @@ -60,12 +60,12 @@ impl IcebergCommitExec { DataType::UInt64, true, )])); - let cache = PlanProperties::new( + let cache = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(1), EmissionType::Final, Boundedness::Bounded, - ); + )); Self { input, table_url, @@ -209,7 +209,7 @@ impl ExecutionPlan for IcebergCommitExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } diff --git a/crates/sail-iceberg/src/physical_plan/plan_builder.rs b/crates/sail-iceberg/src/physical_plan/plan_builder.rs index 0e87a84b37..c36292f18e 100644 --- a/crates/sail-iceberg/src/physical_plan/plan_builder.rs +++ 
b/crates/sail-iceberg/src/physical_plan/plan_builder.rs @@ -16,19 +16,20 @@ use datafusion::catalog::Session; use datafusion::common::Result; use datafusion::physical_expr::expressions::Column; use datafusion::physical_expr::{LexOrdering, PhysicalExpr, PhysicalSortExpr}; -use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::repartition::RepartitionExec; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::{ExecutionPlan, Partitioning}; +use sail_common_datafusion::catalog::CatalogPartitionField; use sail_common_datafusion::datasource::PhysicalSinkMode; use url::Url; use crate::options::TableIcebergOptions; use crate::physical_plan::writer_exec::IcebergWriterExec; +use crate::utils::partition_transform::format_partition_expr; pub struct IcebergTableConfig { pub table_url: Url, - pub partition_columns: Vec, + pub partition_columns: Vec, pub table_exists: bool, pub options: TableIcebergOptions, } @@ -68,31 +69,19 @@ impl<'a> IcebergPlanBuilder<'a> { } fn add_projection_node(&self, input: Arc) -> Result> { - let input_schema = input.schema(); - let mut projection_exprs: Vec<(Arc, String)> = Vec::new(); - let mut part_idx = std::collections::HashMap::new(); - let part_set: std::collections::HashSet<&String> = - self.table_config.partition_columns.iter().collect(); - - for (i, f) in input_schema.fields().iter().enumerate() { - if part_set.contains(f.name()) { - part_idx.insert(f.name().clone(), i); - } else { - projection_exprs.push((Arc::new(Column::new(f.name(), i)), f.name().clone())); - } - } - - for name in &self.table_config.partition_columns { - let idx = *part_idx.get(name).ok_or_else(|| { - datafusion::common::DataFusionError::Plan(format!( + // Validate that partition transform expressions refer to real source columns. + // Do not reorder columns here: BDD "query result ordered" checks expect the original + // table column order from `SELECT *`. 
+ let schema = input.schema(); + for field in &self.table_config.partition_columns { + if schema.index_of(&field.column).is_err() { + return Err(datafusion::common::DataFusionError::Plan(format!( "Partition column '{}' not found in schema", - name - )) - })?; - projection_exprs.push((Arc::new(Column::new(name, idx)), name.clone())); + format_partition_expr(field) + ))); + } } - - Ok(Arc::new(ProjectionExec::try_new(projection_exprs, input)?)) + Ok(input) } fn add_repartition_node( @@ -103,12 +92,31 @@ impl<'a> IcebergPlanBuilder<'a> { Partitioning::RoundRobinBatch(4) } else { let schema = input.schema(); - let n = schema.fields().len(); - let k = self.table_config.partition_columns.len(); - let exprs: Vec> = (n - k..n) - .zip(self.table_config.partition_columns.iter()) - .map(|(idx, name)| Arc::new(Column::new(name, idx)) as Arc) - .collect(); + let mut seen = std::collections::HashSet::new(); + let partition_source_columns = self + .table_config + .partition_columns + .iter() + .filter_map(|field| { + if seen.insert(field.column.clone()) { + Some(field.column.clone()) + } else { + None + } + }) + .collect::>(); + let exprs: Vec> = partition_source_columns + .iter() + .map(|name| { + let idx = schema.index_of(name).map_err(|_| { + datafusion::common::DataFusionError::Plan(format!( + "Partition column '{}' not found in schema", + name + )) + })?; + Ok(Arc::new(Column::new(name, idx)) as Arc) + }) + .collect::>>()?; Partitioning::Hash(exprs, 4) }; diff --git a/crates/sail-iceberg/src/physical_plan/writer_exec.rs b/crates/sail-iceberg/src/physical_plan/writer_exec.rs index 206c56c380..92e53c1615 100644 --- a/crates/sail-iceberg/src/physical_plan/writer_exec.rs +++ b/crates/sail-iceberg/src/physical_plan/writer_exec.rs @@ -27,7 +27,9 @@ use datafusion::physical_plan::{ use datafusion_common::{internal_err, DataFusionError, Result}; use futures::stream::once; use futures::StreamExt; +use object_store::ObjectStoreExt; use parquet::file::properties::WriterProperties; +use 
sail_common_datafusion::catalog::CatalogPartitionField; use sail_common_datafusion::datasource::PhysicalSinkMode; use url::Url; @@ -45,23 +47,27 @@ use crate::spec::partition::{ use crate::spec::schema::Schema as IcebergSchema; use crate::spec::{TableMetadata, TableRequirement}; use crate::utils::get_object_store_from_context; +use crate::utils::partition_transform::{ + catalog_partition_field_from_iceberg, format_partition_expr, + iceberg_transform_from_partition_field, partition_field_name, +}; #[derive(Debug)] pub struct IcebergWriterExec { input: Arc, table_url: Url, - partition_columns: Vec, + partition_columns: Vec, sink_mode: PhysicalSinkMode, table_exists: bool, options: TableIcebergOptions, - cache: PlanProperties, + cache: Arc, } impl IcebergWriterExec { fn extract_partition_columns( spec: &Option, iceberg_schema: &IcebergSchema, - ) -> Result> { + ) -> Result> { if let Some(spec) = spec { let mut cols = Vec::with_capacity(spec.fields().len()); for f in spec.fields() { @@ -71,7 +77,10 @@ impl IcebergWriterExec { f.source_id )) })?; - cols.push(field.name.clone()); + cols.push( + catalog_partition_field_from_iceberg(field.name.clone(), f.transform) + .map_err(DataFusionError::Plan)?, + ); } Ok(cols) } else { @@ -82,7 +91,7 @@ impl IcebergWriterExec { pub fn new( input: Arc, table_url: Url, - partition_columns: Vec, + partition_columns: Vec, sink_mode: PhysicalSinkMode, table_exists: bool, options: TableIcebergOptions, @@ -106,20 +115,20 @@ impl IcebergWriterExec { } } - fn compute_properties(schema: datafusion::arrow::datatypes::SchemaRef) -> PlanProperties { - PlanProperties::new( + fn compute_properties(schema: datafusion::arrow::datatypes::SchemaRef) -> Arc { + Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(1), EmissionType::Final, Boundedness::Bounded, - ) + )) } pub fn table_url(&self) -> &Url { &self.table_url } - pub fn partition_columns(&self) -> &[String] { + pub fn partition_columns(&self) -> 
&[CatalogPartitionField] { &self.partition_columns } @@ -165,6 +174,7 @@ impl IcebergWriterExec { fn resolve_data_dir(table_meta: &TableMetadata, table_url: &Url) -> String { let data_dir = "data".to_string(); + let base_path = crate::utils::url_to_object_path(table_url).ok(); if let Some(val) = table_meta .properties .get("write.data.path") @@ -176,27 +186,31 @@ impl IcebergWriterExec { if prop_url.scheme() == table_url.scheme() && prop_url.host_str() == table_url.host_str() { - let base_path = table_url.path().trim_end_matches('/'); - let prop_path = prop_url.path().trim_start_matches('/'); - let base_no_leading = base_path.trim_start_matches('/'); - if let Some(stripped) = prop_path.strip_prefix(base_no_leading) { - let rel = stripped.trim_start_matches('/').trim_matches('/'); - if !rel.is_empty() { - return rel.to_string(); + if let (Ok(prop_path), Some(base_path)) = ( + crate::utils::url_to_object_path(&prop_url), + base_path.as_ref(), + ) { + let prop_str = prop_path.as_ref(); + let base_str = base_path.as_ref(); + if let Some(stripped) = prop_str.strip_prefix(base_str) { + let rel = stripped.trim_start_matches('/').trim_matches('/'); + if !rel.is_empty() { + return rel.to_string(); + } } } } } else { - let prop_path = raw; - let base_path = table_url.path(); + let prop_path = raw.replace('\\', "/"); if prop_path.starts_with('/') { - if let Some(stripped) = prop_path - .strip_prefix(base_path) - .or_else(|| prop_path.strip_prefix(base_path.trim_start_matches('/'))) - { - let rel = stripped.trim_start_matches('/').trim_matches('/'); - if !rel.is_empty() { - return rel.to_string(); + if let Some(base_path) = base_path.as_ref() { + let base_str = base_path.as_ref(); + let prop_no_leading = prop_path.trim_start_matches('/'); + if let Some(stripped) = prop_no_leading.strip_prefix(base_str) { + let rel = stripped.trim_start_matches('/').trim_matches('/'); + if !rel.is_empty() { + return rel.to_string(); + } } } } else { @@ -222,7 +236,7 @@ impl ExecutionPlan for 
IcebergWriterExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.cache } @@ -338,15 +352,20 @@ impl ExecutionPlan for IcebergWriterExec { if let Some(existing) = &default_spec { builder = builder.with_spec_id(existing.spec_id()); } - use crate::spec::transform::Transform; - for name in &partition_columns { - let fid = current_schema.field_id_by_name(name).ok_or_else(|| { - DataFusionError::Plan(format!( - "Partition column mismatch: column '{}' not found in schema", - name - )) - })?; - builder = builder.add_field(fid, name.clone(), Transform::Identity); + for field in &partition_columns { + let fid = current_schema.field_id_by_name(&field.column).ok_or_else( + || { + DataFusionError::Plan(format!( + "Partition column mismatch: column '{}' not found in schema", + format_partition_expr(field) + )) + }, + )?; + builder = builder.add_field( + fid, + partition_field_name(field), + iceberg_transform_from_partition_field(field), + ); } default_spec = Some(builder.build()); } @@ -364,7 +383,12 @@ impl ExecutionPlan for IcebergWriterExec { } else if partition_columns != table_partition_columns { return Err(DataFusionError::Plan(format!( "Partition column mismatch: table uses {:?}, requested {:?}", - table_partition_columns, partition_columns + crate::utils::partition_transform::format_partition_exprs( + &table_partition_columns + ), + crate::utils::partition_transform::format_partition_exprs( + &partition_columns + ) ))); } } @@ -398,19 +422,22 @@ impl ExecutionPlan for IcebergWriterExec { "Invalid Iceberg schema: field id 0 detected after assignment".to_string(), )); } - for name in &partition_columns { - if iceberg_schema.field_id_by_name(name).is_none() { + for field in &partition_columns { + if iceberg_schema.field_id_by_name(&field.column).is_none() { return Err(DataFusionError::Plan(format!( "Partition column mismatch: column '{}' not found in schema", - name + format_partition_expr(field) ))); } } let mut builder = 
crate::spec::partition::PartitionSpec::builder(); - use crate::spec::transform::Transform; - for name in &partition_columns { - if let Some(fid) = iceberg_schema.field_id_by_name(name) { - builder = builder.add_field(fid, name.clone(), Transform::Identity); + for field in &partition_columns { + if let Some(fid) = iceberg_schema.field_id_by_name(&field.column) { + builder = builder.add_field( + fid, + partition_field_name(field), + iceberg_transform_from_partition_field(field), + ); } } let spec = builder.build(); @@ -454,7 +481,8 @@ impl ExecutionPlan for IcebergWriterExec { partition_spec: unbound_spec, }; - let writer_root = object_store::path::Path::from(table_url.path()); + let writer_root = crate::utils::url_to_object_path(&table_url) + .map_err(|e| DataFusionError::Plan(e.to_string()))?; let mut writer = IcebergTableWriter::new( object_store.clone(), writer_root, diff --git a/crates/sail-iceberg/src/table/metadata_loader.rs b/crates/sail-iceberg/src/table/metadata_loader.rs index 0af736ff4a..4e737693cb 100644 --- a/crates/sail-iceberg/src/table/metadata_loader.rs +++ b/crates/sail-iceberg/src/table/metadata_loader.rs @@ -12,7 +12,7 @@ use std::sync::Arc; -use datafusion::common::{plan_err, DataFusionError, Result}; +use datafusion::common::{plan_err, Result}; use url::Url; pub async fn find_latest_metadata_file( @@ -20,12 +20,11 @@ pub async fn find_latest_metadata_file( table_url: &Url, ) -> Result { use futures::TryStreamExt; - use object_store::path::Path as ObjectPath; + use object_store::ObjectStoreExt; log::trace!("Finding latest metadata file"); - let version_hint_path = - ObjectPath::parse(format!("{}metadata/version-hint.text", table_url.path()).as_str()) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let base_path = crate::utils::url_to_object_path(table_url)?; + let version_hint_path = base_path.clone().join("metadata").join("version-hint.text"); let mut hinted_version: Option = None; let mut hinted_filename: Option = None; if let 
Ok(version_hint_data) = object_store.get(&version_hint_path).await { @@ -49,8 +48,7 @@ pub async fn find_latest_metadata_file( } log::trace!("Listing metadata directory"); - let metadata_prefix = ObjectPath::parse(format!("{}metadata/", table_url.path()).as_str()) - .map_err(|e| DataFusionError::External(Box::new(e)))?; + let metadata_prefix = base_path.join("metadata"); let objects = object_store.list(Some(&metadata_prefix)); diff --git a/crates/sail-iceberg/src/table/mod.rs b/crates/sail-iceberg/src/table/mod.rs index 2323c5762f..d1a31e1a27 100644 --- a/crates/sail-iceberg/src/table/mod.rs +++ b/crates/sail-iceberg/src/table/mod.rs @@ -17,6 +17,7 @@ use datafusion::catalog::Session; use datafusion::common::{DataFusionError, Result}; pub use metadata_loader::find_latest_metadata_file; use object_store::path::Path as ObjectPath; +use object_store::ObjectStoreExt; use url::Url; use crate::datasource::provider::IcebergTableProvider; @@ -98,6 +99,7 @@ impl Table { schema, snapshot, partition_specs, + self.metadata.default_spec_id, ) } @@ -110,71 +112,89 @@ impl Table { } fn select_snapshot(&self, options: &TableIcebergOptions) -> Result<(Schema, Snapshot)> { - let chosen_snapshot = if let Some(id) = options.snapshot_id { - self.metadata - .snapshots - .iter() - .find(|s| s.snapshot_id() == id) - .cloned() - .ok_or_else(|| { - DataFusionError::Plan(format!("Snapshot with id {} not found", id)) - })? + let (chosen_snapshot, use_snapshot_schema) = if let Some(id) = options.snapshot_id { + ( + self.metadata + .snapshots + .iter() + .find(|s| s.snapshot_id() == id) + .cloned() + .ok_or_else(|| { + DataFusionError::Plan(format!("Snapshot with id {} not found", id)) + })?, + true, + ) } else if let Some(ref_name) = options.use_ref.as_deref() { - let sid = if ref_name == MAIN_BRANCH { - self.metadata.current_snapshot_id.ok_or_else(|| { - DataFusionError::Plan( - "Iceberg table metadata is missing current snapshot id".to_string(), - ) - })? 
+ let (sid, use_snapshot_schema) = if ref_name == MAIN_BRANCH { + ( + self.metadata.current_snapshot_id.ok_or_else(|| { + DataFusionError::Plan( + "Iceberg table metadata is missing current snapshot id".to_string(), + ) + })?, + false, + ) } else { + let reference = self.metadata.refs.get(ref_name).ok_or_else(|| { + DataFusionError::Plan(format!("Unknown Iceberg ref: {}", ref_name)) + })?; + (reference.snapshot_id, !reference.is_branch()) + }; + ( self.metadata - .refs - .get(ref_name) - .map(|r| r.snapshot_id) + .snapshots + .iter() + .find(|s| s.snapshot_id() == sid) + .cloned() .ok_or_else(|| { - DataFusionError::Plan(format!("Unknown Iceberg ref: {}", ref_name)) - })? - }; - self.metadata - .snapshots - .iter() - .find(|s| s.snapshot_id() == sid) - .cloned() - .ok_or_else(|| { - DataFusionError::Plan(format!( - "Snapshot for ref {} (id={}) not found", - ref_name, sid - )) - })? + DataFusionError::Plan(format!( + "Snapshot for ref {} (id={}) not found", + ref_name, sid + )) + })?, + use_snapshot_schema, + ) } else if let Some(ts_str) = options.timestamp_as_of.as_deref() { let ts_ms = parse_timestamp_to_ms(ts_str).map_err(|e| DataFusionError::Plan(e.to_string()))?; - find_snapshot_by_ts(&self.metadata, ts_ms) - .cloned() - .ok_or_else(|| { - DataFusionError::Plan(format!( - "No Iceberg snapshot exists at or before timestamp {}", - ts_str - )) - })? + ( + find_snapshot_by_ts(&self.metadata, ts_ms) + .cloned() + .ok_or_else(|| { + DataFusionError::Plan(format!( + "No Iceberg snapshot exists at or before timestamp {}", + ts_str + )) + })?, + true, + ) } else { - self.metadata.current_snapshot().cloned().ok_or_else(|| { - DataFusionError::Plan("No current snapshot found in table metadata".to_string()) - })? 
+ ( + self.metadata.current_snapshot().cloned().ok_or_else(|| { + DataFusionError::Plan("No current snapshot found in table metadata".to_string()) + })?, + false, + ) }; - let schema = if let Some(schema_id) = chosen_snapshot.schema_id() { - self.metadata - .schemas - .iter() - .find(|s| s.schema_id() == schema_id) - .cloned() - .ok_or_else(|| { - DataFusionError::Plan(format!( - "Schema with id {} not found for chosen snapshot", - schema_id - )) + let schema = if use_snapshot_schema { + if let Some(schema_id) = chosen_snapshot.schema_id() { + self.metadata + .schemas + .iter() + .find(|s| s.schema_id() == schema_id) + .cloned() + .ok_or_else(|| { + DataFusionError::Plan(format!( + "Schema with id {} not found for chosen snapshot", + schema_id + )) + })? + } else { + self.metadata.current_schema().cloned().ok_or_else(|| { + DataFusionError::Plan("No current schema found in table metadata".to_string()) })? + } } else { self.metadata.current_schema().cloned().ok_or_else(|| { DataFusionError::Plan("No current schema found in table metadata".to_string()) @@ -186,20 +206,39 @@ impl Table { } fn parse_timestamp_to_ms(s: &str) -> std::result::Result { - // Try RFC3339 first - if let Ok(dt) = DateTime::parse_from_rfc3339(s) { + let rfc3339_result = DateTime::parse_from_rfc3339(s); + if let Ok(dt) = rfc3339_result { return Ok(dt.with_timezone(&Utc).timestamp_millis()); } - // Fallback format "yyyy-MM-dd HH:mm:ss.SSS" - let fmt = "%Y-%m-%d %H:%M:%S%.3f"; - let naive = NaiveDateTime::parse_from_str(s, fmt) - .map_err(|e| format!("Invalid timestamp '{s}': {e}"))?; - Ok(Utc.from_utc_datetime(&naive).timestamp_millis()) + let mut last_error = rfc3339_result + .err() + .map(|e| format!("RFC3339 parsing error: {e}")); + + for format in [ + "%Y-%m-%d %H:%M:%S%.f", + "%Y-%m-%dT%H:%M:%S%.f", + "%Y-%m-%d %H:%M:%S", + "%Y-%m-%dT%H:%M:%S", + ] { + match NaiveDateTime::parse_from_str(s, format) { + Ok(naive) => return Ok(Utc.from_utc_datetime(&naive).timestamp_millis()), + Err(e) => { + 
last_error = Some(format!("Failed to parse with format '{format}': {e}")); + } + } + } + + let detail = last_error + .map(|e| format!(" Details: {e}")) + .unwrap_or_default(); + Err(format!( + "Invalid timestamp '{s}'. Supported formats are: RFC3339 (e.g. '2024-01-02T03:04:05Z'), '%Y-%m-%d %H:%M:%S%.f', '%Y-%m-%dT%H:%M:%S%.f', '%Y-%m-%d %H:%M:%S', '%Y-%m-%dT%H:%M:%S'.{detail}" + )) } fn find_snapshot_by_ts(meta: &TableMetadata, ts_ms: i64) -> Option<&Snapshot> { - if let Some(log_entry) = meta + let from_log = meta .snapshot_log .iter() .filter(|e| e.timestamp_ms <= ts_ms) @@ -208,19 +247,21 @@ fn find_snapshot_by_ts(meta: &TableMetadata, ts_ms: i64) -> Option<&Snapshot> { .cmp(&b.timestamp_ms) .then_with(|| a.snapshot_id.cmp(&b.snapshot_id)) }) - { - return meta - .snapshots - .iter() - .find(|s| s.snapshot_id() == log_entry.snapshot_id); - } + .and_then(|log_entry| { + meta.snapshots + .iter() + .find(|s| s.snapshot_id() == log_entry.snapshot_id) + .map(|snapshot| (log_entry.timestamp_ms, snapshot.snapshot_id(), snapshot)) + }); - meta.snapshots - .iter() - .filter(|s| s.timestamp_ms() <= ts_ms) - .max_by(|a, b| { - a.timestamp_ms() - .cmp(&b.timestamp_ms()) - .then_with(|| a.snapshot_id().cmp(&b.snapshot_id())) - }) + from_log.map(|(_, _, snapshot)| snapshot).or_else(|| { + meta.snapshots + .iter() + .filter(|s| s.timestamp_ms() <= ts_ms) + .max_by(|a, b| { + a.timestamp_ms() + .cmp(&b.timestamp_ms()) + .then_with(|| a.snapshot_id().cmp(&b.snapshot_id())) + }) + }) } diff --git a/crates/sail-iceberg/src/table_format.rs b/crates/sail-iceberg/src/table_format.rs index f8f2f07ea2..9e1fca8393 100644 --- a/crates/sail-iceberg/src/table_format.rs +++ b/crates/sail-iceberg/src/table_format.rs @@ -17,6 +17,7 @@ use datafusion::catalog::Session; use datafusion::common::{not_impl_err, plan_err, DataFusionError, Result}; use datafusion::datasource::TableProvider; use datafusion::physical_plan::ExecutionPlan; +use sail_common_datafusion::catalog::CatalogPartitionField; use 
sail_common_datafusion::datasource::{ PhysicalSinkMode, SinkInfo, SourceInfo, TableFormat, TableFormatRegistry, }; @@ -29,6 +30,9 @@ use crate::options::TableIcebergOptions; use crate::physical_plan::plan_builder::{IcebergPlanBuilder, IcebergTableConfig}; use crate::spec::{PartitionSpec, Schema, Snapshot}; use crate::table::{find_latest_metadata_file, Table}; +use crate::utils::partition_transform::{ + catalog_partition_field_from_iceberg, format_partition_exprs, +}; /// Iceberg implementation of [`TableFormat`]. #[derive(Debug, Default)] @@ -74,13 +78,14 @@ impl TableFormat for IcebergTableFormat { ) -> Result> { use datafusion::physical_plan::empty::EmptyExec; + let path = info.path(); let SinkInfo { input, - path, mode, partition_by, bucket_by, sort_order, + table_properties: _, options, } = info; @@ -116,7 +121,7 @@ impl TableFormat for IcebergTableFormat { _ => {} } - // Get existing partition columns if table exists + // Get existing partition spec (encoded as partition expressions) if table exists let existing_partition_columns = if table_exists { let table = Table::load(ctx, table_url.clone()).await?; Some(Self::partition_columns_from_metadata(&table)?) @@ -133,8 +138,8 @@ impl TableFormat for IcebergTableFormat { return plan_err!( "Partition column mismatch. Table is partitioned by {:?}, but write specified {:?}. \ Cannot change partitioning on append.", - existing_partitions, - partition_by + format_partition_exprs(existing_partitions), + format_partition_exprs(&partition_by) ); } PhysicalSinkMode::Overwrite => { @@ -143,8 +148,8 @@ impl TableFormat for IcebergTableFormat { return plan_err!( "Partition column mismatch. Table is partitioned by {:?}, but write specified {:?}. 
\ Set overwriteSchema=true to change partitioning.", - existing_partitions, - partition_by + format_partition_exprs(existing_partitions), + format_partition_exprs(&partition_by) ); } } @@ -226,7 +231,7 @@ impl IcebergTableFormat { Ok(table_url) } - fn partition_columns_from_metadata(table: &Table) -> Result> { + fn partition_columns_from_metadata(table: &Table) -> Result> { let metadata = table.metadata(); let spec = match metadata.default_partition_spec() { Some(spec) => spec, @@ -251,7 +256,10 @@ impl IcebergTableFormat { field.source_id )) })?; - columns.push(col_name); + columns.push( + catalog_partition_field_from_iceberg(col_name, field.transform) + .map_err(DataFusionError::Plan)?, + ); } Ok(columns) diff --git a/crates/sail-iceberg/src/utils/conversions.rs b/crates/sail-iceberg/src/utils/conversions.rs index 304ec2ccbe..2f1f362f23 100644 --- a/crates/sail-iceberg/src/utils/conversions.rs +++ b/crates/sail-iceberg/src/utils/conversions.rs @@ -50,7 +50,8 @@ pub fn to_scalar(literal: &Literal, iceberg_type: &Type) -> Result /// Convert a PrimitiveLiteral with type context to the correct ScalarValue. fn primitive_literal_to_scalar(prim: &PrimitiveLiteral, prim_type: &PrimitiveType) -> ScalarValue { - use {PrimitiveLiteral as PL, ScalarValue as SV}; + use PrimitiveLiteral as PL; + use ScalarValue as SV; match (prim_type, prim) { // Date: Int -> Date32 @@ -86,6 +87,11 @@ fn primitive_literal_to_scalar(prim: &PrimitiveLiteral, prim_type: &PrimitiveTyp (PrimitiveType::Fixed(_), PL::Binary(b)) | (PrimitiveType::Binary, PL::Binary(b)) => { SV::Binary(Some(b.clone())) } + // Iceberg encodes String lower/upper bounds as raw bytes (UTF-8) in file metrics. + // Decode them so pruning predicates comparing against Utf8 literals work. 
+ (PrimitiveType::String, PL::Binary(b)) => { + SV::Utf8(Some(String::from_utf8_lossy(b).into_owned())) + } // Fallback to basic conversion for other combinations _ => primitive_to_scalar_default(prim), } @@ -93,7 +99,8 @@ fn primitive_literal_to_scalar(prim: &PrimitiveLiteral, prim_type: &PrimitiveTyp /// Basic conversion without explicit Iceberg type context (primitive-only). pub fn primitive_to_scalar_default(prim: &PrimitiveLiteral) -> ScalarValue { - use {PrimitiveLiteral as PL, ScalarValue as SV}; + use PrimitiveLiteral as PL; + use ScalarValue as SV; match prim { PL::Boolean(v) => SV::Boolean(Some(*v)), @@ -252,7 +259,8 @@ pub fn scalar_to_iceberg_literal( scalar: &ScalarValue, _arrow_type: &ArrowDataType, ) -> Result { - use {PrimitiveLiteral as PL, ScalarValue as SV}; + use PrimitiveLiteral as PL; + use ScalarValue as SV; match scalar { SV::Boolean(Some(v)) => Ok(Literal::Primitive(PL::Boolean(*v))), diff --git a/crates/sail-iceberg/src/utils/mod.rs b/crates/sail-iceberg/src/utils/mod.rs index a8796be12b..d8fa3e318b 100644 --- a/crates/sail-iceberg/src/utils/mod.rs +++ b/crates/sail-iceberg/src/utils/mod.rs @@ -11,6 +11,7 @@ // limitations under the License. pub mod conversions; +pub mod partition_transform; pub mod snapshot_id; pub mod timestamp; pub mod transform; @@ -49,6 +50,28 @@ pub fn join_table_uri(table_uri: &str, rel: &str, mode: &WritePathMode) -> Strin } } +pub fn url_to_object_path(url: &Url) -> Result { + let is_file = url.scheme() == "file"; + let p = if is_file { + if cfg!(windows) { + // On Windows, decode percent-encoding and normalize drive-letter file URLs. + url.to_file_path() + .map(|path| path.to_string_lossy().into_owned()) + .unwrap_or_else(|_| url.path().to_string()) + } else { + // On Unix, keep raw URL path to avoid decoding partition literals like `%3A`. + url.path().to_string() + } + } else { + url.path().to_string() + }; + // object_store::path::Path requires slash-delimited paths. 
+ let p = p.replace('\\', "/"); + let path_no_leading = p.strip_prefix('/').unwrap_or(&p); + object_store::path::Path::parse(path_no_leading) + .map_err(|e| DataFusionError::External(Box::new(e))) +} + pub fn get_object_store_from_context( context: &Arc, table_url: &Url, diff --git a/crates/sail-iceberg/src/utils/partition_transform.rs b/crates/sail-iceberg/src/utils/partition_transform.rs new file mode 100644 index 0000000000..38c5762506 --- /dev/null +++ b/crates/sail-iceberg/src/utils/partition_transform.rs @@ -0,0 +1,116 @@ +use sail_common_datafusion::catalog::{CatalogPartitionField, PartitionTransform}; + +use crate::spec::transform::Transform; + +pub fn partition_field_name(field: &CatalogPartitionField) -> String { + match field.transform.unwrap_or(PartitionTransform::Identity) { + PartitionTransform::Identity => field.column.clone(), + PartitionTransform::Year => format!("{}_year", field.column), + PartitionTransform::Month => format!("{}_month", field.column), + PartitionTransform::Day => format!("{}_day", field.column), + PartitionTransform::Hour => format!("{}_hour", field.column), + PartitionTransform::Bucket(_) => format!("{}_bucket", field.column), + PartitionTransform::Truncate(_) => format!("{}_trunc", field.column), + } +} + +pub fn format_partition_expr(field: &CatalogPartitionField) -> String { + match field.transform.unwrap_or(PartitionTransform::Identity) { + PartitionTransform::Identity => field.column.clone(), + PartitionTransform::Year => format!("years({})", field.column), + PartitionTransform::Month => format!("months({})", field.column), + PartitionTransform::Day => format!("days({})", field.column), + PartitionTransform::Hour => format!("hours({})", field.column), + PartitionTransform::Bucket(n) => format!("bucket({n}, {})", field.column), + PartitionTransform::Truncate(w) => format!("truncate({w}, {})", field.column), + } +} + +pub fn format_partition_exprs(fields: &[CatalogPartitionField]) -> Vec { + 
fields.iter().map(format_partition_expr).collect() +} + +pub fn iceberg_transform_from_partition_field(field: &CatalogPartitionField) -> Transform { + match field.transform.unwrap_or(PartitionTransform::Identity) { + PartitionTransform::Identity => Transform::Identity, + PartitionTransform::Year => Transform::Year, + PartitionTransform::Month => Transform::Month, + PartitionTransform::Day => Transform::Day, + PartitionTransform::Hour => Transform::Hour, + PartitionTransform::Bucket(n) => Transform::Bucket(n), + PartitionTransform::Truncate(w) => Transform::Truncate(w), + } +} + +pub fn catalog_partition_field_from_iceberg( + source_column: String, + transform: Transform, +) -> Result { + let transform = match transform { + Transform::Identity => None, + Transform::Year => Some(PartitionTransform::Year), + Transform::Month => Some(PartitionTransform::Month), + Transform::Day => Some(PartitionTransform::Day), + Transform::Hour => Some(PartitionTransform::Hour), + Transform::Bucket(n) => Some(PartitionTransform::Bucket(n)), + Transform::Truncate(w) => Some(PartitionTransform::Truncate(w)), + other => { + return Err(format!( + "unsupported Iceberg partition transform '{other}' for column '{source_column}'" + )) + } + }; + Ok(CatalogPartitionField { + column: source_column, + transform, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn format_identity_column() { + let field = CatalogPartitionField { + column: "event_date".to_string(), + transform: None, + }; + assert_eq!(partition_field_name(&field), "event_date"); + assert_eq!(format_partition_expr(&field), "event_date"); + } + + #[test] + fn format_years() { + let field = CatalogPartitionField { + column: "event_date".to_string(), + transform: Some(PartitionTransform::Year), + }; + assert_eq!(partition_field_name(&field), "event_date_year"); + assert_eq!(format_partition_expr(&field), "years(event_date)"); + } + + #[test] + fn format_truncate() { + let field = CatalogPartitionField { + column: 
"user_id".to_string(), + transform: Some(PartitionTransform::Truncate(8)), + }; + assert_eq!(partition_field_name(&field), "user_id_trunc"); + assert_eq!(format_partition_expr(&field), "truncate(8, user_id)"); + } + + #[test] + fn convert_from_iceberg_transform() -> Result<(), String> { + let field = + catalog_partition_field_from_iceberg("event_date".to_string(), Transform::Year)?; + assert_eq!( + field, + CatalogPartitionField { + column: "event_date".to_string(), + transform: Some(PartitionTransform::Year), + } + ); + Ok(()) + } +} diff --git a/crates/sail-logical-plan/src/precondition.rs b/crates/sail-logical-plan/src/barrier.rs similarity index 80% rename from crates/sail-logical-plan/src/precondition.rs rename to crates/sail-logical-plan/src/barrier.rs index d73e67837d..3a1f9bbd3e 100644 --- a/crates/sail-logical-plan/src/precondition.rs +++ b/crates/sail-logical-plan/src/barrier.rs @@ -5,19 +5,18 @@ use datafusion_common::{plan_err, DFSchemaRef}; use datafusion_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; use sail_common_datafusion::utils::items::ItemTaker; -/// A logical plan node that represents a plan with "logical preconditions". -/// The preconditions are logical plans that will be executed before physical planning -/// of the main plan. +/// A logical plan node that represents a plan with preconditions. +/// The precondition plans must be executed before the main plan. /// For example, this is useful for executing catalog operations before physical execution /// of the main plan. Such catalog operations are not supposed to be executed when /// resolving the logical plan since the plan resolver should not have side effects. 
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Hash)] -pub struct WithPreconditionsNode { +pub struct BarrierNode { preconditions: Vec>, plan: Arc, } -impl WithPreconditionsNode { +impl BarrierNode { pub fn new(preconditions: Vec>, plan: Arc) -> Self { Self { preconditions, @@ -34,9 +33,9 @@ impl WithPreconditionsNode { } } -impl UserDefinedLogicalNodeCore for WithPreconditionsNode { +impl UserDefinedLogicalNodeCore for BarrierNode { fn name(&self) -> &str { - "WithPreconditions" + "Barrier" } fn inputs(&self) -> Vec<&LogicalPlan> { @@ -76,8 +75,6 @@ impl UserDefinedLogicalNodeCore for WithPreconditionsNode { } fn necessary_children_exprs(&self, _output_columns: &[usize]) -> Option>> { - // We do not need to precisely implement this method since this node is "executed" - // and removed before logical optimization. None } } diff --git a/crates/sail-logical-plan/src/file_write.rs b/crates/sail-logical-plan/src/file_write.rs index 17374b6243..97c1a8ce4f 100644 --- a/crates/sail-logical-plan/src/file_write.rs +++ b/crates/sail-logical-plan/src/file_write.rs @@ -5,17 +5,18 @@ use datafusion_common::{DFSchema, DFSchemaRef}; use datafusion_expr::expr::Sort; use datafusion_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; use educe::Educe; +use sail_common_datafusion::catalog::CatalogPartitionField; use sail_common_datafusion::datasource::{BucketBy, SinkMode}; use sail_common_datafusion::utils::items::ItemTaker; #[derive(Clone, Debug, Eq, PartialEq, Hash, PartialOrd)] pub struct FileWriteOptions { - pub path: String, pub format: String, pub mode: SinkMode, - pub partition_by: Vec, + pub partition_by: Vec, pub sort_by: Vec, pub bucket_by: Option, + pub table_properties: Vec<(String, String)>, pub options: Vec>, } diff --git a/crates/sail-logical-plan/src/lib.rs b/crates/sail-logical-plan/src/lib.rs index da8e1fd538..9a5a7d52fa 100644 --- a/crates/sail-logical-plan/src/lib.rs +++ b/crates/sail-logical-plan/src/lib.rs @@ -1,9 +1,9 @@ +pub mod barrier; pub mod 
file_delete; pub mod file_write; pub mod map_partitions; pub mod merge; pub mod monotonic_id; -pub mod precondition; pub mod range; pub mod repartition; pub mod schema_pivot; diff --git a/crates/sail-logical-plan/src/merge.rs b/crates/sail-logical-plan/src/merge.rs index d1d9482da8..07fb965508 100644 --- a/crates/sail-logical-plan/src/merge.rs +++ b/crates/sail-logical-plan/src/merge.rs @@ -652,6 +652,7 @@ pub fn expand_merge(node: &MergeIntoNode, path_column: &str) -> Result Result object_store::Result { - Err(object_store::Error::NotImplemented) + Err(object_store::Error::NotImplemented { + operation: "unsupported".to_string(), + implementer: "HuggingFaceObjectStore".to_string(), + }) } async fn put_multipart_opts( @@ -253,7 +256,10 @@ impl ObjectStore for HuggingFaceObjectStore { _location: &Path, _opts: PutMultipartOptions, ) -> object_store::Result> { - Err(object_store::Error::NotImplemented) + Err(object_store::Error::NotImplemented { + operation: "unsupported".to_string(), + implementer: "HuggingFaceObjectStore".to_string(), + }) } async fn get_opts( @@ -277,7 +283,10 @@ impl ObjectStore for HuggingFaceObjectStore { || if_unmodified_since.is_some() || version.is_some() { - return Err(object_store::Error::NotImplemented); + return Err(object_store::Error::NotImplemented { + operation: "unsupported".to_string(), + implementer: "HuggingFaceObjectStore".to_string(), + }); } let path = HuggingFacePath::parse(location)?; let repo = self.api.repo(path.repo()); @@ -321,8 +330,16 @@ impl ObjectStore for HuggingFaceObjectStore { } } - async fn delete(&self, _location: &Path) -> object_store::Result<()> { - Err(object_store::Error::NotImplemented) + fn delete_stream( + &self, + _locations: BoxStream<'static, object_store::Result>, + ) -> BoxStream<'static, object_store::Result> { + Box::pin(stream::once(async { + Err(object_store::Error::NotImplemented { + operation: "delete_stream".to_string(), + implementer: "HuggingFaceObjectStore".to_string(), + }) + })) } fn 
list(&self, prefix: Option<&Path>) -> BoxStream<'static, object_store::Result> { @@ -348,14 +365,21 @@ impl ObjectStore for HuggingFaceObjectStore { &self, _prefix: Option<&Path>, ) -> object_store::Result { - Err(object_store::Error::NotImplemented) - } - - async fn copy(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { - Err(object_store::Error::NotImplemented) + Err(object_store::Error::NotImplemented { + operation: "unsupported".to_string(), + implementer: "HuggingFaceObjectStore".to_string(), + }) } - async fn copy_if_not_exists(&self, _from: &Path, _to: &Path) -> object_store::Result<()> { - Err(object_store::Error::NotImplemented) + async fn copy_opts( + &self, + _from: &Path, + _to: &Path, + _options: object_store::CopyOptions, + ) -> object_store::Result<()> { + Err(object_store::Error::NotImplemented { + operation: "copy_opts".to_string(), + implementer: "HuggingFaceObjectStore".to_string(), + }) } } diff --git a/crates/sail-object-store/src/layers/lazy.rs b/crates/sail-object-store/src/layers/lazy.rs index 8cf02eaa8a..0f06fbfc6d 100644 --- a/crates/sail-object-store/src/layers/lazy.rs +++ b/crates/sail-object-store/src/layers/lazy.rs @@ -8,8 +8,8 @@ use futures::stream::BoxStream; use futures::StreamExt; use object_store::path::Path; use object_store::{ - GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, - PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, + CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, + PutMultipartOptions, PutOptions, PutPayload, PutResult, RenameOptions, Result, }; use tokio::sync::OnceCell; use tonic::codegen::Bytes; @@ -91,14 +91,6 @@ where F: Fn() -> Fut + Send + Sync + 'static, Fut: Future> + Send, { - async fn put(&self, location: &Path, payload: PutPayload) -> Result { - self.inner - .get_or_try_init() - .await? 
- .put(location, payload) - .await - } - async fn put_opts( &self, location: &Path, @@ -112,14 +104,6 @@ where .await } - async fn put_multipart(&self, location: &Path) -> Result> { - self.inner - .get_or_try_init() - .await? - .put_multipart(location) - .await - } - async fn put_multipart_opts( &self, location: &Path, @@ -132,10 +116,6 @@ where .await } - async fn get(&self, location: &Path) -> Result { - self.inner.get_or_try_init().await?.get(location).await - } - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { self.inner .get_or_try_init() @@ -144,14 +124,6 @@ where .await } - async fn get_range(&self, location: &Path, range: Range) -> Result { - self.inner - .get_or_try_init() - .await? - .get_range(location, range) - .await - } - async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { self.inner .get_or_try_init() @@ -160,20 +132,13 @@ where .await } - async fn head(&self, location: &Path) -> Result { - self.inner.get_or_try_init().await?.head(location).await - } - - async fn delete(&self, location: &Path) -> Result<()> { - self.inner.get_or_try_init().await?.delete(location).await - } - - fn delete_stream<'a>( - &'a self, - locations: BoxStream<'a, Result>, - ) -> BoxStream<'a, Result> { - futures::stream::once(async { - match self.inner.get_or_try_init().await { + fn delete_stream( + &self, + locations: BoxStream<'static, Result>, + ) -> BoxStream<'static, Result> { + let inner = self.inner.clone(); + futures::stream::once(async move { + match inner.get_or_try_init().await { Ok(inner) => inner.delete_stream(locations), Err(e) => futures::stream::once(async { Err(e) }).boxed(), } @@ -221,27 +186,19 @@ where .await } - async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - self.inner.get_or_try_init().await?.copy(from, to).await - } - - async fn rename(&self, from: &Path, to: &Path) -> Result<()> { - self.inner.get_or_try_init().await?.rename(from, to).await - } - - async fn copy_if_not_exists(&self, from: 
&Path, to: &Path) -> Result<()> { + async fn copy_opts(&self, from: &Path, to: &Path, options: CopyOptions) -> Result<()> { self.inner .get_or_try_init() .await? - .copy_if_not_exists(from, to) + .copy_opts(from, to, options) .await } - async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + async fn rename_opts(&self, from: &Path, to: &Path, options: RenameOptions) -> Result<()> { self.inner .get_or_try_init() .await? - .rename_if_not_exists(from, to) + .rename_opts(from, to, options) .await } } diff --git a/crates/sail-object-store/src/layers/logging.rs b/crates/sail-object-store/src/layers/logging.rs index 58dfaf5c4d..8a0d6a603f 100644 --- a/crates/sail-object-store/src/layers/logging.rs +++ b/crates/sail-object-store/src/layers/logging.rs @@ -9,8 +9,8 @@ use futures::FutureExt; use log::debug; use object_store::path::Path; use object_store::{ - GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, - PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, UploadPart, + CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, + PutMultipartOptions, PutOptions, PutPayload, PutResult, RenameOptions, Result, UploadPart, }; use sail_telemetry::common::SpanAttribute; use sail_telemetry::futures::TracingFutureExt; @@ -37,25 +37,6 @@ impl fmt::Display for LoggingObjectStore { #[async_trait::async_trait] #[warn(clippy::missing_trait_methods)] impl ObjectStore for LoggingObjectStore { - async fn put(&self, location: &Path, payload: PutPayload) -> Result { - debug!("put: location: {location:?}"); - - let span = Span::enter_with_local_parent("ObjectStore::put").with_properties(|| { - [ - (SpanAttribute::OBJECT_STORE_INSTANCE, self.inner.to_string()), - (SpanAttribute::OBJECT_STORE_LOCATION, location.to_string()), - ( - SpanAttribute::OBJECT_STORE_SIZE, - payload.content_length().to_string(), - ), - ] - }); - self.inner - .put(location, payload) - .in_span_with_recorder(span, record_error) 
- .await - } - async fn put_opts( &self, location: &Path, @@ -81,23 +62,6 @@ impl ObjectStore for LoggingObjectStore { .await } - async fn put_multipart(&self, location: &Path) -> Result> { - debug!("put_multipart: location: {location:?}"); - - let span = - Span::enter_with_local_parent("ObjectStore::put_multipart").with_properties(|| { - [ - (SpanAttribute::OBJECT_STORE_INSTANCE, self.inner.to_string()), - (SpanAttribute::OBJECT_STORE_LOCATION, location.to_string()), - ] - }); - self.inner - .put_multipart(location) - .in_span_with_recorder(span, record_error) - .await - .map(|upload| Box::new(TracingMultipartUpload::new(upload)) as Box) - } - async fn put_multipart_opts( &self, location: &Path, @@ -120,31 +84,6 @@ impl ObjectStore for LoggingObjectStore { .map(|upload| Box::new(TracingMultipartUpload::new(upload)) as Box) } - async fn get(&self, location: &Path) -> Result { - debug!("get: location: {location:?}"); - - let span = Span::enter_with_local_parent("ObjectStore::get").with_properties(|| { - [ - (SpanAttribute::OBJECT_STORE_INSTANCE, self.inner.to_string()), - (SpanAttribute::OBJECT_STORE_LOCATION, location.to_string()), - ] - }); - self.inner - .get(location) - .in_span_with_recorder(span, |span, output| { - record_error(span, output); - if let Ok(output) = output { - span.add_property(|| { - ( - SpanAttribute::OBJECT_STORE_RANGE, - format!("{:?}", output.range), - ) - }); - } - }) - .await - } - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { debug!("get_opts: location: {location:?} options: {options:?}"); @@ -171,29 +110,6 @@ impl ObjectStore for LoggingObjectStore { .await } - async fn get_range(&self, location: &Path, range: Range) -> Result { - debug!("get_range: location: {location:?} range: {range:?}"); - - let span = Span::enter_with_local_parent("ObjectStore::get_range").with_properties(|| { - [ - (SpanAttribute::OBJECT_STORE_INSTANCE, self.inner.to_string()), - (SpanAttribute::OBJECT_STORE_LOCATION, 
location.to_string()), - (SpanAttribute::OBJECT_STORE_RANGE, format!("{range:?}")), - ] - }); - self.inner - .get_range(location, range) - .in_span_with_recorder(span, |span, output| { - record_error(span, output); - if let Ok(output) = output { - span.add_property(|| { - (SpanAttribute::OBJECT_STORE_SIZE, output.len().to_string()) - }); - } - }) - .await - } - async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { debug!("get_ranges: location: {location:?} ranges: {ranges:?}"); @@ -221,40 +137,10 @@ impl ObjectStore for LoggingObjectStore { .await } - async fn head(&self, location: &Path) -> Result { - debug!("head: location: {location:?}"); - - let span = Span::enter_with_local_parent("ObjectStore::head").with_properties(|| { - [ - (SpanAttribute::OBJECT_STORE_INSTANCE, self.inner.to_string()), - (SpanAttribute::OBJECT_STORE_LOCATION, location.to_string()), - ] - }); - self.inner - .head(location) - .in_span_with_recorder(span, record_error) - .await - } - - async fn delete(&self, location: &Path) -> Result<()> { - debug!("delete: location: {location:?}"); - - let span = Span::enter_with_local_parent("ObjectStore::delete").with_properties(|| { - [ - (SpanAttribute::OBJECT_STORE_INSTANCE, self.inner.to_string()), - (SpanAttribute::OBJECT_STORE_LOCATION, location.to_string()), - ] - }); - self.inner - .delete(location) - .in_span_with_recorder(span, record_error) - .await - } - - fn delete_stream<'a>( - &'a self, - locations: BoxStream<'a, Result>, - ) -> BoxStream<'a, Result> { + fn delete_stream( + &self, + locations: BoxStream<'static, Result>, + ) -> BoxStream<'static, Result> { debug!("delete_stream"); let span = Span::enter_with_local_parent("ObjectStore::delete_stream") @@ -303,10 +189,10 @@ impl ObjectStore for LoggingObjectStore { .await } - async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - debug!("copy: from: {from:?} to: {to:?}"); + async fn copy_opts(&self, from: &Path, to: &Path, options: CopyOptions) -> Result<()> { + 
debug!("copy_opts: from: {from:?} to: {to:?} options: {options:?}"); - let span = Span::enter_with_local_parent("ObjectStore::copy").with_properties(|| { + let span = Span::enter_with_local_parent("ObjectStore::copy_opts").with_properties(|| { [ (SpanAttribute::OBJECT_STORE_INSTANCE, self.inner.to_string()), (SpanAttribute::OBJECT_STORE_FROM, from.to_string()), @@ -314,49 +200,16 @@ impl ObjectStore for LoggingObjectStore { ] }); self.inner - .copy(from, to) + .copy_opts(from, to, options) .in_span_with_recorder(span, record_error) .await } - async fn rename(&self, from: &Path, to: &Path) -> Result<()> { - debug!("rename: from: {from:?} to: {to:?}"); + async fn rename_opts(&self, from: &Path, to: &Path, options: RenameOptions) -> Result<()> { + debug!("rename_opts: from: {from:?} to: {to:?} options: {options:?}"); - let span = Span::enter_with_local_parent("ObjectStore::rename").with_properties(|| { - [ - (SpanAttribute::OBJECT_STORE_INSTANCE, self.inner.to_string()), - (SpanAttribute::OBJECT_STORE_FROM, from.to_string()), - (SpanAttribute::OBJECT_STORE_TO, to.to_string()), - ] - }); - self.inner - .rename(from, to) - .in_span_with_recorder(span, record_error) - .await - } - - async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - debug!("copy_if_not_exists: from: {from:?} to: {to:?}"); - - let span = Span::enter_with_local_parent("ObjectStore::copy_if_not_exists") - .with_properties(|| { - [ - (SpanAttribute::OBJECT_STORE_INSTANCE, self.inner.to_string()), - (SpanAttribute::OBJECT_STORE_FROM, from.to_string()), - (SpanAttribute::OBJECT_STORE_TO, to.to_string()), - ] - }); - self.inner - .copy_if_not_exists(from, to) - .in_span_with_recorder(span, record_error) - .await - } - - async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { - debug!("rename_if_not_exists: from: {from:?} to: {to:?}"); - - let span = Span::enter_with_local_parent("ObjectStore::rename_if_not_exists") - .with_properties(|| { + let span = + 
Span::enter_with_local_parent("ObjectStore::rename_opts").with_properties(|| { [ (SpanAttribute::OBJECT_STORE_INSTANCE, self.inner.to_string()), (SpanAttribute::OBJECT_STORE_FROM, from.to_string()), @@ -364,7 +217,7 @@ impl ObjectStore for LoggingObjectStore { ] }); self.inner - .rename_if_not_exists(from, to) + .rename_opts(from, to, options) .in_span_with_recorder(span, record_error) .await } diff --git a/crates/sail-object-store/src/layers/runtime.rs b/crates/sail-object-store/src/layers/runtime.rs index 7915a6ed9c..5a5c961125 100644 --- a/crates/sail-object-store/src/layers/runtime.rs +++ b/crates/sail-object-store/src/layers/runtime.rs @@ -8,8 +8,9 @@ use futures::stream::BoxStream; use futures::{Stream, StreamExt}; use object_store::path::Path; use object_store::{ - GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, ObjectStore, - PutMultipartOptions, PutOptions, PutPayload, PutResult, Result, UploadPart, + CopyOptions, GetOptions, GetResult, GetResultPayload, ListResult, MultipartUpload, ObjectMeta, + ObjectStore, PutMultipartOptions, PutOptions, PutPayload, PutResult, RenameOptions, Result, + UploadPart, }; use tokio::runtime::Handle; use tokio::sync::{mpsc, Mutex}; @@ -78,14 +79,6 @@ impl fmt::Display for RuntimeAwareObjectStore { #[async_trait::async_trait] #[warn(clippy::missing_trait_methods)] impl ObjectStore for RuntimeAwareObjectStore { - async fn put(&self, location: &Path, payload: PutPayload) -> Result { - let inner = self.inner.clone(); - let location = location.clone(); - self.handle - .spawn(async move { inner.put(&location, payload).await }) - .await? - } - async fn put_opts( &self, location: &Path, @@ -99,16 +92,6 @@ impl ObjectStore for RuntimeAwareObjectStore { .await? 
} - async fn put_multipart(&self, location: &Path) -> Result> { - let inner = self.inner.clone(); - let location = location.clone(); - let multipart = self - .handle - .spawn(async move { inner.put_multipart(&location).await }) - .await??; - Ok(self.wrap_multipart_upload(multipart)) - } - async fn put_multipart_opts( &self, location: &Path, @@ -123,16 +106,6 @@ impl ObjectStore for RuntimeAwareObjectStore { Ok(self.wrap_multipart_upload(multipart)) } - async fn get(&self, location: &Path) -> Result { - let inner = self.inner.clone(); - let location = location.clone(); - let result = self - .handle - .spawn(async move { inner.get(&location).await }) - .await??; - Ok(self.wrap_get_result(result)) - } - async fn get_opts(&self, location: &Path, options: GetOptions) -> Result { let inner = self.inner.clone(); let location = location.clone(); @@ -143,14 +116,6 @@ impl ObjectStore for RuntimeAwareObjectStore { Ok(self.wrap_get_result(result)) } - async fn get_range(&self, location: &Path, range: Range) -> Result { - let inner = self.inner.clone(); - let location = location.clone(); - self.handle - .spawn(async move { inner.get_range(&location, range).await }) - .await? - } - async fn get_ranges(&self, location: &Path, ranges: &[Range]) -> Result> { let inner = self.inner.clone(); let location = location.clone(); @@ -160,30 +125,17 @@ impl ObjectStore for RuntimeAwareObjectStore { .await? } - async fn head(&self, location: &Path) -> Result { - let inner = self.inner.clone(); - let location = location.clone(); - self.handle - .spawn(async move { inner.head(&location).await }) - .await? - } - - async fn delete(&self, location: &Path) -> Result<()> { - let inner = self.inner.clone(); - let location = location.clone(); - self.handle - .spawn(async move { inner.delete(&location).await }) - .await? 
- } - - fn delete_stream<'a>( - &'a self, - _locations: BoxStream<'a, Result>, - ) -> BoxStream<'a, Result> { + fn delete_stream( + &self, + _locations: BoxStream<'static, Result>, + ) -> BoxStream<'static, Result> { // FIXME: We cannot run `delete_stream` in a runtime-aware manner because - // the input and output streams are expected to have the lifetime `'a`, - // while tasks spawned by the runtime handle must be `'static`. - once(Err(object_store::Error::NotImplemented)).boxed() + // tasks spawned by the runtime handle must be `'static`. + once(Err(object_store::Error::NotImplemented { + operation: "delete_stream".to_string(), + implementer: "RuntimeAwareObjectStore".to_string(), + })) + .boxed() } fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, Result> { @@ -219,39 +171,21 @@ impl ObjectStore for RuntimeAwareObjectStore { .await? } - async fn copy(&self, from: &Path, to: &Path) -> Result<()> { - let inner = self.inner.clone(); - let from = from.clone(); - let to = to.clone(); - self.handle - .spawn(async move { inner.copy(&from, &to).await }) - .await? - } - - async fn rename(&self, from: &Path, to: &Path) -> Result<()> { - let inner = self.inner.clone(); - let from = from.clone(); - let to = to.clone(); - self.handle - .spawn(async move { inner.rename(&from, &to).await }) - .await? - } - - async fn copy_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + async fn copy_opts(&self, from: &Path, to: &Path, options: CopyOptions) -> Result<()> { let inner = self.inner.clone(); let from = from.clone(); let to = to.clone(); self.handle - .spawn(async move { inner.copy_if_not_exists(&from, &to).await }) + .spawn(async move { inner.copy_opts(&from, &to, options).await }) .await? 
} - async fn rename_if_not_exists(&self, from: &Path, to: &Path) -> Result<()> { + async fn rename_opts(&self, from: &Path, to: &Path, options: RenameOptions) -> Result<()> { let inner = self.inner.clone(); let from = from.clone(); let to = to.clone(); self.handle - .spawn(async move { inner.rename_if_not_exists(&from, &to).await }) + .spawn(async move { inner.rename_opts(&from, &to, options).await }) .await? } } diff --git a/crates/sail-physical-optimizer/src/barrier.rs b/crates/sail-physical-optimizer/src/barrier.rs new file mode 100644 index 0000000000..446ce924dd --- /dev/null +++ b/crates/sail-physical-optimizer/src/barrier.rs @@ -0,0 +1,94 @@ +use std::fmt::{Debug, Formatter}; +use std::sync::Arc; + +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::config::ConfigOptions; +use datafusion::error::Result; +use datafusion::physical_optimizer::PhysicalOptimizerRule; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; +use datafusion::physical_plan::repartition::RepartitionExec; +use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; +use datafusion_physical_expr::Partitioning; +use sail_physical_plan::barrier::BarrierExec; + +/// A physical optimizer rule that wraps all precondition children of a [`BarrierExec`] +/// with `RepartitionExec` (round-robin) or `CoalescePartitionsExec` to match the partition +/// count of the actual plan. +/// +/// By wrapping preconditions this way, the actual plan will not start until all partitions of +/// the preconditions are completed, even if we only call `execute()` for one partition. +/// Such wrapping can be skipped if the precondition and the actual plan both have only one +/// partition, since a single precondition partition is sufficient to block the actual plan. 
+pub struct EnforceBarrierPartitioning {} + +impl EnforceBarrierPartitioning { + pub fn new() -> Self { + Self {} + } +} + +impl Default for EnforceBarrierPartitioning { + fn default() -> Self { + Self::new() + } +} + +impl PhysicalOptimizerRule for EnforceBarrierPartitioning { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + let result = plan.transform_up(|node: Arc| { + let Some(barrier) = node.as_any().downcast_ref::() else { + return Ok(Transformed::no(node)); + }; + + let plan = barrier.plan(); + let target_partitions = plan.output_partitioning().partition_count(); + + let preconditions: Vec> = barrier + .preconditions() + .iter() + .map(|precondition| { + let precondition_partitions = + precondition.output_partitioning().partition_count(); + // Skip wrapping if both the precondition and the actual plan have only one + // partition, since a single precondition partition is sufficient. + if precondition_partitions == 1 && target_partitions == 1 { + return Ok(precondition.clone()); + } + if target_partitions == 1 { + // Coalesce to a single partition. + Ok(Arc::new(CoalescePartitionsExec::new(precondition.clone())) + as Arc) + } else { + // Fan out to the target partition count using round-robin. + Ok(Arc::new(RepartitionExec::try_new( + precondition.clone(), + Partitioning::RoundRobinBatch(target_partitions), + )?) 
as Arc) + } + }) + .collect::>()?; + + let barrier = BarrierExec::new(preconditions, plan.clone()); + Ok(Transformed::yes(Arc::new(barrier) as Arc)) + })?; + Ok(result.data) + } + + fn name(&self) -> &str { + "EnforceBarrierPartitioning" + } + + fn schema_check(&self) -> bool { + true + } +} + +impl Debug for EnforceBarrierPartitioning { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.name()) + } +} diff --git a/crates/sail-physical-optimizer/src/collect_left.rs b/crates/sail-physical-optimizer/src/collect_left.rs new file mode 100644 index 0000000000..0e5ef593da --- /dev/null +++ b/crates/sail-physical-optimizer/src/collect_left.rs @@ -0,0 +1,211 @@ +use std::sync::Arc; + +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::error::Result; +use datafusion::physical_optimizer::PhysicalOptimizerRule; +use datafusion::physical_plan::coalesce_partitions::CoalescePartitionsExec; +use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode}; +use datafusion::physical_plan::{ + with_new_children_if_necessary, ExecutionPlan, ExecutionPlanProperties, +}; + +/// Safety-net rule that ensures the build side (left child) of every +/// `HashJoinExec` in `CollectLeft` mode has exactly one output partition. +/// +/// DataFusion's `EnforceDistribution` rule normally takes care of this, +/// but after join reordering or other plan transformations the invariant +/// can be violated. This rule wraps the left child in a +/// `CoalescePartitionsExec` when needed, and runs late in the optimizer +/// pipeline, before `EnforceBarrierPartitioning` and `SanityCheckPlan`. 
+#[derive(Debug, Default)] +pub struct RewriteCollectLeftHashJoin; + +impl RewriteCollectLeftHashJoin { + pub fn new() -> Self { + Self + } +} + +impl PhysicalOptimizerRule for RewriteCollectLeftHashJoin { + fn optimize( + &self, + plan: Arc, + _config: &datafusion::config::ConfigOptions, + ) -> Result> { + let result = plan.transform_up(|node| { + let Some(join) = node.as_any().downcast_ref::() else { + return Ok(Transformed::no(node)); + }; + + if join.mode != PartitionMode::CollectLeft { + return Ok(Transformed::no(node)); + } + + let left = join.left.clone(); + if left.output_partitioning().partition_count() == 1 { + return Ok(Transformed::no(node)); + } + + // Wrap in CoalescePartitionsExec to merge into a single partition. + let coalesced: Arc = Arc::new(CoalescePartitionsExec::new(left)); + let new_children = vec![coalesced, join.right.clone()]; + let new_node = with_new_children_if_necessary(node, new_children)?; + Ok(Transformed::yes(new_node)) + })?; + Ok(result.data) + } + + fn name(&self) -> &str { + "RewriteCollectLeftHashJoin" + } + + fn schema_check(&self) -> bool { + true + } +} + +#[cfg(test)] +#[expect(clippy::unwrap_used)] +mod tests { + use std::sync::Arc; + + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::common::{JoinType, NullEquality}; + use datafusion::config::ConfigOptions; + use datafusion::physical_expr::expressions::Column; + use datafusion::physical_plan::empty::EmptyExec; + use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode}; + use datafusion::physical_plan::union::UnionExec; + use datafusion::physical_plan::{ExecutionPlan, ExecutionPlanProperties}; + + use super::*; + + fn schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("val", DataType::Utf8, true), + ])) + } + + /// Build a plan with multiple partitions by unioning two EmptyExec. 
+ fn multi_partition_plan(s: &Arc) -> Arc { + let a = Arc::new(EmptyExec::new(s.clone())) as Arc; + let b = Arc::new(EmptyExec::new(s.clone())) as Arc; + UnionExec::try_new(vec![a, b]).unwrap() + } + + #[test] + fn test_adds_coalesce_when_left_has_multiple_partitions() { + let s = schema(); + let left = multi_partition_plan(&s); + let right = Arc::new(EmptyExec::new(s.clone())) as Arc; + + assert!(left.output_partitioning().partition_count() > 1); + + let join = Arc::new( + HashJoinExec::try_new( + left, + right, + vec![( + Arc::new(Column::new("id", 0)), + Arc::new(Column::new("id", 0)), + )], + None, + &JoinType::Inner, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNothing, + false, // null_aware + ) + .unwrap(), + ) as Arc; + + let rule = RewriteCollectLeftHashJoin::new(); + let config = ConfigOptions::default(); + let result = rule.optimize(join, &config).unwrap(); + + let new_join = result.as_any().downcast_ref::().unwrap(); + assert_eq!( + new_join.children()[0] + .output_partitioning() + .partition_count(), + 1 + ); + assert!(new_join.children()[0] + .as_any() + .downcast_ref::() + .is_some()); + } + + #[test] + fn test_no_change_when_left_has_single_partition() { + let s = schema(); + let left = Arc::new(EmptyExec::new(s.clone())) as Arc; + let right = Arc::new(EmptyExec::new(s.clone())) as Arc; + + assert_eq!(left.output_partitioning().partition_count(), 1); + + let join = Arc::new( + HashJoinExec::try_new( + left, + right, + vec![( + Arc::new(Column::new("id", 0)), + Arc::new(Column::new("id", 0)), + )], + None, + &JoinType::Inner, + None, + PartitionMode::CollectLeft, + NullEquality::NullEqualsNothing, + false, // null_aware + ) + .unwrap(), + ) as Arc; + + let rule = RewriteCollectLeftHashJoin::new(); + let config = ConfigOptions::default(); + let result = rule.optimize(join, &config).unwrap(); + + let new_join = result.as_any().downcast_ref::().unwrap(); + assert!(new_join.children()[0] + .as_any() + .downcast_ref::() + .is_none()); + 
} + + #[test] + fn test_no_change_for_partitioned_mode() { + let s = schema(); + let left = multi_partition_plan(&s); + let right = multi_partition_plan(&s); + + let join = Arc::new( + HashJoinExec::try_new( + left, + right, + vec![( + Arc::new(Column::new("id", 0)), + Arc::new(Column::new("id", 0)), + )], + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, // null_aware + ) + .unwrap(), + ) as Arc; + + let rule = RewriteCollectLeftHashJoin::new(); + let config = ConfigOptions::default(); + let result = rule.optimize(join, &config).unwrap(); + + let new_join = result.as_any().downcast_ref::().unwrap(); + assert!(new_join.children()[0] + .as_any() + .downcast_ref::() + .is_none()); + } +} diff --git a/crates/sail-physical-optimizer/src/join_reorder/builder.rs b/crates/sail-physical-optimizer/src/join_reorder/builder.rs index ed267f7f78..2b67ee43db 100644 --- a/crates/sail-physical-optimizer/src/join_reorder/builder.rs +++ b/crates/sail-physical-optimizer/src/join_reorder/builder.rs @@ -1,22 +1,32 @@ use std::collections::HashMap; use std::sync::Arc; +use datafusion::common::stats::Precision; use datafusion::common::tree_node::{Transformed, TreeNode}; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_expr::{JoinType, Operator}; use datafusion::physical_expr::expressions::{BinaryExpr, Column}; +use datafusion::physical_expr::utils::collect_columns; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::aggregates::AggregateExec; +use datafusion::physical_plan::filter::FilterExec; use datafusion::physical_plan::joins::HashJoinExec; use datafusion::physical_plan::projection::ProjectionExec; use datafusion::physical_plan::ExecutionPlan; +use datafusion_physical_expr::intervals::utils::check_support; +use datafusion_physical_expr::{analyze, AnalysisContext}; use log::trace; use crate::join_reorder::graph::{JoinEdge, QueryGraph, RelationNode, StableColumn}; use 
crate::join_reorder::join_set::JoinSet; -/// Type alias for join condition pairs to reduce complexity -type JoinConditionPairs = [(Arc, Arc)]; +type PhysicalExprRef = Arc; +type EquiPair = (StableColumn, StableColumn); +type PhysicalExprWithEquiPairs = (PhysicalExprRef, Vec); +type GroupedPredicates = HashMap>; + +/// Hard limit on the number of base relations in a single reorderable graph. +const MAX_RELATIONS: usize = 12; /// Maps an output column from an ExecutionPlan back to a stable identifier. /// The vector is indexed by the column index in the plan's output schema. @@ -109,7 +119,25 @@ impl GraphBuilder { if let Some(proj_plan) = any_plan.downcast_ref::() { trace!("Visiting projection: {}", proj_plan.name()); - return self.visit_projection(proj_plan); + // Only "see through" projections that are a pure column passthrough + // (possibly pruning/reordering/renaming/duplicating columns). + // + // Projections that introduce computed expressions are treated as a boundary leaf: + // seeing through them can lose the expression semantics and/or create join keys + // we can't reconstruct from stable base columns. + if self.is_trivial_projection(proj_plan) { + //TODO: consider apply this limit in different levels of the plan. 
+ if self.graph.relation_count() >= MAX_RELATIONS { + trace!( + "GraphBuilder: relation_count={} >= MAX_RELATIONS={}, treating ProjectionExec as boundary leaf", + self.graph.relation_count(), + MAX_RELATIONS + ); + return self.visit_boundary_or_leaf(plan); + } + return self.visit_projection(proj_plan); + } + return self.visit_boundary_or_leaf(plan); } // If it's not a reorderable join or a projection we can see through, @@ -142,55 +170,83 @@ impl GraphBuilder { let left_map = self.visit_plan(join_plan.left().clone())?; let right_map = self.visit_plan(join_plan.right().clone())?; - // Parse Join conditions, create JoinEdge - let mut all_relations_in_condition = JoinSet::default(); - let mut equi_pairs = Vec::new(); + // Build join predicates and add edges. + // + // Note: avoid creating a single hyperedge that unions unrelated binary predicates + // (e.g. a join node with ON conditions that touch different base table pairs). Instead, + // split AND-conjuncts by their base-relation dependencies and create one edge per group. + + // Collect conjunct predicates originating from join `on` conditions. + // These are equi-join predicates by construction (HashJoin keys), and we require that + // each side resolves to a single base StableColumn so we can reconstruct HashJoinExec. + let mut conjuncts: Vec = Vec::new(); for (left_on, right_on) in join_plan.on() { - // Parse left and right expressions, find their corresponding stable IDs - let left_stable_ids = self.resolve_expr_to_relations(left_on, &left_map)?; - let right_stable_ids = self.resolve_expr_to_relations(right_on, &right_map)?; - - // Merge relations involved in all_relations_in_condition - for rel_id in left_stable_ids.iter().chain(right_stable_ids.iter()) { - all_relations_in_condition = - all_relations_in_condition.union(&JoinSet::new_singleton(*rel_id)?); - } + let left_stable_col = self + .resolve_to_single_stable_col(left_on, &left_map)? 
+ .ok_or_else(|| { + DataFusionError::Internal( + "JoinReorder: join key is not a simple column reference".to_string(), + ) + })?; + let right_stable_col = self + .resolve_to_single_stable_col(right_on, &right_map)? + .ok_or_else(|| { + DataFusionError::Internal( + "JoinReorder: join key is not a simple column reference".to_string(), + ) + })?; - // Try to resolve expressions to single stable columns for equi-join pairs - if let (Some(left_stable_col), Some(right_stable_col)) = ( - self.resolve_to_single_stable_col(left_on, &left_map)?, - self.resolve_to_single_stable_col(right_on, &right_map)?, - ) { - equi_pairs.push((left_stable_col, right_stable_col)); - } - } + // Represent the equality in stable-name form so downstream dependency analysis and + // join-filter rewriting can work without relying on transient schema names. + let l: PhysicalExprRef = Arc::new(Column::new(&left_stable_col.name, 0)); + let r: PhysicalExprRef = Arc::new(Column::new(&right_stable_col.name, 0)); + let eq = Arc::new(BinaryExpr::new(l, Operator::Eq, r)) as PhysicalExprRef; - // Create an expression representing the entire ON condition - let mut filter_expr = self.build_conjunction_from_on(join_plan.on())?; + conjuncts.push((eq, vec![(left_stable_col, right_stable_col)])); + } - // Incorporate additional non-equi filters from the original join + // Collect conjunct predicates from join filter (non-equi predicates / residuals). if let Some(join_filter) = join_plan.filter() { - // Rewrite filter expressions to use stable column names (R{relation}.C{index}) - // instead of ephemeral projection names and join-local indices. + // Rewrite join filter to stable column names. 
let extra = self.rewrite_join_filter_to_stable( join_plan, join_filter.expression(), &left_map, &right_map, )?; - // TODO: Separating the equi-join conditions from the non-equi filter expression at the source - filter_expr = Arc::new(BinaryExpr::new(filter_expr, Operator::And, extra)) - as Arc; + + for c in Self::decompose_conjuncts(&extra) { + conjuncts.push((c, vec![])); + } } - let edge = JoinEdge::new( - all_relations_in_condition, - filter_expr, - *join_plan.join_type(), - equi_pairs, - ); - self.graph.add_edge(edge)?; + // Group conjuncts by their base-relation dependency set and add one edge per group. + // + // This avoids generating hyperedges for join nodes where each conjunct is actually + // a binary predicate between two base relations. + let mut grouped: GroupedPredicates = HashMap::new(); + + for (pred, pairs) in conjuncts { + let deps = self.relations_for_expr(&pred, &left_map, &right_map)?; + + // NOTE: If a predicate depends on <2 base relations, keep it associated with the full + // join node so it is not lost (it may be a single-side residual predicate). + let deps = if deps.cardinality() < 2 { + self.all_relations_in_maps(&left_map, &right_map)? 
+ } else { + deps + }; + + grouped.entry(deps).or_default().push((pred, pairs)); + } + + for (join_set, preds) in grouped { + let (filter, equi_pairs) = Self::combine_predicates(preds)?; + let mut edge = JoinEdge::new(join_set, filter, *join_plan.join_type(), equi_pairs); + edge.null_equality = join_plan.null_equality(); + self.graph.add_edge(edge)?; + } // Build and return the output ColumnMap for current Join node // Inner Join output is concatenation of left and right child outputs @@ -214,6 +270,170 @@ impl GraphBuilder { Ok(output_map) } + fn decompose_conjuncts(expr: &Arc) -> Vec> { + let mut result = Vec::new(); + if let Some(binary) = expr.as_any().downcast_ref::() { + match binary.op() { + Operator::And => { + result.extend(Self::decompose_conjuncts(binary.left())); + result.extend(Self::decompose_conjuncts(binary.right())); + } + _ => result.push(Arc::clone(expr)), + } + } else { + result.push(Arc::clone(expr)); + } + result + } + + /// Returns true if this projection is a pure column passthrough: + /// - each output expression is a Column (aliases may differ) + /// + /// We allow pruning/reordering/duplication: join reordering will later recompute the + /// minimal required columns for the chosen join tree, and re-apply projections during + /// reconstruction. + fn is_trivial_projection(&self, proj_plan: &ProjectionExec) -> bool { + let input_len = proj_plan.input().schema().fields().len(); + let exprs = proj_plan.expr(); + if exprs.is_empty() { + return false; + } + for p in exprs.iter() { + let Some(c) = p.expr.as_any().downcast_ref::() else { + return false; + }; + let idx = c.index(); + if idx >= input_len { + return false; + } + } + true + } + + /// Return the set of base relations referenced by `expr`. + /// + /// The expression can refer to: + /// - stable names ("R{rel}.C{col}") emitted by join-filter rewriting, or + /// - Column indices into either `left_map` or `right_map` (for expressions that have not + /// been rewritten yet). 
+ fn relations_for_expr( + &self, + expr: &Arc, + left_map: &ColumnMap, + right_map: &ColumnMap, + ) -> Result { + let cols = collect_columns(expr); + let mut bits: u64 = 0; + + for c in &cols { + // Prefer parsing stable names if present. + if let Some((rel, _cidx)) = StableColumn::parse_stable_name(c.name()) { + bits |= 1u64 << rel; + continue; + } + + // Otherwise, fall back to column index mapping. The same expression can exist in + // either the left or the right join input; if present in both, it's ambiguous. + let l = left_map.get(c.index()); + let r = right_map.get(c.index()); + + match (l, r) { + (Some(entry), None) => self.add_relation_bits_from_entry(entry, &mut bits)?, + (None, Some(entry)) => self.add_relation_bits_from_entry(entry, &mut bits)?, + (Some(_), Some(_)) => { + return Err(DataFusionError::Internal(format!( + "JoinReorder: ambiguous column index {} found in both left and right maps while analyzing predicate dependencies", + c.index() + ))); + } + (None, None) => { + return Err(DataFusionError::Internal(format!( + "JoinReorder: column index {} out of bounds for both left_map (len {}) and right_map (len {})", + c.index(), + left_map.len(), + right_map.len() + ))); + } + } + } + + Ok(JoinSet::from_bits(bits)) + } + + fn add_relation_bits_from_entry(&self, entry: &ColumnMapEntry, bits: &mut u64) -> Result<()> { + match entry { + ColumnMapEntry::Stable { relation_id, .. } => { + *bits |= 1u64 << *relation_id; + Ok(()) + } + ColumnMapEntry::Expression { expr, input_map } => { + // Conservative: if a predicate references a derived column, analyze its base + // dependencies so we can still attach the predicate at the right time. 
+ self.add_relation_bits_from_expr(expr, input_map, bits) + } + } + } + + fn add_relation_bits_from_expr( + &self, + expr: &Arc, + column_map: &ColumnMap, + bits: &mut u64, + ) -> Result<()> { + let cols = collect_columns(expr); + for c in &cols { + if let Some((rel, _)) = StableColumn::parse_stable_name(c.name()) { + *bits |= 1u64 << rel; + continue; + } + + let entry = column_map.get(c.index()).ok_or_else(|| { + DataFusionError::Internal(format!( + "JoinReorder: expression column index {} out of bounds (len {}) while analyzing predicate dependencies", + c.index(), + column_map.len() + )) + })?; + + self.add_relation_bits_from_entry(entry, bits)?; + } + Ok(()) + } + + /// Return all base relations present in either column map. + fn all_relations_in_maps( + &self, + left_map: &ColumnMap, + right_map: &ColumnMap, + ) -> Result { + let mut bits: u64 = 0; + for e in left_map.iter().chain(right_map.iter()) { + self.add_relation_bits_from_entry(e, &mut bits)?; + } + Ok(JoinSet::from_bits(bits)) + } + + fn combine_predicates( + preds: Vec, + ) -> Result<(PhysicalExprRef, Vec)> { + if preds.is_empty() { + return Err(DataFusionError::Internal( + "JoinReorder: cannot combine empty predicate list".to_string(), + )); + } + + let mut filter = preds[0].0.clone(); + let mut equi_pairs: Vec = Vec::new(); + equi_pairs.extend_from_slice(&preds[0].1); + + for (pred, pairs) in preds.into_iter().skip(1) { + filter = Arc::new(BinaryExpr::new(filter, Operator::And, pred)) as PhysicalExprRef; + equi_pairs.extend(pairs); + } + + Ok((filter, equi_pairs)) + } + /// Rewrite a HashJoin's JoinFilter expression so that any Column references are /// converted to stable names based on base relations: "R{relation_id}.C{column_index}". /// This avoids depending on transient projection aliases like "#37" and local indices. 
@@ -258,9 +478,8 @@ impl GraphBuilder { column_index, }) = stable_entry_opt.cloned() { - // Build a stable name like R{relation_id}.C{column_index} - // TODO: Consider implement PhysicalExpr trait for StableColumn. - let stable_name = format!("R{}.C{}", relation_id, column_index); + // Build the canonical stable name used by dependency analysis. + let stable_name = StableColumn::format_stable_name(relation_id, column_index); // TODO: Reconstructor will retarget indices to its compact schema let new_col = Column::new(&stable_name, 0); return Ok(Transformed::yes(Arc::new(new_col) as Arc)); @@ -303,22 +522,83 @@ impl GraphBuilder { } fn create_relation_node(&mut self, plan: Arc) -> Result { + if self.graph.relation_count() >= MAX_RELATIONS { + return Err(DataFusionError::Internal(format!( + "JoinReorder: relation_count {} reached MAX_RELATIONS {}", + self.graph.relation_count(), + MAX_RELATIONS + ))); + } + // Assign new relation_id let relation_id = self.relation_counter; self.relation_counter += 1; - // Estimate initial cardinality - let stats = plan.partition_statistics(None)?; + // Estimate initial cardinality and choose statistics for downstream estimation. + // + // Special-case FilterExec: treat it as a boundary leaf (same as before), but pull + // statistics from its *input* (pre-filter) so we retain the most original/accurate + // datasource stats (e.g., Parquet), and apply the filter's selectivity as a penalty + // factor to initial cardinality. + let (stats, initial_cardinality, base_cardinality) = if plan.as_any().is::() { + // NOTE: We still keep FilterExec as the boundary leaf (filter-boundary strategy), but + // we must avoid + // an inconsistent stats state where num_rows is "post-filter" while distinct_count + // (and thus TDom) remains "pre-filter". That mismatch can cause greedy join ordering + // to prefer catastrophic NLJs. 
+ // + // We therefore: + // - read base (pre-filter) datasource statistics from beneath the filter chain, and + // - estimate a selectivity factor for the filter predicate(s), and + // - apply that selectivity to num_rows / total_byte_size (inexact) while preserving + // base column stats as a best-effort proxy for join planning. + let (pre_filter_plan, selectivity) = + self.peel_filter_chain_and_estimate_selectivity(plan.clone())?; + let pre_stats = pre_filter_plan.partition_statistics(None)?; + let base = match pre_stats.num_rows { + Precision::Exact(count) => count as f64, + Precision::Inexact(count) => count as f64, + Precision::Absent => 1000.0, + }; - // FIXME: Initial cardinality estimation does not account for table-level filters. - let initial_cardinality = match stats.num_rows { - datafusion::common::stats::Precision::Exact(count) => count as f64, - datafusion::common::stats::Precision::Inexact(count) => count as f64, - datafusion::common::stats::Precision::Absent => 1000.0, // Default estimation + let mut adjusted = pre_stats.to_inexact(); + adjusted.num_rows = adjusted.num_rows.with_estimated_selectivity(selectivity); + adjusted.total_byte_size = adjusted + .total_byte_size + .with_estimated_selectivity(selectivity); + + (adjusted, base * selectivity, base) + } else if plan.as_any().is::() { + // Preserve ProjectionExec as a relation leaf, but prefer its input statistics + // (ProjectionExec may not have accurate stats of its own). 
+ let mut cur = plan.clone(); + while let Some(p) = cur.as_any().downcast_ref::() { + cur = p.input().clone(); + } + let stats = cur.partition_statistics(None)?; + let initial_cardinality = match stats.num_rows { + Precision::Exact(count) => count as f64, + Precision::Inexact(count) => count as f64, + Precision::Absent => 1000.0, // Default estimation + }; + (stats, initial_cardinality, initial_cardinality) + } else { + let stats = plan.partition_statistics(None)?; + let initial_cardinality = match stats.num_rows { + Precision::Exact(count) => count as f64, + Precision::Inexact(count) => count as f64, + Precision::Absent => 1000.0, // Default estimation + }; + (stats, initial_cardinality, initial_cardinality) }; - let relation_node = - RelationNode::new(plan.clone(), relation_id, initial_cardinality, stats); + let relation_node = RelationNode::new( + plan.clone(), + relation_id, + initial_cardinality, + base_cardinality, + stats, + ); self.graph.add_relation(relation_node); // Create stable IDs for all output columns of this new relation and build ColumnMap @@ -338,47 +618,221 @@ impl GraphBuilder { Ok(output_map) } - /// Helper function to resolve an expression to the set of relation IDs it references. - /// Traverses the expression tree to find all underlying Stable columns. - fn resolve_expr_to_relations( + /// If `plan` is a FilterExec (possibly a chain of stacked FilterExecs), returns: + /// - the first non-Filter child plan (the "pre-filter" plan), and + /// - a multiplicative selectivity factor estimated from filter predicates. + /// + /// This is used to keep base datasource statistics (e.g., Parquet) while still + /// penalizing cardinality for table/subplan filters. 
+ fn peel_filter_chain_and_estimate_selectivity( &self, - expr: &Arc, - column_map: &ColumnMap, - ) -> Result> { - let mut relation_ids = Vec::new(); + plan: Arc, + ) -> Result<(Arc, f64)> { + let mut cur = plan; + let mut selectivity: f64 = 1.0; + + while let Some(filter) = cur.as_any().downcast_ref::() { + let input = filter.input().clone(); + let input_stats = input.partition_statistics(None)?; + let input_schema = input.schema(); + + let sel = self.estimate_filter_selectivity( + filter.predicate(), + &input_schema, + &input_stats, + filter.default_selectivity(), + ); + selectivity *= sel; + cur = input; + } - if let Some(col) = expr.as_any().downcast_ref::() { - // This is a direct column reference - if let Some(entry) = column_map.get(col.index()) { - match entry { - ColumnMapEntry::Stable { relation_id, .. } => { - relation_ids.push(*relation_id); - } - ColumnMapEntry::Expression { .. } => { - // FIXME: Support join keys / predicates that reference derived columns by - // recursively walking the expression's input_map and collecting all base - // relation_ids it depends on (similar to predicate dependency analysis). - - // This column comes from a complex expression (e.g., aggregate output) - // We cannot resolve it to a *specific base relation* for join condition purposes. - // If a join condition relies on a column that is an aggregate output, - // that join condition cannot be directly mapped to base relations. - return Err(DataFusionError::Internal( - "Join condition uses a column derived from an expression (e.g., aggregate), cannot map to stable join columns.".to_string(), - )); - } - } + Ok((cur, selectivity.clamp(0.0, 1.0))) + } + + /// Estimate selectivity for a Filter predicate using DataFusion's interval analysis when + /// possible, falling back to FilterExec's default selectivity otherwise. 
+ fn estimate_filter_selectivity( + &self, + predicate: &Arc, + schema: &datafusion::arrow::datatypes::SchemaRef, + input_stats: &datafusion::common::Statistics, + default_selectivity: u8, + ) -> f64 { + // Default: `FilterExec.default_selectivity` is expressed as percent [0, 100]. + // + // In practice this can be configured to 100 (no reduction), which is too optimistic + // for join ordering when the predicate is selective but cannot be analyzed. + // Cap our fallback to a conservative upper bound. + let configured = (default_selectivity as f64 / 100.0).clamp(0.0, 1.0); + // If configured to 100% ("no filtering"), use a conservative fallback when we can't + // estimate from stats to avoid catastrophic join orderings. + let fallback = if configured >= 0.999 { 0.2 } else { configured }; + + // First try a small set of cheap, deterministic heuristics that work well for common + // predicates (e.g. col = literal, range filters, conjunctions). + if let Some(sel) = self.estimate_selectivity_from_stats(predicate, schema, input_stats) { + return sel.clamp(0.0, 1.0); + } + + if !check_support(predicate, schema) { + return fallback; + } + + // Best effort: analyze predicate to derive a selectivity from column statistics. 
+ let Ok(input_ctx) = + AnalysisContext::try_from_statistics(schema, &input_stats.column_statistics) + else { + return fallback; + }; + + let Ok(ctx) = analyze(predicate, input_ctx, schema) else { + return fallback; + }; + + ctx.selectivity.unwrap_or(fallback).clamp(0.0, 1.0) + } + + fn estimate_selectivity_from_stats( + &self, + expr: &Arc, + schema: &datafusion::arrow::datatypes::SchemaRef, + input_stats: &datafusion::common::Statistics, + ) -> Option { + let bin = expr.as_any().downcast_ref::()?; + match bin.op() { + Operator::And => { + let l = self.estimate_selectivity_from_stats(bin.left(), schema, input_stats)?; + let r = self.estimate_selectivity_from_stats(bin.right(), schema, input_stats)?; + Some((l * r).clamp(0.0, 1.0)) + } + Operator::Or => { + let l = self.estimate_selectivity_from_stats(bin.left(), schema, input_stats)?; + let r = self.estimate_selectivity_from_stats(bin.right(), schema, input_stats)?; + // Independence assumption: P(A ∪ B) = P(A) + P(B) - P(A)P(B). + // This can misestimate correlated predicates (e.g. `x > 10 OR x > 5`), but is + // still a reasonable fallback when we do not have correlation statistics. + Some((l + r - l * r).clamp(0.0, 1.0)) } + Operator::Eq | Operator::Lt | Operator::LtEq | Operator::Gt | Operator::GtEq => { + self.estimate_binary_selectivity_from_stats(bin, schema, input_stats) + } + _ => None, + } + } + + fn estimate_binary_selectivity_from_stats( + &self, + bin: &BinaryExpr, + schema: &datafusion::arrow::datatypes::SchemaRef, + input_stats: &datafusion::common::Statistics, + ) -> Option { + use datafusion::common::ScalarValue; + use datafusion::physical_expr::expressions::Literal; + + // Normalize into (Column, Literal) if possible. 
+ let (col, lit) = if let (Some(c), Some(l)) = ( + bin.left().as_any().downcast_ref::(), + bin.right().as_any().downcast_ref::(), + ) { + (c, l) + } else if let (Some(l), Some(c)) = ( + bin.left().as_any().downcast_ref::(), + bin.right().as_any().downcast_ref::(), + ) { + // Flip operator direction if needed. + // For Eq it's symmetric; for inequalities we can invert. + let flipped_op = match bin.op() { + Operator::Lt => Operator::Gt, + Operator::LtEq => Operator::GtEq, + Operator::Gt => Operator::Lt, + Operator::GtEq => Operator::LtEq, + Operator::Eq => Operator::Eq, + _ => return None, + }; + let tmp = BinaryExpr::new( + Arc::new(Column::new(c.name(), c.index())), + flipped_op, + Arc::new(Literal::new(l.value().clone())), + ); + return self.estimate_binary_selectivity_from_stats(&tmp, schema, input_stats); } else { - // TODO: Implement recursive traversal of expression tree for complex expressions - return Err(DataFusionError::Internal( - "Complex expression resolution not yet implemented".to_string(), - )); + return None; + }; + + let col_idx = schema.index_of(col.name()).ok()?; + let stats = input_stats.column_statistics.get(col_idx)?; + + // Prefer distinct_count for equality predicates. + if bin.op() == &Operator::Eq { + let ndv = match stats.distinct_count { + Precision::Exact(v) => v as f64, + Precision::Inexact(v) => v as f64, + Precision::Absent => 0.0, + }; + if ndv.is_finite() && ndv > 0.0 { + return Some((1.0 / ndv).clamp(0.0, 1.0)); + } } - Ok(relation_ids) + // Best-effort range reasoning from min/max for numeric scalars. + let (min, max) = match (&stats.min_value, &stats.max_value) { + (Precision::Exact(min), Precision::Exact(max)) + | (Precision::Inexact(min), Precision::Inexact(max)) + | (Precision::Exact(min), Precision::Inexact(max)) + | (Precision::Inexact(min), Precision::Exact(max)) => (min, max), + _ => return None, + }; + + // Only implement a small int range model. 
+ let (min, max, v) = match (min, max, lit.value()) { + ( + ScalarValue::Int32(Some(min)), + ScalarValue::Int32(Some(max)), + ScalarValue::Int32(Some(v)), + ) => (*min as f64, *max as f64, *v as f64), + ( + ScalarValue::Int64(Some(min)), + ScalarValue::Int64(Some(max)), + ScalarValue::Int64(Some(v)), + ) => (*min as f64, *max as f64, *v as f64), + ( + ScalarValue::UInt32(Some(min)), + ScalarValue::UInt32(Some(max)), + ScalarValue::UInt32(Some(v)), + ) => (*min as f64, *max as f64, *v as f64), + ( + ScalarValue::UInt64(Some(min)), + ScalarValue::UInt64(Some(max)), + ScalarValue::UInt64(Some(v)), + ) => (*min as f64, *max as f64, *v as f64), + _ => return None, + }; + + if !(min.is_finite() && max.is_finite() && v.is_finite()) || max < min { + return None; + } + + let width = (max - min + 1.0).max(1.0); + let frac = match bin.op() { + Operator::Eq => { + if v < min || v > max { + 0.0 + } else { + 1.0 / width + } + } + Operator::Lt => ((v - min) / width).clamp(0.0, 1.0), + Operator::LtEq => (((v - min) + 1.0) / width).clamp(0.0, 1.0), + Operator::Gt => ((max - v) / width).clamp(0.0, 1.0), + Operator::GtEq => (((max - v) + 1.0) / width).clamp(0.0, 1.0), + _ => return None, + }; + + Some(frac.clamp(0.0, 1.0)) } + /// Helper function to resolve an expression to the set of relation IDs it references. + /// Traverses the expression tree to find all underlying Stable columns. /// Helper function to resolve an expression to a single StableColumn if possible. /// Returns None if the expression is not a simple column reference. fn resolve_to_single_stable_col( @@ -397,7 +851,8 @@ impl GraphBuilder { return Ok(Some(StableColumn { relation_id: *relation_id, column_index: *column_index, - name: col.name().to_string(), + // Use a stable, parseable name that uniquely identifies the base column. + name: format!("R{}.C{}", relation_id, column_index), })); } ColumnMapEntry::Expression { .. 
} => { @@ -410,29 +865,6 @@ impl GraphBuilder { // For complex expressions, return None Ok(None) } - - /// Helper function to build a conjunction expression from join ON conditions. - /// Converts (left_expr, right_expr) pairs into a single AND expression. - fn build_conjunction_from_on( - &self, - on_conditions: &JoinConditionPairs, - ) -> Result> { - on_conditions - .iter() - .map(|(a, b)| -> Arc { - Arc::new(BinaryExpr::new(a.clone(), Operator::Eq, b.clone())) - }) - .fold(None, |acc, expr| -> Option> { - if let Some(acc_expr) = acc { - Some(Arc::new(BinaryExpr::new(acc_expr, Operator::And, expr))) - } else { - Some(expr) - } - }) - .ok_or_else(|| { - DataFusionError::Internal("Join must have at least one ON condition".to_string()) - }) - } } impl Default for GraphBuilder { @@ -444,9 +876,13 @@ impl Default for GraphBuilder { #[cfg(test)] mod tests { use datafusion::arrow::datatypes::{DataType, Field, Schema}; - use datafusion::physical_expr::expressions::Column; + use datafusion::common::ScalarValue; + use datafusion::physical_expr::expressions::{Column, Literal}; use datafusion::physical_plan::empty::EmptyExec; + use datafusion::physical_plan::filter::FilterExec; use datafusion::physical_plan::joins::HashJoinExec; + use datafusion::physical_plan::placeholder_row::PlaceholderRowExec; + use datafusion::physical_plan::projection::ProjectionExpr; use super::*; @@ -559,6 +995,7 @@ mod tests { None, // projection PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, // null_aware )?); // Test that visit_inner_join correctly handles HashJoinExec @@ -573,6 +1010,397 @@ mod tests { Ok(()) } + #[test] + fn test_trivial_projection_is_seen_through_for_region_building() -> Result<()> { + use datafusion::common::NullEquality; + use datafusion::physical_plan::joins::PartitionMode; + + let mut builder = GraphBuilder::new(); + + // A(a_id) JOIN B(b_id) -> (a_id, b_id) + let schema_a = Arc::new(Schema::new(vec![Field::new( + "a_id", + DataType::Int32, + false, + )])); 
+ let schema_b = Arc::new(Schema::new(vec![Field::new( + "b_id", + DataType::Int32, + false, + )])); + let schema_c = Arc::new(Schema::new(vec![Field::new( + "c_id", + DataType::Int32, + false, + )])); + + let a = Arc::new(EmptyExec::new(schema_a)) as Arc; + let b = Arc::new(EmptyExec::new(schema_b)) as Arc; + let c = Arc::new(EmptyExec::new(schema_c)) as Arc; + + let ab = Arc::new(HashJoinExec::try_new( + a, + b, + vec![( + Arc::new(Column::new("a_id", 0)) as Arc, + Arc::new(Column::new("b_id", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + // Trivial projection over AB: identity (same indices) but with renamed aliases. + let ab_proj = Arc::new(ProjectionExec::try_new( + [ + ProjectionExpr { + expr: Arc::new(Column::new("a_id", 0)), + alias: "a_id_renamed".to_string(), + }, + ProjectionExpr { + expr: Arc::new(Column::new("b_id", 1)), + alias: "b_id_renamed".to_string(), + }, + ], + ab, + )?); + + // (AB_proj) JOIN C on a_id_renamed = c_id + let top = Arc::new(HashJoinExec::try_new( + ab_proj, + c, + vec![( + Arc::new(Column::new("a_id_renamed", 0)) as Arc, + Arc::new(Column::new("c_id", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, // null_aware + )?) as Arc; + + let Some((graph, _col_map)) = builder.build(top)? 
else { + return Err(DataFusionError::Internal("expected a graph".to_string())); + }; + assert_eq!( + graph.relation_count(), + 3, + "trivial projection should be seen through so the region includes A, B, and C" + ); + Ok(()) + } + + #[test] + fn test_trivial_projection_permutation_is_seen_through_for_region_building() -> Result<()> { + use datafusion::common::NullEquality; + use datafusion::physical_plan::joins::PartitionMode; + + let mut builder = GraphBuilder::new(); + + // A(a_id) JOIN B(b_id) -> (a_id, b_id) + let schema_a = Arc::new(Schema::new(vec![Field::new( + "a_id", + DataType::Int32, + false, + )])); + let schema_b = Arc::new(Schema::new(vec![Field::new( + "b_id", + DataType::Int32, + false, + )])); + let schema_c = Arc::new(Schema::new(vec![Field::new( + "c_id", + DataType::Int32, + false, + )])); + + let a = Arc::new(EmptyExec::new(schema_a)) as Arc; + let b = Arc::new(EmptyExec::new(schema_b)) as Arc; + let c = Arc::new(EmptyExec::new(schema_c)) as Arc; + + let ab = Arc::new(HashJoinExec::try_new( + a, + b, + vec![( + Arc::new(Column::new("a_id", 0)) as Arc, + Arc::new(Column::new("b_id", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + // Trivial projection over AB: pure permutation of columns (no prune/dup/exprs). + let ab_proj = Arc::new(ProjectionExec::try_new( + [ + ProjectionExpr { + expr: Arc::new(Column::new("b_id", 1)), + alias: "b_first".to_string(), + }, + ProjectionExpr { + expr: Arc::new(Column::new("a_id", 0)), + alias: "a_second".to_string(), + }, + ], + ab, + )?); + + // (AB_proj) JOIN C on a_second (idx=1) = c_id + let top = Arc::new(HashJoinExec::try_new( + ab_proj, + c, + vec![( + Arc::new(Column::new("a_second", 1)) as Arc, + Arc::new(Column::new("c_id", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, // null_aware + )?) 
as Arc; + + let Some((graph, _col_map)) = builder.build(top)? else { + return Err(DataFusionError::Internal("expected a graph".to_string())); + }; + assert_eq!( + graph.relation_count(), + 3, + "permutation-only projection should be seen through so the region includes A, B, and C" + ); + Ok(()) + } + + #[test] + fn test_projection_with_expression_is_boundary_leaf() -> Result<()> { + use datafusion::common::NullEquality; + use datafusion::physical_plan::joins::PartitionMode; + + let mut builder = GraphBuilder::new(); + + // A(a_id) JOIN B(b_id) -> (a_id, b_id) + let schema_a = Arc::new(Schema::new(vec![Field::new( + "a_id", + DataType::Int32, + false, + )])); + let schema_b = Arc::new(Schema::new(vec![Field::new( + "b_id", + DataType::Int32, + false, + )])); + let schema_c = Arc::new(Schema::new(vec![Field::new( + "c_id", + DataType::Int32, + false, + )])); + + let a = Arc::new(EmptyExec::new(schema_a)) as Arc; + let b = Arc::new(EmptyExec::new(schema_b)) as Arc; + let c = Arc::new(EmptyExec::new(schema_c)) as Arc; + + let ab = Arc::new(HashJoinExec::try_new( + a, + b, + vec![( + Arc::new(Column::new("a_id", 0)) as Arc, + Arc::new(Column::new("b_id", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + // Non-trivial projection: contains a computed expression. 
+ let ab_proj = Arc::new(ProjectionExec::try_new( + [ + ProjectionExpr { + expr: Arc::new(Column::new("a_id", 0)), + alias: "a_id_only".to_string(), + }, + ProjectionExpr { + expr: Arc::new(Literal::new(ScalarValue::Int32(Some(1)))), + alias: "one".to_string(), + }, + ], + ab, + )?); + + // (AB_proj) JOIN C on a_id_only = c_id + let top = Arc::new(HashJoinExec::try_new( + ab_proj, + c, + vec![( + Arc::new(Column::new("a_id_only", 0)) as Arc, + Arc::new(Column::new("c_id", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, // null_aware + )?) as Arc; + + let Some((graph, _col_map)) = builder.build(top)? else { + return Err(DataFusionError::Internal("expected a graph".to_string())); + }; + assert_eq!( + graph.relation_count(), + 2, + "projection with expressions should be a boundary leaf so AB is treated as one relation" + ); + Ok(()) + } + + #[test] + fn test_column_only_projection_pruning_is_seen_through_for_region_building() -> Result<()> { + use datafusion::common::NullEquality; + use datafusion::physical_plan::joins::PartitionMode; + + let mut builder = GraphBuilder::new(); + + // A(a_id) JOIN B(b_id) -> (a_id, b_id) + let schema_a = Arc::new(Schema::new(vec![Field::new( + "a_id", + DataType::Int32, + false, + )])); + let schema_b = Arc::new(Schema::new(vec![Field::new( + "b_id", + DataType::Int32, + false, + )])); + let schema_c = Arc::new(Schema::new(vec![Field::new( + "c_id", + DataType::Int32, + false, + )])); + + let a = Arc::new(EmptyExec::new(schema_a)) as Arc; + let b = Arc::new(EmptyExec::new(schema_b)) as Arc; + let c = Arc::new(EmptyExec::new(schema_c)) as Arc; + + let ab = Arc::new(HashJoinExec::try_new( + a, + b, + vec![( + Arc::new(Column::new("a_id", 0)) as Arc, + Arc::new(Column::new("b_id", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + // Column-only projection 
with pruning (drops b_id). + let ab_proj = Arc::new(ProjectionExec::try_new( + [ProjectionExpr { + expr: Arc::new(Column::new("a_id", 0)), + alias: "a_id_only".to_string(), + }], + ab, + )?); + + // (AB_proj) JOIN C on a_id_only = c_id + let top = Arc::new(HashJoinExec::try_new( + ab_proj, + c, + vec![( + Arc::new(Column::new("a_id_only", 0)) as Arc, + Arc::new(Column::new("c_id", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Partitioned, + NullEquality::NullEqualsNothing, + false, // null_aware + )?) as Arc; + + let Some((graph, _col_map)) = builder.build(top)? else { + return Err(DataFusionError::Internal("expected a graph".to_string())); + }; + assert_eq!( + graph.relation_count(), + 3, + "column-only projection with pruning should be seen through so the region includes A, B, and C" + ); + Ok(()) + } + + #[test] + fn test_filter_leaf_adjusts_statistics_and_penalizes_cardinality() -> Result<()> { + // Build an input plan with exact row count statistics. + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])); + let input = Arc::new(PlaceholderRowExec::new(schema)); + + // Use a predicate shape that the interval analysis doesn't support so we deterministically + // fall back to `default_selectivity`: + // + // (a > 0) OR (a < 0) + let a = Arc::new(Column::new("a", 0)) as Arc; + let gt0 = Arc::new(BinaryExpr::new( + a.clone(), + Operator::Gt, + Arc::new(Literal::new(ScalarValue::Int32(Some(0)))), + )) as Arc; + let lt0 = Arc::new(BinaryExpr::new( + a, + Operator::Lt, + Arc::new(Literal::new(ScalarValue::Int32(Some(0)))), + )) as Arc; + let pred: Arc = Arc::new(BinaryExpr::new(gt0, Operator::Or, lt0)); + + let filter = FilterExec::try_new(pred, input)?.with_default_selectivity(50)?; + let filter: Arc = Arc::new(filter); + + let mut builder = GraphBuilder::new(); + let _ = builder.visit_plan(filter.clone())?; + + assert_eq!(builder.graph.relation_count(), 1); + + let rel = &builder.graph.relations[0]; + // Keep 
FilterExec as the boundary leaf (filter-boundary strategy). + assert_eq!(rel.plan.name(), "FilterExec"); + + // Statistics should reflect the estimated selectivity. + match rel.statistics.num_rows { + datafusion::common::stats::Precision::Inexact(n) => assert_eq!(n, 1), + #[expect(clippy::panic)] + other => panic!("expected Inexact(1) filtered stats, got {other:?}"), + } + + // Penalize initial cardinality by default_selectivity (50% here). + assert!( + (rel.initial_cardinality - 0.5).abs() < 1e-9, + "expected initial_cardinality ~= 0.5, got {}", + rel.initial_cardinality + ); + assert!( + (rel.base_cardinality - 1.0).abs() < 1e-9, + "expected base_cardinality ~= 1.0, got {}", + rel.base_cardinality + ); + + Ok(()) + } + #[test] fn test_penetrate_complex_plan_structure() -> Result<()> { use datafusion::common::NullEquality; @@ -612,6 +1440,7 @@ mod tests { None, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, // null_aware )?); // Create second join: (table1 ⋈ table2) ⋈ table3 @@ -628,6 +1457,7 @@ mod tests { None, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, // null_aware )?); // Test that our enhanced visit_plan can find the joins directly @@ -654,6 +1484,212 @@ mod tests { Ok(()) } + #[test] + fn test_builder_splits_multi_pair_join_into_binary_edges() -> Result<()> { + use datafusion::common::NullEquality; + use datafusion::physical_plan::joins::PartitionMode; + + let mut builder = GraphBuilder::new(); + + // Each table has a single column "id" to keep join key construction simple. 
+ let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + + let t1 = Arc::new(EmptyExec::new(schema.clone())); + let t2 = Arc::new(EmptyExec::new(schema.clone())); + let t3 = Arc::new(EmptyExec::new(schema.clone())); + let t4 = Arc::new(EmptyExec::new(schema.clone())); + + // Join t1 ⋈ t2 on id = id + let join12 = Arc::new(HashJoinExec::try_new( + t1, + t2, + vec![( + Arc::new(Column::new("id", 0)) as Arc, + Arc::new(Column::new("id", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + // Join t3 ⋈ t4 on id = id + let join34 = Arc::new(HashJoinExec::try_new( + t3, + t4, + vec![( + Arc::new(Column::new("id", 0)) as Arc, + Arc::new(Column::new("id", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + // Root join: (t1 ⋈ t2) ⋈ (t3 ⋈ t4) with two independent equi predicates: + // - t1.id = t3.id (left index 0, right index 0) + // - t2.id = t4.id (left index 1, right index 1) + // + // GraphBuilder should emit two *binary* edges ({t1,t3} and {t2,t4}), + // not a single hyperedge {t1,t2,t3,t4}. + let root = Arc::new(HashJoinExec::try_new( + join12, + join34, + vec![ + ( + Arc::new(Column::new("id", 0)) as Arc, + Arc::new(Column::new("id", 0)) as Arc, + ), + ( + Arc::new(Column::new("id", 1)) as Arc, + Arc::new(Column::new("id", 1)) as Arc, + ), + ], + None, + &JoinType::Inner, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + let (graph, _map) = builder + .build(root)? 
+ .ok_or_else(|| DataFusionError::Internal("expected Some(graph)".to_string()))?; + + assert_eq!(graph.relation_count(), 4); + + // We expect: + // - edge {0,1} from join12 + // - edge {2,3} from join34 + // - edges {0,2} and {1,3} from the root join's two predicates + assert_eq!(graph.edges.len(), 4); + + assert!( + graph.edges.iter().all(|e| e.join_set.cardinality() <= 2), + "expected all edges to be binary (no hyperedge) for this plan" + ); + + let s02 = JoinSet::from_iter([0, 2])?; + let s13 = JoinSet::from_iter([1, 3])?; + assert!( + graph.edges.iter().any(|e| e.join_set == s02), + "expected an edge connecting relations {{0,2}}" + ); + assert!( + graph.edges.iter().any(|e| e.join_set == s13), + "expected an edge connecting relations {{1,3}}" + ); + + Ok(()) + } + + #[test] + fn test_builder_keeps_true_multi_relation_predicate_as_hyperedge() -> Result<()> { + use datafusion::common::{JoinSide, NullEquality}; + use datafusion::physical_plan::joins::utils::{ColumnIndex, JoinFilter}; + use datafusion::physical_plan::joins::PartitionMode; + + let mut builder = GraphBuilder::new(); + + // Three base relations with distinct column names to avoid name ambiguity. 
+ let schema_a = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let schema_b = Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, false)])); + let schema_c = Arc::new(Schema::new(vec![Field::new("c", DataType::Int32, false)])); + + let t1 = Arc::new(EmptyExec::new(schema_a.clone())); + let t2 = Arc::new(EmptyExec::new(schema_b.clone())); + let t3 = Arc::new(EmptyExec::new(schema_c.clone())); + + // Join t1 ⋈ t2 on a = b + let join12 = Arc::new(HashJoinExec::try_new( + t1, + t2, + vec![( + Arc::new(Column::new("a", 0)) as Arc, + Arc::new(Column::new("b", 0)) as Arc, + )], + None, + &JoinType::Inner, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + // Root join: (t1 ⋈ t2) ⋈ t3 on a = c, plus a filter that truly depends on all 3 relations: + // (a + b) > c + // + // This conjunct cannot be split into binary predicates, so GraphBuilder must keep it as a + // hyperedge with join_set {t1,t2,t3}. + let filter_column_indices = vec![ + ColumnIndex { + side: JoinSide::Left, + index: 0, // a + }, + ColumnIndex { + side: JoinSide::Left, + index: 1, // b + }, + ColumnIndex { + side: JoinSide::Right, + index: 0, // c + }, + ]; + + let filter_expr: Arc = Arc::new(BinaryExpr::new( + Arc::new(BinaryExpr::new( + Arc::new(Column::new("a", 0)), + Operator::Plus, + Arc::new(Column::new("b", 1)), + )), + Operator::Gt, + Arc::new(Column::new("c", 2)), + )); + + let filter_schema = Arc::new(Schema::new(vec![ + join12.schema().field(0).clone(), + join12.schema().field(1).clone(), + schema_c.field(0).clone(), + ])); + + let join_filter = JoinFilter::new(filter_expr, filter_column_indices, filter_schema); + + let root = Arc::new(HashJoinExec::try_new( + join12, + t3, + vec![( + Arc::new(Column::new("a", 0)) as Arc, + Arc::new(Column::new("c", 0)) as Arc, + )], + Some(join_filter), + &JoinType::Inner, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + let 
(graph, _map) = builder + .build(root)? + .ok_or_else(|| DataFusionError::Internal("expected Some(graph)".to_string()))?; + + assert_eq!(graph.relation_count(), 3); + + // Ensure we have at least one hyperedge with 3 relations. + assert!( + graph.edges.iter().any(|e| e.join_set.cardinality() == 3), + "expected a 3-relation hyperedge from the (a + b) > c predicate" + ); + + Ok(()) + } + #[test] fn test_visit_inner_join_applies_hash_join_projection() -> Result<()> { use datafusion::common::NullEquality; @@ -694,6 +1730,7 @@ mod tests { projection, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, // null_aware )?); let output_map = builder.visit_inner_join(&join_plan)?; diff --git a/crates/sail-physical-optimizer/src/join_reorder/cardinality_estimator.rs b/crates/sail-physical-optimizer/src/join_reorder/cardinality_estimator.rs index 58ea9ce2a3..8f42347e7b 100644 --- a/crates/sail-physical-optimizer/src/join_reorder/cardinality_estimator.rs +++ b/crates/sail-physical-optimizer/src/join_reorder/cardinality_estimator.rs @@ -12,6 +12,13 @@ use crate::join_reorder::join_set::JoinSet; /// Heuristic selectivity for non-equi filter conditions const HEURISTIC_FILTER_SELECTIVITY: f64 = 0.1; +/// Heuristic selectivity for *theta joins* (no equi-join keys, i.e. `equi_pairs` empty). +/// +/// For safety in greedy join ordering, we assume such predicates are *not very selective*. +/// Under-estimating theta-join output can cause catastrophic join orders (e.g. joining two +/// dimensions on `!=` early, materializing a near-cross-product). +const HEURISTIC_THETA_JOIN_SELECTIVITY: f64 = 1.0; + /// Represents a group of columns that have the same domain due to equi-joins. #[derive(Debug, Default, Clone)] pub struct EquivalenceSet { @@ -50,6 +57,8 @@ pub struct CardinalityEstimator { cardinality_cache: HashMap, /// List of equivalence sets. equivalence_sets: Vec, + /// Fast lookup from stable column -> equivalence set index. 
+ column_to_equiv_set: HashMap, /// Mapping from (relation_id, column_index) to initial distinct_count initial_distinct_counts: HashMap, } @@ -60,6 +69,7 @@ impl CardinalityEstimator { graph, cardinality_cache: HashMap::new(), equivalence_sets: vec![], + column_to_equiv_set: HashMap::new(), initial_distinct_counts: HashMap::new(), }; @@ -125,7 +135,16 @@ impl CardinalityEstimator { self.estimate_tdom_for_set(set); } + // Build a lookup map for fast edge selectivity estimation. + let mut column_to_equiv_set = HashMap::new(); + for (idx, set) in sets.iter().enumerate() { + for col in &set.columns { + column_to_equiv_set.insert(col.clone(), idx); + } + } + self.equivalence_sets = sets; + self.column_to_equiv_set = column_to_equiv_set; } /// Merge two columns into equivalence sets using Union-Find like logic. @@ -186,25 +205,43 @@ impl CardinalityEstimator { /// Estimate TDom (Total Domain) for an equivalence set. fn estimate_tdom_for_set(&self, set: &mut EquivalenceSet) { - let mut max_distinct_count = 1.0; // TDom is at least 1 + let mut max_known_distinct: f64 = 0.0; + let mut min_base_card: f64 = f64::INFINITY; + let mut has_known_stats = false; for stable_col in &set.columns { if let Some(distinct_count) = self.initial_distinct_counts.get(stable_col) { - if *distinct_count > max_distinct_count { - max_distinct_count = *distinct_count; - } - } else { - // If a column has no statistics, use heuristic based on relation cardinality - if let Some(relation) = self.graph.get_relation(stable_col.relation_id) { - let card = relation.initial_cardinality; - if card > max_distinct_count { - max_distinct_count = card; - } - } + max_known_distinct = max_known_distinct.max(*distinct_count); + has_known_stats = true; + } + + if let Some(relation) = self.graph.get_relation(stable_col.relation_id) { + min_base_card = min_base_card.min(relation.base_cardinality); } } - set.set_t_dom_count(max_distinct_count); + // If we have any usable distinct-count statistics, prefer them. 
Importantly we must NOT + // "inflate" TDom to a table cardinality just because some columns in the equivalence set + // lack column stats, as that would make join selectivity unrealistically tiny and + // underestimate join sizes. + // + // If no stats exist, use a conservative upper bound: the smallest relation cardinality in + // the equivalence set. Domain cardinality cannot exceed any participating relation's row + // count, and using `min` avoids the pathological underestimation caused by `max`. + let mut tdom = if has_known_stats { + max_known_distinct.max(1.0) + } else if min_base_card.is_finite() { + min_base_card.max(1.0) + } else { + 1.0 + }; + + // Enforce the obvious upper bound when relation cardinalities are available. + if min_base_card.is_finite() { + tdom = tdom.min(min_base_card).max(1.0); + } + + set.set_t_dom_count(tdom); } /// Estimate cardinality after joining a set of relations. @@ -272,29 +309,69 @@ impl CardinalityEstimator { .collect() } - /// Get TDom count for a join edge by finding the equivalence set of its join keys. + /// Get a domain cardinality (TDom) for a join edge. + /// + /// For multi-column equi-joins (e.g. `(a.x = b.x) AND (a.y = b.y)`), we combine + /// all involved equivalence sets to avoid underestimating join selectivity by + /// accidentally using only one of the keys. + /// + /// We cap the combined domain by the smallest participating relation cardinality, + /// as the distinct count of a composite key cannot exceed the row count of any + /// participating relation. fn get_tdom_for_edge(&self, edge: &JoinEdge) -> f64 { - // Find the equivalence set that contains the join keys from this edge - for equiv_set in &self.equivalence_sets { - // Check if any equi-pair from the edge is in this equivalence set - for (left_col, right_col) in &edge.equi_pairs { - if equiv_set.contains(left_col) || equiv_set.contains(right_col) { - return equiv_set.t_dom_count; - } + // TDom only makes sense for equi-join keys. 
If this edge has no equi-join pairs, do not + // invent a domain from relation cardinalities; treat it as "unknown / not applicable". + if edge.equi_pairs.is_empty() { + return 1.0; + } + + // Gather all equivalence sets referenced by this edge's equi-join pairs. + let mut used_equiv_sets: HashSet = HashSet::new(); + for (left_col, right_col) in &edge.equi_pairs { + if let Some(idx) = self.column_to_equiv_set.get(left_col) { + used_equiv_sets.insert(*idx); + } + if let Some(idx) = self.column_to_equiv_set.get(right_col) { + used_equiv_sets.insert(*idx); } } - // If no equivalence set found, use a conservative estimate - // Take the maximum cardinality of relations involved in this edge - edge.join_set + // Base fallback: smallest relation cardinality in the edge. + let min_base_card = edge + .join_set .iter() .map(|id| { self.graph .get_relation(id) - .map(|r| r.initial_cardinality) + .map(|r| r.base_cardinality) .unwrap_or(1.0) }) - .fold(1.0, f64::max) + .fold(f64::INFINITY, f64::min) + .max(1.0); + + // Defensive fallback: should not happen when equi_pairs is non-empty, but avoid returning + // an overly-large domain that would make the join appear unrealistically selective. + if used_equiv_sets.is_empty() { + return min_base_card; + } + + // Multiply the domains of each distinct equivalence set used by this edge. + // This assumes key components are roughly independent; correlated multi-column keys can + // make this overestimate selectivity. The cap by `min_base_card` below keeps it bounded. + let mut tdom_product = 1.0; + for idx in used_equiv_sets { + let tdom = self + .equivalence_sets + .get(idx) + .map(|s| s.t_dom_count) + .unwrap_or(1.0) + .max(1.0); + tdom_product *= tdom; + } + + // Cap by the smallest relation cardinality to avoid unrealistically tiny selectivity + // for multi-key joins (composite-key distinct count cannot exceed row count). 
+ tdom_product.min(min_base_card).max(1.0) } /// Estimate join cardinality for a specific split (used by PlanEnumerator). @@ -308,20 +385,27 @@ impl CardinalityEstimator { for &index in connecting_edge_indices { let edge = &self.graph.edges[index]; - // TODO: Implement more granular join selectivity estimation. - // TDom-based estimation for equi-joins - let tdom = self.get_tdom_for_edge(edge); - if tdom > 1.0 { - selectivity *= 1.0 / tdom; + // Equi-join selectivity (TDom-based). + if !edge.equi_pairs.is_empty() { + let tdom = self.get_tdom_for_edge(edge); + if tdom > 1.0 { + selectivity *= 1.0 / tdom; + } else { + // Unknown TDom for equi-joins: use a conservative heuristic (still selective). + selectivity *= HEURISTIC_FILTER_SELECTIVITY; + } } else { - selectivity *= HEURISTIC_FILTER_SELECTIVITY; // Default for unknown TDom + // Theta join (no equi keys): assume *not selective* to avoid underestimating output. + selectivity *= HEURISTIC_THETA_JOIN_SELECTIVITY; } - // Apply additional selectivity for non-equi filters - if self.has_non_equi_filter(edge) { - // FIXME: This is too coarse. Non-equi selectivity should be calculated - // directly from the predicate instead of applying another generic factor. - selectivity *= HEURISTIC_FILTER_SELECTIVITY; + // Non-equi residual predicates: do NOT apply an extra aggressive heuristic here. + // A fixed 0.1 factor can severely under-estimate output and cause greedy ordering + // to pick NLJ-like joins too early (`... filter=... != ...`). + if self.has_non_equi_filter(edge) && !edge.equi_pairs.is_empty() { + // Keep the original heuristic only when we already have equi-keys, and treat the + // residual as a mild additional filter. 
+ selectivity *= 0.8; } } @@ -354,7 +438,11 @@ mod tests { use std::sync::Arc; use datafusion::arrow::datatypes::{DataType, Field, Schema}; - use datafusion::common::Statistics; + use datafusion::common::stats::Precision; + use datafusion::common::{ScalarValue, Statistics}; + use datafusion::logical_expr::{JoinType, Operator}; + use datafusion::physical_expr::expressions::{BinaryExpr, Column, Literal}; + use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::empty::EmptyExec; use super::*; @@ -371,16 +459,61 @@ mod tests { // Add two relations let plan1 = Arc::new(EmptyExec::new(schema.clone())); - let relation1 = RelationNode::new(plan1, 0, 1000.0, Statistics::new_unknown(&schema)); + let relation1 = + RelationNode::new(plan1, 0, 1000.0, 1000.0, Statistics::new_unknown(&schema)); graph.add_relation(relation1); let plan2 = Arc::new(EmptyExec::new(schema.clone())); - let relation2 = RelationNode::new(plan2, 1, 2000.0, Statistics::new_unknown(&schema)); + let relation2 = + RelationNode::new(plan2, 1, 2000.0, 2000.0, Statistics::new_unknown(&schema)); graph.add_relation(relation2); graph } + #[test] + fn test_theta_join_is_not_treated_as_highly_selective() -> Result<()> { + // Two relations with large initial cardinalities. + let mut graph = QueryGraph::new(); + let schema = Arc::new(Schema::new(vec![Field::new( + "col1", + DataType::Int32, + false, + )])); + + let stats = Statistics::new_unknown(schema.as_ref()); + graph.add_relation(RelationNode::new( + Arc::new(EmptyExec::new(schema.clone())), + 0, + 1_000_000.0, + 1_000_000.0, + stats.clone(), + )); + graph.add_relation(RelationNode::new( + Arc::new(EmptyExec::new(schema.clone())), + 1, + 1_000_000.0, + 1_000_000.0, + stats, + )); + + // A theta predicate with no equi-join pairs. 
+ let l: Arc = Arc::new(Column::new("R0.C0", 0)); + let r: Arc = Arc::new(Literal::new(ScalarValue::Int32(Some(1)))); + let pred: Arc = Arc::new(BinaryExpr::new(l, Operator::Gt, r)); + + let join_set = JoinSet::from_iter([0usize, 1usize].into_iter())?; + graph.add_edge(JoinEdge::new(join_set, pred, JoinType::Inner, vec![]))?; + + let estimator = CardinalityEstimator::new(graph); + let out = estimator.estimate_join_cardinality(1_000_000.0, 1_000_000.0, &[0]); + + // For theta joins, we should not estimate an unrealistically tiny output. + // A safe lower bound is that it's at least 1% of the cross product. + assert!(out >= 1_000_000.0 * 1_000_000.0 * 0.01, "out={out}"); + Ok(()) + } + #[test] fn test_cardinality_estimator_creation() { let graph = create_test_graph(); @@ -518,7 +651,7 @@ mod tests { for (id, rows) in [(0, 1000.0), (1, 2000.0), (2, 3000.0)] { let plan: Arc = Arc::new(EmptyExec::new(schema.clone())); let rel: RelationNode = - RelationNode::new(plan, id, rows, Statistics::new_unknown(&schema)); + RelationNode::new(plan, id, rows, rows, Statistics::new_unknown(&schema)); graph.add_relation(rel); } @@ -592,4 +725,356 @@ mod tests { .union(&JoinSet::new_singleton(2).unwrap()); assert_eq!(estimator.get_edges_contained_in_set(s02).len(), 0); } + + #[test] + fn test_tdom_prefers_distinct_stats_over_missing_cols() -> Result<()> { + use datafusion::logical_expr::JoinType; + use datafusion::physical_expr::expressions::{BinaryExpr, Column}; + use datafusion::physical_expr::PhysicalExpr; + + let mut graph = QueryGraph::new(); + let schema: Arc = + Arc::new(Schema::new(vec![Field::new("k", DataType::Int32, false)])); + + // R0 has distinct stats on join key (k): 10 + let plan0 = Arc::new(EmptyExec::new(schema.clone())); + let mut stats0 = Statistics::new_unknown(&schema); + stats0.column_statistics[0].distinct_count = Precision::Exact(10); + graph.add_relation(RelationNode::new(plan0, 0, 1000.0, 1000.0, stats0)); + + // R1 has no distinct stats on join key (k) + 
let plan1 = Arc::new(EmptyExec::new(schema.clone())); + let stats1 = Statistics::new_unknown(&schema); + graph.add_relation(RelationNode::new(plan1, 1, 2000.0, 2000.0, stats1)); + + // Edge R0.k = R1.k + let l: Arc = Arc::new(Column::new("k", 0)); + let r: Arc = Arc::new(Column::new("k", 0)); + let filter = Arc::new(BinaryExpr::new(l, Operator::Eq, r)) as Arc; + let join_set = JoinSet::new_singleton(0)?.union(&JoinSet::new_singleton(1)?); + let edge = JoinEdge::new( + join_set, + filter, + JoinType::Inner, + vec![( + StableColumn { + relation_id: 0, + column_index: 0, + name: "k0".into(), + }, + StableColumn { + relation_id: 1, + column_index: 0, + name: "k1".into(), + }, + )], + ); + graph.add_edge(edge)?; + + let estimator = CardinalityEstimator::new(graph); + let s01 = JoinSet::from_iter([0, 1])?; + let edges = estimator.get_edges_contained_in_set(s01); + assert_eq!(edges.len(), 1); + + // TDom should be the known distinct-count (10), not inflated to a table cardinality. + assert!((estimator.get_tdom_for_edge(edges[0]) - 10.0).abs() < 1e-9); + Ok(()) + } + + #[test] + fn test_tdom_cap_uses_base_cardinality_for_filtered_dimension() -> Result<()> { + use datafusion::logical_expr::JoinType; + use datafusion::physical_expr::expressions::{BinaryExpr, Column}; + use datafusion::physical_expr::PhysicalExpr; + + let mut graph = QueryGraph::new(); + let schema: Arc = + Arc::new(Schema::new(vec![Field::new("k", DataType::Int32, false)])); + + // Simulate a filtered dimension table: + // - base rows = 2,000,000 + // - post-filter rows = 400,000 + // - join-key distincts still reflect base-domain scale + let mut dim_stats = Statistics::new_unknown(&schema); + dim_stats.column_statistics[0].distinct_count = Precision::Exact(2_000_000); + graph.add_relation(RelationNode::new( + Arc::new(EmptyExec::new(schema.clone())), + 0, + 400_000.0, + 2_000_000.0, + dim_stats, + )); + + // Fact table with large cardinality. 
+ graph.add_relation(RelationNode::new( + Arc::new(EmptyExec::new(schema.clone())), + 1, + 60_000_000.0, + 60_000_000.0, + Statistics::new_unknown(&schema), + )); + + let l: Arc = Arc::new(Column::new("k", 0)); + let r: Arc = Arc::new(Column::new("k", 0)); + let filter = Arc::new(BinaryExpr::new(l, Operator::Eq, r)) as Arc; + let join_set = JoinSet::new_singleton(0)?.union(&JoinSet::new_singleton(1)?); + graph.add_edge(JoinEdge::new( + join_set, + filter, + JoinType::Inner, + vec![( + StableColumn { + relation_id: 0, + column_index: 0, + name: "k0".into(), + }, + StableColumn { + relation_id: 1, + column_index: 0, + name: "k1".into(), + }, + )], + ))?; + + let estimator = CardinalityEstimator::new(graph); + let s01 = JoinSet::from_iter([0, 1])?; + let edges = estimator.get_edges_contained_in_set(s01); + assert_eq!(edges.len(), 1); + + // Cap by base cardinality (2M), not post-filter cardinality (400k). + assert!((estimator.get_tdom_for_edge(edges[0]) - 2_000_000.0).abs() < 1e-9); + + // This preserves the dimension-side filtering benefit in join output estimation. + let join_card = estimator.estimate_join_cardinality(400_000.0, 60_000_000.0, &[0]); + assert!((join_card - 12_000_000.0).abs() < 1e-6); + + Ok(()) + } + + #[test] + fn test_tdom_fallback_uses_min_relation_cardinality() -> Result<()> { + use datafusion::logical_expr::JoinType; + use datafusion::physical_expr::expressions::{BinaryExpr, Column}; + use datafusion::physical_expr::PhysicalExpr; + + let mut graph = QueryGraph::new(); + let schema: Arc = + Arc::new(Schema::new(vec![Field::new("k", DataType::Int32, false)])); + + // No distinct stats available for either side. 
+ let plan0 = Arc::new(EmptyExec::new(schema.clone())); + graph.add_relation(RelationNode::new( + plan0, + 0, + 1_500_000.0, + 1_500_000.0, + Statistics::new_unknown(&schema), + )); + let plan1 = Arc::new(EmptyExec::new(schema.clone())); + graph.add_relation(RelationNode::new( + plan1, + 1, + 100_000.0, + 100_000.0, + Statistics::new_unknown(&schema), + )); + + // Edge R0.k = R1.k + let l: Arc = Arc::new(Column::new("k", 0)); + let r: Arc = Arc::new(Column::new("k", 0)); + let filter = Arc::new(BinaryExpr::new(l, Operator::Eq, r)) as Arc; + let join_set = JoinSet::new_singleton(0)?.union(&JoinSet::new_singleton(1)?); + let edge = JoinEdge::new( + join_set, + filter, + JoinType::Inner, + vec![( + StableColumn { + relation_id: 0, + column_index: 0, + name: "k0".into(), + }, + StableColumn { + relation_id: 1, + column_index: 0, + name: "k1".into(), + }, + )], + ); + graph.add_edge(edge)?; + + let estimator = CardinalityEstimator::new(graph); + let s01 = JoinSet::from_iter([0, 1])?; + let edges = estimator.get_edges_contained_in_set(s01); + assert_eq!(edges.len(), 1); + + // With no stats, TDom should be bounded by the smaller relation (100k), not the larger. + assert!((estimator.get_tdom_for_edge(edges[0]) - 100_000.0).abs() < 1e-9); + Ok(()) + } + + #[test] + fn test_tdom_for_multi_key_edge_uses_all_equivalence_sets() -> Result<()> { + use datafusion::logical_expr::JoinType; + use datafusion::physical_expr::expressions::{BinaryExpr, Column}; + use datafusion::physical_expr::PhysicalExpr; + + let mut graph = QueryGraph::new(); + let schema: Arc = Arc::new(Schema::new(vec![ + Field::new("k1", DataType::Int32, false), + Field::new("k2", DataType::Int32, false), + ])); + + // Ensure edge-level cap doesn't hide the multiplication behavior. 
+ let huge_card = 1_000_000_000_000.0; + + // R0: k1 distinct=10, k2 distinct=100 + let plan0 = Arc::new(EmptyExec::new(schema.clone())); + let mut stats0 = Statistics::new_unknown(&schema); + stats0.column_statistics[0].distinct_count = Precision::Exact(10); + stats0.column_statistics[1].distinct_count = Precision::Exact(100); + graph.add_relation(RelationNode::new(plan0, 0, huge_card, huge_card, stats0)); + + // R1: k1 distinct=20, k2 distinct=200 + let plan1 = Arc::new(EmptyExec::new(schema.clone())); + let mut stats1 = Statistics::new_unknown(&schema); + stats1.column_statistics[0].distinct_count = Precision::Exact(20); + stats1.column_statistics[1].distinct_count = Precision::Exact(200); + graph.add_relation(RelationNode::new(plan1, 1, huge_card, huge_card, stats1)); + + // Edge: (R0.k1 = R1.k1) AND (R0.k2 = R1.k2) + let k1_l: Arc = Arc::new(Column::new("k1", 0)); + let k1_r: Arc = Arc::new(Column::new("k1", 0)); + let k2_l: Arc = Arc::new(Column::new("k2", 1)); + let k2_r: Arc = Arc::new(Column::new("k2", 1)); + let eq1 = Arc::new(BinaryExpr::new(k1_l, Operator::Eq, k1_r)) as Arc; + let eq2 = Arc::new(BinaryExpr::new(k2_l, Operator::Eq, k2_r)) as Arc; + let filter = Arc::new(BinaryExpr::new(eq1, Operator::And, eq2)) as Arc; + + let join_set = JoinSet::new_singleton(0)?.union(&JoinSet::new_singleton(1)?); + graph.add_edge(JoinEdge::new( + join_set, + filter, + JoinType::Inner, + vec![ + ( + StableColumn { + relation_id: 0, + column_index: 0, + name: "k1".into(), + }, + StableColumn { + relation_id: 1, + column_index: 0, + name: "k1".into(), + }, + ), + ( + StableColumn { + relation_id: 0, + column_index: 1, + name: "k2".into(), + }, + StableColumn { + relation_id: 1, + column_index: 1, + name: "k2".into(), + }, + ), + ], + ))?; + + let estimator = CardinalityEstimator::new(graph); + let s01 = JoinSet::from_iter([0, 1])?; + let edges = estimator.get_edges_contained_in_set(s01); + assert_eq!(edges.len(), 1); + + // For each key, TDom uses the max distinct across 
the equivalence set: 20 and 200. + // Multi-key TDom should combine them (product) and not accidentally use only one key. + assert!((estimator.get_tdom_for_edge(edges[0]) - 4000.0).abs() < 1e-9); + Ok(()) + } + + #[test] + fn test_tdom_for_multi_key_edge_is_capped_by_min_relation_cardinality() -> Result<()> { + use datafusion::logical_expr::JoinType; + use datafusion::physical_expr::expressions::{BinaryExpr, Column}; + use datafusion::physical_expr::PhysicalExpr; + + let mut graph = QueryGraph::new(); + let schema: Arc = Arc::new(Schema::new(vec![ + Field::new("k1", DataType::Int32, false), + Field::new("k2", DataType::Int32, false), + ])); + + // Min relation cardinality is small, so the composite-key domain must be capped. + let small_card = 1000.0; + let huge_card = 1_000_000_000_000.0; + + // R0: k1 distinct=100, k2 distinct=200 + let plan0 = Arc::new(EmptyExec::new(schema.clone())); + let mut stats0 = Statistics::new_unknown(&schema); + stats0.column_statistics[0].distinct_count = Precision::Exact(100); + stats0.column_statistics[1].distinct_count = Precision::Exact(200); + graph.add_relation(RelationNode::new(plan0, 0, small_card, small_card, stats0)); + + // R1: k1 distinct=100, k2 distinct=200 + let plan1 = Arc::new(EmptyExec::new(schema.clone())); + let mut stats1 = Statistics::new_unknown(&schema); + stats1.column_statistics[0].distinct_count = Precision::Exact(100); + stats1.column_statistics[1].distinct_count = Precision::Exact(200); + graph.add_relation(RelationNode::new(plan1, 1, huge_card, huge_card, stats1)); + + // Edge: (R0.k1 = R1.k1) AND (R0.k2 = R1.k2) + let k1_l: Arc = Arc::new(Column::new("k1", 0)); + let k1_r: Arc = Arc::new(Column::new("k1", 0)); + let k2_l: Arc = Arc::new(Column::new("k2", 1)); + let k2_r: Arc = Arc::new(Column::new("k2", 1)); + let eq1 = Arc::new(BinaryExpr::new(k1_l, Operator::Eq, k1_r)) as Arc; + let eq2 = Arc::new(BinaryExpr::new(k2_l, Operator::Eq, k2_r)) as Arc; + let filter = Arc::new(BinaryExpr::new(eq1, 
Operator::And, eq2)) as Arc; + + let join_set = JoinSet::new_singleton(0)?.union(&JoinSet::new_singleton(1)?); + graph.add_edge(JoinEdge::new( + join_set, + filter, + JoinType::Inner, + vec![ + ( + StableColumn { + relation_id: 0, + column_index: 0, + name: "k1".into(), + }, + StableColumn { + relation_id: 1, + column_index: 0, + name: "k1".into(), + }, + ), + ( + StableColumn { + relation_id: 0, + column_index: 1, + name: "k2".into(), + }, + StableColumn { + relation_id: 1, + column_index: 1, + name: "k2".into(), + }, + ), + ], + ))?; + + let estimator = CardinalityEstimator::new(graph); + let s01 = JoinSet::from_iter([0, 1])?; + let edges = estimator.get_edges_contained_in_set(s01); + assert_eq!(edges.len(), 1); + + // Uncapped product would be 100 * 200 = 20000, but composite-key domain cannot exceed + // the smaller input (R0 has 1000 rows). + assert!((estimator.get_tdom_for_edge(edges[0]) - 1000.0).abs() < 1e-9); + Ok(()) + } } diff --git a/crates/sail-physical-optimizer/src/join_reorder/enumerator.rs b/crates/sail-physical-optimizer/src/join_reorder/enumerator.rs index 9a1e4f19ce..27e19e7784 100644 --- a/crates/sail-physical-optimizer/src/join_reorder/enumerator.rs +++ b/crates/sail-physical-optimizer/src/join_reorder/enumerator.rs @@ -1,6 +1,7 @@ use std::collections::HashMap; use std::sync::Arc; +use datafusion::common::stats::Precision; use datafusion::error::{DataFusionError, Result}; use crate::join_reorder::cardinality_estimator::CardinalityEstimator; @@ -17,6 +18,10 @@ pub struct PlanEnumerator { cost_model: CostModel, /// Counter for tracking the number of plans generated/evaluated emit_count: usize, + /// Relations considered "fact anchors" in skewed star/snowflake shapes. + anchor_relations: JoinSet, + /// Whether guarded anchor penalties should participate in DP costing. 
+ enable_fact_anchor_heuristic: bool, } /// Threshold for maximum number of plans to generate before falling back to greedy algorithm @@ -25,7 +30,126 @@ const EMIT_THRESHOLD: usize = 10000; /// Threshold for relation count above which heuristic pruning is applied const RELATION_THRESHOLD: usize = 10; +/// Minimum relation count before enabling guarded fact-anchor penalties. +const FACT_ANCHOR_MIN_RELATIONS: usize = 5; +/// A relation is considered an anchor when base_cardinality >= max_base * threshold. +const FACT_ANCHOR_RELATIVE_THRESHOLD: f64 = 0.25; +/// Anchor relations should dominate this share of total base cardinality. +const FACT_ANCHOR_MIN_SHARE: f64 = 0.55; +/// Penalty applied to low-confidence joins that avoid all anchor relations. +const FACT_ANCHOR_PENALTY_MULTIPLIER: f64 = 8.0; + impl PlanEnumerator { + fn derive_anchor_relations(query_graph: &QueryGraph) -> (JoinSet, bool) { + let relation_count = query_graph.relation_count(); + if relation_count == 0 { + return (JoinSet::new(), false); + } + + let max_base = query_graph + .relations + .iter() + .map(|relation| { + if relation.base_cardinality.is_finite() && relation.base_cardinality > 0.0 { + relation.base_cardinality + } else { + 0.0 + } + }) + .fold(0.0, f64::max); + if max_base <= 0.0 { + return (JoinSet::new(), false); + } + + let threshold = max_base * FACT_ANCHOR_RELATIVE_THRESHOLD; + let mut anchor_bits = 0u64; + let mut anchor_total = 0.0; + let mut total = 0.0; + let mut anchor_count = 0usize; + + for relation in &query_graph.relations { + let base = if relation.base_cardinality.is_finite() && relation.base_cardinality > 0.0 { + relation.base_cardinality + } else { + 0.0 + }; + total += base; + + if base >= threshold { + anchor_bits |= 1u64 << relation.relation_id; + anchor_total += base; + anchor_count += 1; + } + } + + // Defensive fallback: always keep at least one anchor candidate. 
+ if anchor_bits == 0 { + if let Some(relation) = query_graph + .relations + .iter() + .max_by(|left, right| left.base_cardinality.total_cmp(&right.base_cardinality)) + { + anchor_bits |= 1u64 << relation.relation_id; + anchor_total = relation.base_cardinality.max(0.0); + anchor_count = 1; + } + } + + let anchors = JoinSet::from_bits(anchor_bits); + let anchor_share = if total > 0.0 { + anchor_total / total + } else { + 0.0 + }; + let max_allowed_anchor_count = (relation_count / 2).max(1); + let enabled = relation_count >= FACT_ANCHOR_MIN_RELATIONS + && anchor_count > 0 + && anchor_count <= max_allowed_anchor_count + && anchor_share >= FACT_ANCHOR_MIN_SHARE; + + (anchors, enabled) + } + + fn relation_has_distinct_stat(&self, relation_id: usize, column_index: usize) -> bool { + self.query_graph + .get_relation(relation_id) + .and_then(|relation| relation.statistics.column_statistics.get(column_index)) + .is_some_and(|stats| !matches!(stats.distinct_count, Precision::Absent)) + } + + /// Returns true when join-key NDV confidence is low for this edge. + /// + /// We only treat an edge as low confidence when at least one equi-key pair lacks + /// distinct-count stats on both sides. 
+ fn edge_is_low_confidence(&self, edge_index: usize) -> bool { + let Some(edge) = self.query_graph.edges.get(edge_index) else { + return false; + }; + + if edge.equi_pairs.is_empty() { + return false; + } + + edge.equi_pairs.iter().any(|(left, right)| { + !self.relation_has_distinct_stat(left.relation_id, left.column_index) + && !self.relation_has_distinct_stat(right.relation_id, right.column_index) + }) + } + + fn should_apply_fact_anchor_penalty(&self, parent: JoinSet, edge_indices: &[usize]) -> bool { + if !self.enable_fact_anchor_heuristic { + return false; + } + if !parent.is_disjoint(&self.anchor_relations) { + return false; + } + + edge_indices + .iter() + .copied() + .any(|edge_index| self.edge_is_low_confidence(edge_index)) + } + /// Generate all non-empty subsets of the given neighbor list. fn generate_all_nonempty_subsets(&self, elems: &[usize]) -> Vec> { let n = elems.len(); @@ -47,6 +171,8 @@ impl PlanEnumerator { } /// Creates a new plan enumerator. pub fn new(query_graph: QueryGraph) -> Self { + let (anchor_relations, enable_fact_anchor_heuristic) = + Self::derive_anchor_relations(&query_graph); let cardinality_estimator = CardinalityEstimator::new(query_graph.clone()); let cost_model = CostModel::new(); @@ -56,6 +182,8 @@ impl PlanEnumerator { cardinality_estimator, cost_model, emit_count: 0, + anchor_relations, + enable_fact_anchor_heuristic, } } @@ -74,20 +202,45 @@ impl PlanEnumerator { self.init_leaf_plans()?; // Run DPhyp join enumeration - if !self.join_reorder_by_dphyp()? { - return Ok(None); - } + let completed = self.join_reorder_by_dphyp()?; - // Return the plan containing all relations if found; otherwise fallback to greedy + // Return the plan containing all relations if found; otherwise fallback to greedy. 
let all_relations_set = self.create_all_relations_set(); if let Some(result) = self.dp_table.get(&all_relations_set).cloned() { Ok(Some(result)) + } else if !completed { + Ok(None) } else { let greedy_plan = self.solve_greedy()?; Ok(Some(greedy_plan)) } } + /// Ensure leaf plans exist for all single relations without overwriting existing entries. + /// + /// This is used by greedy fallback so it can reuse any DP results that already exist. + fn ensure_leaf_plans(&mut self) -> Result<()> { + for relation in &self.query_graph.relations { + let relation_id = relation.relation_id; + let join_set = JoinSet::new_singleton(relation_id)?; + + if self.dp_table.contains_key(&join_set) { + continue; + } + + // Estimate cardinality for single relation + let cardinality = self.cardinality_estimator.estimate_cardinality(join_set)?; + + // Create leaf plan (cost is set to cardinality in DPPlan::new_leaf) + let plan = Arc::new(DPPlan::new_leaf(relation_id, cardinality)?); + + // Insert into DP table + self.dp_table.insert(join_set, plan); + } + + Ok(()) + } + /// Initialize leaf plans for all single relations. 
fn init_leaf_plans(&mut self) -> Result<()> { for relation in &self.query_graph.relations { @@ -120,6 +273,57 @@ impl PlanEnumerator { .collect() } + fn relation_initial_cardinality(&self, relation_id: usize) -> f64 { + self.query_graph + .relations + .iter() + .find(|relation| relation.relation_id == relation_id) + .map(|relation| relation.initial_cardinality) + .unwrap_or(f64::INFINITY) + } + + fn sort_neighbors_by_heuristic( + &self, + anchor_set: JoinSet, + neighbors: &mut [usize], + ) -> Result<()> { + let mut scores: HashMap = HashMap::with_capacity(neighbors.len()); + for &neighbor in neighbors.iter() { + let neighbor_set = JoinSet::new_singleton(neighbor)?; + let edge_count = self + .query_graph + .get_connecting_edge_indices(anchor_set, neighbor_set) + .len(); + let cardinality = self.relation_initial_cardinality(neighbor); + scores.insert(neighbor, (edge_count, cardinality)); + } + + neighbors.sort_unstable_by(|left, right| { + let (left_edges, left_card) = scores.get(left).copied().unwrap_or((0, f64::INFINITY)); + let (right_edges, right_card) = + scores.get(right).copied().unwrap_or((0, f64::INFINITY)); + + right_edges + .cmp(&left_edges) + .then_with(|| left_card.total_cmp(&right_card)) + .then_with(|| left.cmp(right)) + }); + Ok(()) + } + + fn prune_neighbors(&self, anchor_set: JoinSet, neighbors: &mut Vec) -> Result<()> { + if self.query_graph.relation_count() < RELATION_THRESHOLD { + return Ok(()); + } + + self.sort_neighbors_by_heuristic(anchor_set, neighbors)?; + let limit = anchor_set.cardinality() as usize; + if neighbors.len() > limit { + neighbors.truncate(limit); + } + Ok(()) + } + /// Start enumeration from a single relation index. fn process_node_as_start(&mut self, idx: usize) -> Result { let nodes = JoinSet::new_singleton(idx)?; @@ -201,14 +405,7 @@ impl PlanEnumerator { return Ok(true); } - // TODO: Implement heuristic pruning for neighbor selection to accelerate DP. - // Instead of simple truncation, sort neighbors based on a heuristic. 
- if self.query_graph.relation_count() >= RELATION_THRESHOLD { - let limit = nodes.cardinality() as usize; - if neighbors.len() > limit { - neighbors.truncate(limit); - } - } + self.prune_neighbors(nodes, &mut neighbors)?; // Generate all non-empty neighbor subsets and union with current nodes let all_subsets = self.generate_all_nonempty_subsets(&neighbors); @@ -251,13 +448,7 @@ impl PlanEnumerator { return Ok(true); } - // TODO: Apply better pruning here as well, similar to `enumerate_csg_rec`. - if self.query_graph.relation_count() >= RELATION_THRESHOLD { - let limit = right.cardinality() as usize; - if neighbor_ids.len() > limit { - neighbor_ids.truncate(limit); - } - } + self.prune_neighbors(right, &mut neighbor_ids)?; // Generate all non-empty neighbor subsets and union with current right set let all_subsets = self.generate_all_nonempty_subsets(&neighbor_ids); @@ -331,9 +522,12 @@ impl PlanEnumerator { right_plan.cardinality, edge_indices, ); - let new_cost = self + let mut new_cost = self .cost_model .compute_cost(&left_plan, &right_plan, new_cardinality); + if self.should_apply_fact_anchor_penalty(parent, edge_indices) { + new_cost += new_cardinality * FACT_ANCHOR_PENALTY_MULTIPLIER; + } let new_plan = Arc::new(DPPlan::new_join( left, @@ -356,14 +550,6 @@ impl PlanEnumerator { Ok(new_cost) } - /// Check if two disjoint subsets are connected by at least one edge. - fn are_subsets_connected(&self, left_subset: JoinSet, right_subset: JoinSet) -> bool { - !self - .query_graph - .get_connecting_edge_indices(left_subset, right_subset) - .is_empty() - } - /// Create a JoinSet containing all relations. fn create_all_relations_set(&self) -> JoinSet { let relation_count = self.query_graph.relation_count(); @@ -372,7 +558,9 @@ impl PlanEnumerator { } /// Greedy join reorder algorithm as fallback when DP exceeds threshold. - /// Starts with all single relations and repeatedly joins the pair with lowest cost. 
+ /// + /// This fallback intentionally constructs a strict left-deep tree to avoid catastrophic + /// bushy plans on large star/snowflake schemas. pub fn solve_greedy(&mut self) -> Result> { let relation_count = self.query_graph.relation_count(); @@ -382,10 +570,28 @@ impl PlanEnumerator { )); } + // Ensure leaf plans exist so greedy can run even when called standalone. + self.ensure_leaf_plans()?; + + // If DP (even partial) already produced a full plan, prefer it directly. + let all_relations_set = self.create_all_relations_set(); + if let Some(plan) = self.dp_table.get(&all_relations_set).cloned() { + return Ok(plan); + } + if relation_count == 1 { - // Initialize leaf plans and return the single relation - self.init_leaf_plans()?; - let single_relation_set = JoinSet::new_singleton(0)?; + // Return the single relation. + let relation_id = self + .query_graph + .relations + .first() + .map(|relation| relation.relation_id) + .ok_or_else(|| { + DataFusionError::Internal( + "Expected one relation but query graph is empty".to_string(), + ) + })?; + let single_relation_set = JoinSet::new_singleton(relation_id)?; return self .dp_table .get(&single_relation_set) @@ -393,176 +599,140 @@ impl PlanEnumerator { .ok_or_else(|| DataFusionError::Internal("Single relation not found".to_string())); } - // Initialize leaf plans for all single relations - self.init_leaf_plans()?; - - // Create a list of current subplans (initially all single relations) - let mut current_plans: Vec = (0..relation_count) - .map(JoinSet::new_singleton) - .collect::, _>>()?; + // Start from the largest base relation (typically fact table in star/snowflake schemas). 
+ let start_relation = self + .query_graph + .relations + .iter() + .max_by(|left, right| { + left.initial_cardinality + .total_cmp(&right.initial_cardinality) + }) + .map(|relation| relation.relation_id) + .ok_or_else(|| { + DataFusionError::Internal("Failed to determine greedy start relation".to_string()) + })?; - // Repeatedly find and merge the best pair until only one plan remains - while current_plans.len() > 1 { + let mut current_set = JoinSet::new_singleton(start_relation)?; + let mut current_plan = self.dp_table.get(¤t_set).cloned().ok_or_else(|| { + DataFusionError::Internal("Start relation plan not found in DP table".to_string()) + })?; + let mut remaining = all_relations_set - current_set; + + // Grow the plan one relation at a time to preserve left-deep shape. + while remaining.bits() != 0 { + let mut best_next_rel: Option = None; + let mut best_edges = Vec::new(); + let mut best_cardinality = f64::INFINITY; let mut best_cost = f64::INFINITY; - let mut best_left_idx = 0; - let mut best_right_idx = 1; - let mut best_plan: Option> = None; - - // Try all pairs of current plans - for i in 0..current_plans.len() { - for j in (i + 1)..current_plans.len() { - let left_set = current_plans[i]; - let right_set = current_plans[j]; - - // Check if these subplans are connected - if !self.are_subsets_connected(left_set, right_set) { - continue; // Skip disconnected pairs to avoid cartesian products - } - // Get existing plans from DP table - let left_plan = match self.dp_table.get(&left_set) { - Some(plan) => plan.clone(), - None => continue, - }; - - let right_plan = match self.dp_table.get(&right_set) { - Some(plan) => plan.clone(), - None => continue, - }; - - // Get connecting edge indices - let edge_indices = self - .query_graph - .get_connecting_edge_indices(left_set, right_set); - - // Estimate join cardinality - let new_cardinality = self.cardinality_estimator.estimate_join_cardinality( - left_plan.cardinality, - right_plan.cardinality, - &edge_indices, - ); 
- - // Compute cost - let new_cost = - self.cost_model - .compute_cost(&left_plan, &right_plan, new_cardinality); + // Prefer connected extensions first. + for next_rel in remaining.iter() { + let next_set = JoinSet::new_singleton(next_rel)?; + let next_plan = self.dp_table.get(&next_set).ok_or_else(|| { + DataFusionError::Internal(format!( + "Leaf plan for relation {} not found in DP table", + next_rel + )) + })?; - // Check if this is the best pair so far - if new_cost < best_cost { - best_cost = new_cost; - best_left_idx = i; - best_right_idx = j; - - // Create the join plan - best_plan = Some(Arc::new(DPPlan::new_join( - left_set, - right_set, - edge_indices, - new_cost, - new_cardinality, - ))); - } + let edge_indices = self + .query_graph + .get_connecting_edge_indices(current_set, next_set); + if edge_indices.is_empty() { + continue; + } + + let new_cardinality = self.cardinality_estimator.estimate_join_cardinality( + current_plan.cardinality, + next_plan.cardinality, + &edge_indices, + ); + let new_cost = + self.cost_model + .compute_cost(¤t_plan, next_plan, new_cardinality); + + if new_cardinality < best_cardinality + || (new_cardinality == best_cardinality && new_cost < best_cost) + { + best_next_rel = Some(next_rel); + best_edges = edge_indices; + best_cardinality = new_cardinality; + best_cost = new_cost; } } - // Create cartesian product with minimum cardinality if no connected pairs found - if best_plan.is_none() && current_plans.len() >= 2 { - let mut min_cardinality_product = f64::INFINITY; - let mut selected_left_idx = 0; - let mut selected_right_idx = 1; - - // Find pair with minimum cardinality product - for i in 0..current_plans.len() { - for j in (i + 1)..current_plans.len() { - let left_set = current_plans[i]; - let right_set = current_plans[j]; - - if let (Some(left_plan), Some(right_plan)) = - (self.dp_table.get(&left_set), self.dp_table.get(&right_set)) - { - let cardinality_product = - left_plan.cardinality * right_plan.cardinality; - if 
cardinality_product < min_cardinality_product { - min_cardinality_product = cardinality_product; - selected_left_idx = i; - selected_right_idx = j; - } - } + // If no connected relation exists, use a penalized cross join fallback. + if best_next_rel.is_none() { + for next_rel in remaining.iter() { + let next_set = JoinSet::new_singleton(next_rel)?; + let next_plan = self.dp_table.get(&next_set).ok_or_else(|| { + DataFusionError::Internal(format!( + "Leaf plan for relation {} not found in DP table", + next_rel + )) + })?; + + let new_cardinality = current_plan.cardinality * next_plan.cardinality; + let new_cost = + self.cost_model + .compute_cost(¤t_plan, next_plan, new_cardinality) + + 1_000_000.0; + + if new_cardinality < best_cardinality + || (new_cardinality == best_cardinality && new_cost < best_cost) + { + best_next_rel = Some(next_rel); + best_edges = Vec::new(); + best_cardinality = new_cardinality; + best_cost = new_cost; } } - - let left_set = current_plans[selected_left_idx]; - let right_set = current_plans[selected_right_idx]; - - let left_plan = self.dp_table.get(&left_set).cloned().ok_or_else(|| { - DataFusionError::Internal( - "Left plan not found in DP table for cartesian product".to_string(), - ) - })?; - let right_plan = self.dp_table.get(&right_set).cloned().ok_or_else(|| { - DataFusionError::Internal( - "Right plan not found in DP table for cartesian product".to_string(), - ) - })?; - - let new_cardinality = left_plan.cardinality * right_plan.cardinality; - let cartesian_penalty = 1000000.0; - let new_cost = - self.cost_model - .compute_cost(&left_plan, &right_plan, new_cardinality) - + cartesian_penalty; - - best_plan = Some(Arc::new(DPPlan::new_join( - left_set, - right_set, - vec![], // No connecting edges for cartesian product - new_cost, - new_cardinality, - ))); - best_left_idx = selected_left_idx; - best_right_idx = selected_right_idx; } - // Verify we have a valid plan - let plan = best_plan.ok_or_else(|| { + let next_rel = 
best_next_rel.ok_or_else(|| { DataFusionError::Internal( - "Failed to find any joinable pair in greedy algorithm".to_string(), + "Failed to select next relation in greedy algorithm".to_string(), ) })?; - // Add the new plan to DP table - self.dp_table.insert(plan.join_set, plan.clone()); + let next_set = JoinSet::new_singleton(next_rel)?; + let next_join_set = current_set | next_set; + let new_plan = Arc::new(DPPlan::new_join( + current_set, + next_set, + best_edges, + best_cost, + best_cardinality, + )); - // Remove merged plans and add new one (reverse order to maintain indices) - if best_left_idx < best_right_idx { - current_plans.remove(best_right_idx); - current_plans.remove(best_left_idx); - } else { - current_plans.remove(best_left_idx); - current_plans.remove(best_right_idx); - } - current_plans.push(plan.join_set); + self.dp_table.insert(next_join_set, new_plan.clone()); + current_set = next_join_set; + current_plan = new_plan; + remaining -= next_set; } - // Return the final plan - let final_set = current_plans[0]; - self.dp_table.get(&final_set).cloned().ok_or_else(|| { - DataFusionError::Internal("Final plan not found in greedy algorithm".to_string()) - }) + Ok(current_plan) } } #[cfg(test)] #[expect(clippy::unwrap_used)] mod tests { + use std::collections::HashMap; use std::sync::Arc; use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::common::stats::Precision; use datafusion::common::Statistics; + use datafusion::logical_expr::{JoinType, Operator}; + use datafusion::physical_expr::expressions::{BinaryExpr, Column}; + use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::empty::EmptyExec; use super::*; - use crate::join_reorder::graph::{QueryGraph, RelationNode}; + use crate::join_reorder::dp_plan::PlanType; + use crate::join_reorder::graph::{JoinEdge, QueryGraph, RelationNode, StableColumn}; fn create_test_graph_with_relations(count: usize) -> QueryGraph { let mut graph = QueryGraph::new(); @@ -574,13 
+744,151 @@ mod tests { for i in 0..count { let plan = Arc::new(EmptyExec::new(schema.clone())); - let relation = RelationNode::new(plan, i, 1000.0, Statistics::new_unknown(&schema)); + let relation = + RelationNode::new(plan, i, 1000.0, 1000.0, Statistics::new_unknown(&schema)); graph.add_relation(relation); } graph } + fn create_star_graph(cardinalities: &[f64], center: usize) -> Result { + let mut graph = QueryGraph::new(); + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + + for (relation_id, &rows) in cardinalities.iter().enumerate() { + let plan = Arc::new(EmptyExec::new(schema.clone())); + let relation = RelationNode::new( + plan, + relation_id, + rows, + rows, + Statistics::new_unknown(&schema), + ); + graph.add_relation(relation); + } + + for relation_id in 0..cardinalities.len() { + if relation_id == center { + continue; + } + + let join_filter = Arc::new(BinaryExpr::new( + Arc::new(Column::new("id", 0)) as Arc, + Operator::Eq, + Arc::new(Column::new("id", 0)) as Arc, + )) as Arc; + + let edge = JoinEdge::new( + JoinSet::new_singleton(center)? 
| JoinSet::new_singleton(relation_id)?, + join_filter, + JoinType::Inner, + vec![( + StableColumn { + relation_id: center, + column_index: 0, + name: format!("R{}.C0", center), + }, + StableColumn { + relation_id, + column_index: 0, + name: format!("R{}.C0", relation_id), + }, + )], + ); + graph.add_edge(edge)?; + } + + Ok(graph) + } + + fn create_graph_with_custom_distinct_stats( + cardinalities: &[f64], + distinct_stats: &[Option], + ) -> Result { + assert_eq!(cardinalities.len(), distinct_stats.len()); + + let mut graph = QueryGraph::new(); + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + + for (relation_id, &rows) in cardinalities.iter().enumerate() { + let plan = Arc::new(EmptyExec::new(schema.clone())); + let mut stats = Statistics::new_unknown(&schema); + if let Some(distinct) = distinct_stats[relation_id] { + stats.column_statistics[0].distinct_count = Precision::Exact(distinct); + } + + let relation = RelationNode::new(plan, relation_id, rows, rows, stats); + graph.add_relation(relation); + } + + Ok(graph) + } + + fn add_equi_join_edge(graph: &mut QueryGraph, left: usize, right: usize) -> Result { + let join_filter = Arc::new(BinaryExpr::new( + Arc::new(Column::new("id", 0)) as Arc, + Operator::Eq, + Arc::new(Column::new("id", 0)) as Arc, + )) as Arc; + + let edge = JoinEdge::new( + JoinSet::new_singleton(left)? | JoinSet::new_singleton(right)?, + join_filter, + JoinType::Inner, + vec![( + StableColumn { + relation_id: left, + column_index: 0, + name: format!("R{}.C0", left), + }, + StableColumn { + relation_id: right, + column_index: 0, + name: format!("R{}.C0", right), + }, + )], + ); + let edge_index = graph.edges.len(); + graph.add_edge(edge)?; + Ok(edge_index) + } + + fn assert_strict_left_deep(plan: &Arc, dp_table: &HashMap>) { + match &plan.plan_type { + PlanType::Leaf { .. } => {} + PlanType::Join { + left_set, + right_set, + .. 
+ } => { + assert_eq!( + right_set.cardinality(), + 1, + "each greedy step should add exactly one base relation" + ); + let right_plan = dp_table.get(right_set).unwrap(); + assert!( + matches!(right_plan.plan_type, PlanType::Leaf { .. }), + "right side must be a leaf in strict left-deep plan" + ); + + let left_plan = dp_table.get(left_set).unwrap(); + assert_strict_left_deep(left_plan, dp_table); + } + } + } + + fn leftmost_relation_id(plan: &Arc, dp_table: &HashMap>) -> usize { + match &plan.plan_type { + PlanType::Leaf { relation_id } => *relation_id, + PlanType::Join { left_set, .. } => { + let left_plan = dp_table.get(left_set).unwrap(); + leftmost_relation_id(left_plan, dp_table) + } + } + } + #[test] fn test_plan_enumerator_creation() { let graph = create_test_graph_with_relations(2); @@ -617,4 +925,114 @@ mod tests { assert_eq!(all_set.bits(), 7); // 111 in binary = 7 assert_eq!(all_set.cardinality(), 3); } + + #[test] + fn test_solve_greedy_generates_strict_left_deep_plan() -> Result<()> { + let graph = create_star_graph(&[1_000_000.0, 4_000.0, 3_000.0, 2_000.0, 1_500.0], 0)?; + let mut enumerator = PlanEnumerator::new(graph); + + let plan = enumerator.solve_greedy()?; + assert_eq!(plan.join_set.cardinality(), 5); + assert_strict_left_deep(&plan, &enumerator.dp_table); + + Ok(()) + } + + #[test] + fn test_solve_greedy_starts_from_largest_relation() -> Result<()> { + let graph = create_star_graph(&[1_000.0, 2_000.0, 50_000.0, 3_000.0], 2)?; + let mut enumerator = PlanEnumerator::new(graph); + + let plan = enumerator.solve_greedy()?; + let start_relation = leftmost_relation_id(&plan, &enumerator.dp_table); + assert_eq!(start_relation, 2); + + Ok(()) + } + + #[test] + fn test_prune_neighbors_uses_cardinality_heuristic_when_threshold_applies() -> Result<()> { + // RELATION_THRESHOLD is 10. Build a graph with 11 relations so pruning is active. 
+ // All neighbors have identical edge counts to center relation 0, so ordering should + // fall back to ascending initial_cardinality instead of relation-id order. + let graph = create_star_graph( + &[ + 1_000_000.0, // center + 1_000.0, + 900.0, + 800.0, + 700.0, + 600.0, + 500.0, + 400.0, + 300.0, + 200.0, + 1.0, // relation_id 10 is smallest, should be kept first after pruning + ], + 0, + )?; + let enumerator = PlanEnumerator::new(graph); + + let mut neighbors: Vec = (1..=10).collect(); + enumerator.prune_neighbors(JoinSet::new_singleton(0)?, &mut neighbors)?; + + // anchor_set cardinality is 1, so prune keeps one neighbor. + assert_eq!(neighbors.len(), 1); + assert_eq!(neighbors[0], 10); + Ok(()) + } + + #[test] + fn test_fact_anchor_penalty_triggers_for_unanchored_low_confidence_join() -> Result<()> { + let cardinalities = [50_000_000.0, 1_920_800.0, 1_920_800.0, 20_000.0, 10_000.0]; + let distinct_stats = [None, None, None, None, None]; + let mut graph = create_graph_with_custom_distinct_stats(&cardinalities, &distinct_stats)?; + + let edge_01 = add_equi_join_edge(&mut graph, 0, 1)?; + let _edge_02 = add_equi_join_edge(&mut graph, 0, 2)?; + let _edge_03 = add_equi_join_edge(&mut graph, 0, 3)?; + let _edge_04 = add_equi_join_edge(&mut graph, 0, 4)?; + let edge_12 = add_equi_join_edge(&mut graph, 1, 2)?; + + let enumerator = PlanEnumerator::new(graph); + assert!(enumerator.enable_fact_anchor_heuristic); + + let dim_parent = JoinSet::from_iter([1, 2])?; + assert!(enumerator.should_apply_fact_anchor_penalty(dim_parent, &[edge_12])); + + let anchored_parent = JoinSet::from_iter([0, 1])?; + assert!(!enumerator.should_apply_fact_anchor_penalty(anchored_parent, &[edge_01])); + Ok(()) + } + + #[test] + fn test_fact_anchor_penalty_skips_when_one_side_has_distinct_stats() -> Result<()> { + let cardinalities = [50_000_000.0, 1_920_800.0, 1_920_800.0, 20_000.0, 10_000.0]; + let distinct_stats = [None, Some(1000), None, None, None]; + let mut graph = 
create_graph_with_custom_distinct_stats(&cardinalities, &distinct_stats)?; + + let _edge_01 = add_equi_join_edge(&mut graph, 0, 1)?; + let _edge_02 = add_equi_join_edge(&mut graph, 0, 2)?; + let _edge_03 = add_equi_join_edge(&mut graph, 0, 3)?; + let _edge_04 = add_equi_join_edge(&mut graph, 0, 4)?; + let edge_12 = add_equi_join_edge(&mut graph, 1, 2)?; + + let enumerator = PlanEnumerator::new(graph); + assert!(enumerator.enable_fact_anchor_heuristic); + + let dim_parent = JoinSet::from_iter([1, 2])?; + assert!(!enumerator.should_apply_fact_anchor_penalty(dim_parent, &[edge_12])); + Ok(()) + } + + #[test] + fn test_fact_anchor_heuristic_disabled_when_no_clear_anchor_shape() -> Result<()> { + let cardinalities = [1_000.0, 950.0, 900.0, 850.0, 800.0]; + let distinct_stats = [None, None, None, None, None]; + let graph = create_graph_with_custom_distinct_stats(&cardinalities, &distinct_stats)?; + + let enumerator = PlanEnumerator::new(graph); + assert!(!enumerator.enable_fact_anchor_heuristic); + Ok(()) + } } diff --git a/crates/sail-physical-optimizer/src/join_reorder/graph.rs b/crates/sail-physical-optimizer/src/join_reorder/graph.rs index 553c3b28ca..b392385937 100644 --- a/crates/sail-physical-optimizer/src/join_reorder/graph.rs +++ b/crates/sail-physical-optimizer/src/join_reorder/graph.rs @@ -1,7 +1,8 @@ use std::collections::HashMap; +use std::hash::{Hash, Hasher}; use std::sync::Arc; -use datafusion::common::Statistics; +use datafusion::common::{NullEquality, Statistics}; use datafusion::error::{DataFusionError, Result}; use datafusion::logical_expr::JoinType; use datafusion::physical_expr::PhysicalExpr; @@ -10,13 +11,55 @@ use datafusion::physical_plan::ExecutionPlan; use crate::join_reorder::join_set::JoinSet; /// Represents a stable column identifier across the query graph. 
-#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone)] pub struct StableColumn { pub relation_id: usize, pub column_index: usize, pub name: String, } +impl StableColumn { + /// Build the canonical stable-column name used across join reordering. + pub fn format_stable_name(relation_id: usize, column_index: usize) -> String { + format!("R{}.C{}", relation_id, column_index) + } + + /// Parse a stable-column name like "R{rel}.C{col}" -> (rel, col). + pub fn parse_stable_name(name: &str) -> Option<(usize, usize)> { + if !name.starts_with('R') { + return None; + } + let dot = name.find('.')?; + let rel_str = &name[1..dot]; + if !name[dot + 1..].starts_with('C') { + return None; + } + let col_str = &name[dot + 2..]; + let rel = rel_str.parse::().ok()?; + let col = col_str.parse::().ok()?; + Some((rel, col)) + } +} + +// NOTE: `name` is for display/debugging only and must not participate in identity. +// Join reordering uses StableColumn as a key in HashMaps/Sets. Column names can vary +// (projection aliases, empty placeholder names, etc.) while (relation_id, column_index) +// remain stable within the query graph. +impl PartialEq for StableColumn { + fn eq(&self, other: &Self) -> bool { + self.relation_id == other.relation_id && self.column_index == other.column_index + } +} + +impl Eq for StableColumn {} + +impl Hash for StableColumn { + fn hash(&self, state: &mut H) { + self.relation_id.hash(state); + self.column_index.hash(state); + } +} + /// Represents a single reorderable relation (e.g., TableScanExec). #[derive(Debug, Clone)] pub struct RelationNode { @@ -26,6 +69,8 @@ pub struct RelationNode { pub relation_id: usize, /// Initial cardinality estimate. pub initial_cardinality: f64, + /// Base cardinality before local filters are applied. + pub base_cardinality: f64, /// Statistics provided by DataFusion. pub statistics: Statistics, // TODO: Enhance statistics and its usage. 
@@ -36,12 +81,14 @@ impl RelationNode { plan: Arc, relation_id: usize, initial_cardinality: f64, + base_cardinality: f64, statistics: Statistics, ) -> Self { Self { plan, relation_id, initial_cardinality, + base_cardinality, statistics, } } @@ -57,6 +104,8 @@ pub struct JoinEdge { pub filter: Arc, /// Join type (Inner for reorderable joins). pub join_type: JoinType, + /// Null semantics for equi-join key comparison. + pub null_equality: NullEquality, // pub selectivity: f64, /// Parsed equi-join pairs from the join condition @@ -74,6 +123,7 @@ impl JoinEdge { join_set, filter, join_type, + null_equality: NullEquality::NullEqualsNothing, equi_pairs, } } @@ -395,7 +445,24 @@ impl QueryGraph { ); } - edge_indices.into_iter().collect() + let union = left | right; + + // NOTE: The trie neighbor lookup can surface hyperedges that *overlap* `left` and + // `right` but require additional relations not yet present. Those edges must not be used + // to connect two subsets in the DP enumerator, otherwise we may materialize only part of a + // multi-relation join predicate (e.g. split a compound join key across different joins), + // creating huge intermediates. + edge_indices + .into_iter() + .filter(|&idx| { + let Some(edge) = self.edges.get(idx) else { + return false; + }; + edge.join_set.is_subset(&union) + && !edge.join_set.is_disjoint(&left) + && !edge.join_set.is_disjoint(&right) + }) + .collect() } /// Finds connecting edges for all subsets of given size. 
@@ -517,7 +584,19 @@ mod tests { false, )])); let plan = Arc::new(EmptyExec::new(schema.clone())); - RelationNode::new(plan, id, 1000.0, Statistics::new_unknown(&schema)) + RelationNode::new(plan, id, 1000.0, 1000.0, Statistics::new_unknown(&schema)) + } + + #[test] + fn test_stable_name_round_trip() { + let samples = [(0usize, 0usize), (1, 3), (12, 99), (63, 7)]; + for (relation_id, column_index) in samples { + let stable = StableColumn::format_stable_name(relation_id, column_index); + assert_eq!( + StableColumn::parse_stable_name(&stable), + Some((relation_id, column_index)) + ); + } } #[test] @@ -576,6 +655,7 @@ mod tests { plan, i, 1000.0, + 1000.0, datafusion::common::Statistics::new_unknown(&schema), ); graph.add_relation(relation); @@ -637,6 +717,7 @@ mod tests { plan, i, 1000.0, + 1000.0, datafusion::common::Statistics::new_unknown(&schema), ); graph.add_relation(relation); @@ -657,5 +738,15 @@ mod tests { let set_01 = JoinSet::from_iter([0, 1]).unwrap(); let neighbors_01 = graph.get_neighbors(set_01); assert!(neighbors_01.contains(&2)); + + // A hyperedge {0,1,2} must NOT be treated as a binary connecting edge between {0} and {1} + // because the join condition for that edge isn't fully available until relation 2 is + // present. 
+ let set_1 = JoinSet::new_singleton(1).unwrap(); + let connecting_edge_indices = graph.get_connecting_edge_indices(set_0, set_1); + assert!( + connecting_edge_indices.is_empty(), + "expected no connecting edges between {{0}} and {{1}} from a hyperedge {{0,1,2}}" + ); } } diff --git a/crates/sail-physical-optimizer/src/join_reorder/mod.rs b/crates/sail-physical-optimizer/src/join_reorder/mod.rs index afa9587be7..975a861773 100644 --- a/crates/sail-physical-optimizer/src/join_reorder/mod.rs +++ b/crates/sail-physical-optimizer/src/join_reorder/mod.rs @@ -45,7 +45,8 @@ impl PhysicalOptimizerRule for JoinReorder { displayable(plan.as_ref()).indent(true) ); - // Start the top-down region search and optimization from the root plan + // Search and optimize reorderable regions. We traverse bottom-up so nested reorderable + // regions inside "leaf" plans (as seen by a higher-level region) are also visited. self.find_and_optimize_regions(plan) } @@ -59,46 +60,41 @@ impl PhysicalOptimizerRule for JoinReorder { } impl JoinReorder { - /// Recursively searches for reorderable join regions from the top down. + /// Recursively searches for reorderable join regions bottom-up. fn find_and_optimize_regions( &self, plan: Arc, ) -> Result> { trace!("find_and_optimize_regions: Processing {}", plan.name()); - // Soft fallback: if join reordering fails for any reason, log a warning and - // continue optimizing children under the original plan. - match self.try_optimize_region(plan.clone()) { - Ok(Some(new_plan)) => return Ok(new_plan), - Ok(None) => {} - Err(e) => { - warn!( - "JoinReorder: Optimization failed for region rooted at {} (fallback to original plan): {}", - plan.name(), - e - ); - } - } - - // If no significant reorderable region was found starting at the current node, - // recursively optimize the children of the current node. 
- trace!("find_and_optimize_regions: No reorderable region found at {}, recursing to {} children", - plan.name(), plan.children().len()); - - // Allow recursion through Left Joins to find Inner Join regions below. - // Left Joins won't be included in reorderable regions but we optimize their children. - + // Optimize children first so any nested reorderable regions inside "leaf" plans + // (from the perspective of the current region) are not skipped. let optimized_children = plan .children() .into_iter() - .map(|child| self.find_and_optimize_regions(child.clone())) + .map(|child| self.find_and_optimize_regions(Arc::clone(child))) .collect::>>()?; - // Rebuild the current node with its optimized children. - if optimized_children.is_empty() { - Ok(plan) + let plan = if optimized_children.is_empty() { + plan } else { - plan.with_new_children(optimized_children) + plan.with_new_children(optimized_children)? + }; + + // Attempt to optimize a reorderable region rooted at this node. + // Soft fallback: if join reordering fails for any reason, log a warning and return + // the plan with optimized children. + match self.try_optimize_region(Arc::clone(&plan)) { + Ok(Some(new_plan)) => Ok(new_plan), + Ok(None) => Ok(plan), + Err(e) => { + warn!( + "JoinReorder: Optimization failed for region rooted at {} (fallback to children-optimized plan): {}", + plan.name(), + e + ); + Ok(plan) + } } } @@ -147,6 +143,11 @@ impl JoinReorder { let mut reconstructor = PlanReconstructor::new(&enumerator.dp_table, &enumerator.query_graph); + // Pre-compute required output columns for each join subtree based on the original + // region-root output columns. This keeps intermediate join outputs narrow before + // `JoinSelection` runs, helping avoid plan-shape regressions when we see through + // projection nodes while building the query graph. 
+ reconstructor.prepare_required_output_columns(&best_plan, &target_column_map)?; let (join_tree, final_map) = reconstructor.reconstruct(&best_plan)?; trace!( @@ -365,16 +366,33 @@ mod tests { use datafusion::arrow::datatypes::{DataType, Field, Schema}; use datafusion::common::NullEquality; use datafusion::logical_expr::{JoinType, Operator}; - use datafusion::physical_expr::expressions::{BinaryExpr, Column}; + use datafusion::physical_expr::expressions::{BinaryExpr, Column, Literal}; use datafusion::physical_expr::utils::collect_columns; use datafusion::physical_expr::PhysicalExpr; use datafusion::physical_plan::aggregates::{AggregateExec, AggregateMode, PhysicalGroupBy}; use datafusion::physical_plan::empty::EmptyExec; + use datafusion::physical_plan::filter::FilterExec; use datafusion::physical_plan::joins::{HashJoinExec, PartitionMode}; use datafusion::physical_plan::projection::ProjectionExec; + use datafusion::scalar::ScalarValue; use super::*; + fn find_node_by_name( + plan: Arc, + name: &str, + ) -> Option> { + if plan.name() == name { + return Some(plan); + } + for child in plan.children() { + if let Some(found) = find_node_by_name(Arc::clone(child), name) { + return Some(found); + } + } + None + } + /// Test that the recursive optimizer correctly processes plans with boundary nodes /// This test verifies that the optimizer doesn't crash and preserves plan structure #[test] @@ -452,6 +470,7 @@ mod tests { None, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, )?); // Create second join: (table1 ⋈ table2) ⋈ table3 @@ -470,6 +489,7 @@ mod tests { None, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, )?); // Create an AggregateExec on top of the joins @@ -547,6 +567,7 @@ mod tests { None, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, )?); // Create lower aggregate @@ -575,6 +596,7 @@ mod tests { None, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, )?); let upper_join2 = 
Arc::new(HashJoinExec::try_new( @@ -586,6 +608,7 @@ mod tests { None, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, )?); // Create upper aggregate @@ -610,6 +633,97 @@ mod tests { Ok(()) } + /// Regression test: nested reorderable regions inside leaf nodes of a higher-level region + /// must still be optimized. This requires a bottom-up traversal. + #[test] + fn test_nested_reorderable_region_under_leaf_is_optimized() -> Result<()> { + // Tables use the same simple schema so join conditions can consistently reference `id`. + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + + let t1 = Arc::new(EmptyExec::new(schema.clone())); + let t2 = Arc::new(EmptyExec::new(schema.clone())); + let t3 = Arc::new(EmptyExec::new(schema.clone())); + let t4 = Arc::new(EmptyExec::new(schema.clone())); + let t5 = Arc::new(EmptyExec::new(schema.clone())); + + let on = vec![( + Arc::new(Column::new("id", 0)) as Arc, + Arc::new(Column::new("id", 0)) as Arc, + )]; + + // Build a nested reorderable region (3 relations): (t1 ⋈ t2) ⋈ t3 + let join12 = Arc::new(HashJoinExec::try_new( + t1, + t2, + on.clone(), + None, + &JoinType::Inner, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + let join123 = Arc::new(HashJoinExec::try_new( + join12, + t3, + on.clone(), + None, + &JoinType::Inner, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + // Wrap the nested joins in a boundary node that GraphBuilder treats as a leaf. + // FilterExec is such a boundary (it has a child but isn't "see-through" for graph building). 
+ let pred: Arc = Arc::new(BinaryExpr::new( + Arc::new(Column::new("id", 0)), + Operator::Gt, + Arc::new(Literal::new(ScalarValue::Int32(Some(0)))), + )); + let filtered_subplan = Arc::new(FilterExec::try_new(pred, join123)?); + + // Higher-level reorderable region (3 relations): (filtered_subplan ⋈ t4) ⋈ t5 + let join4 = Arc::new(HashJoinExec::try_new( + filtered_subplan, + t4, + on.clone(), + None, + &JoinType::Inner, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + let root = Arc::new(HashJoinExec::try_new( + join4, + t5, + on, + None, + &JoinType::Inner, + None, + PartitionMode::Auto, + NullEquality::NullEqualsNothing, + false, // null_aware + )?); + + let join_reorder = JoinReorder::new(); + let optimized_plan = join_reorder.find_and_optimize_regions(root)?; + + // Root region should be optimized (>= 3 relations), producing a ProjectionExec. + assert_eq!(optimized_plan.name(), "ProjectionExec"); + + // The nested region under FilterExec must also be optimized, producing its own ProjectionExec. + #[expect(clippy::expect_used)] + let filter_node = find_node_by_name(optimized_plan, "FilterExec") + .expect("expected FilterExec leaf to remain in the optimized plan"); + assert_eq!(filter_node.children().len(), 1); + assert_eq!(filter_node.children()[0].name(), "ProjectionExec"); + + Ok(()) + } + /// Test that the join reorder optimizer correctly handles complex expressions in projections /// This test verifies that expressions referencing columns from different tables are properly /// rewritten when the join order changes. @@ -648,6 +762,7 @@ mod tests { None, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, )?); let join_abc_on = vec![( @@ -664,6 +779,7 @@ mod tests { None, PartitionMode::Auto, NullEquality::NullEqualsNothing, + false, )?); // 3. 
Create a ProjectionExec on top with a complex expression diff --git a/crates/sail-physical-optimizer/src/join_reorder/reconstructor.rs b/crates/sail-physical-optimizer/src/join_reorder/reconstructor.rs index 9037adfe98..054a4cc5ab 100644 --- a/crates/sail-physical-optimizer/src/join_reorder/reconstructor.rs +++ b/crates/sail-physical-optimizer/src/join_reorder/reconstructor.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use datafusion::common::{JoinSide, NullEquality}; @@ -12,7 +12,9 @@ use datafusion::physical_plan::joins::utils::{ColumnIndex, JoinFilter}; use datafusion::physical_plan::joins::{ CrossJoinExec, HashJoinExec, NestedLoopJoinExec, PartitionMode, }; +use datafusion::physical_plan::projection::{ProjectionExec, ProjectionExpr}; use datafusion::physical_plan::ExecutionPlan; +use log::warn; use crate::join_reorder::builder::{ColumnMap, ColumnMapEntry}; use crate::join_reorder::dp_plan::{DPPlan, PlanType}; @@ -33,6 +35,11 @@ pub struct PlanReconstructor<'a> { plan_cache: HashMap, ColumnMap)>, /// Pending filters that couldn't be applied yet due to missing dependencies pending_filters: Vec, + /// Required stable columns for each JoinSet in the chosen join tree. + /// + /// When present, the reconstructor will add projections to keep join inputs/outputs narrow + /// before `JoinSelection` runs, which can significantly impact physical operator choices. 
+ required_output_cols: HashMap>, } /// Represents a filter that couldn't be applied yet due to missing table dependencies @@ -51,23 +58,211 @@ impl<'a> PlanReconstructor<'a> { query_graph, plan_cache: HashMap::new(), pending_filters: Vec::new(), + required_output_cols: HashMap::new(), } } - /// Parse stable column name like "R{rel}.C{col}" -> (rel, col) - fn parse_stable_name(name: &str) -> Option<(usize, usize)> { - if !name.starts_with('R') { - return None; + /// Prepare a per-JoinSet "required output columns" map for the selected join tree rooted at + /// `root_dp_plan`. The root requirements are derived from `target_map` (i.e. the original + /// region-root output columns), and then pushed down through the join tree while adding any + /// columns needed to evaluate each join's predicates. + pub fn prepare_required_output_columns( + &mut self, + root_dp_plan: &DPPlan, + target_map: &ColumnMap, + ) -> Result<()> { + let root_required = self.collect_required_stable_cols_from_column_map(target_map)?; + self.populate_required_for_plan(root_dp_plan, &root_required)?; + Ok(()) + } + + fn collect_required_stable_cols_from_column_map( + &self, + map: &ColumnMap, + ) -> Result> { + let mut out: HashSet<(usize, usize)> = HashSet::new(); + for entry in map { + self.collect_required_from_entry(entry, &mut out)?; + } + Ok(out) + } + + fn collect_required_from_entry( + &self, + entry: &ColumnMapEntry, + out: &mut HashSet<(usize, usize)>, + ) -> Result<()> { + match entry { + ColumnMapEntry::Stable { + relation_id, + column_index, + } => { + out.insert((*relation_id, *column_index)); + } + ColumnMapEntry::Expression { expr, input_map } => { + self.collect_required_from_expr(expr, input_map, out)?; + } + } + Ok(()) + } + + fn collect_required_from_expr( + &self, + expr: &Arc, + input_map: &ColumnMap, + out: &mut HashSet<(usize, usize)>, + ) -> Result<()> { + let cols = collect_columns(expr); + for c in &cols { + // Prefer stable names if they are already present. 
+ if let Some((rel, col_idx)) = StableColumn::parse_stable_name(c.name()) { + out.insert((rel, col_idx)); + continue; + } + + let entry = input_map.get(c.index()).ok_or_else(|| { + DataFusionError::Internal(format!( + "Expression column index {} out of bounds (len {}) while collecting required columns", + c.index(), + input_map.len() + )) + })?; + self.collect_required_from_entry(entry, out)?; + } + Ok(()) + } + + fn populate_required_for_plan( + &mut self, + dp_plan: &DPPlan, + required: &HashSet<(usize, usize)>, + ) -> Result<()> { + self.required_output_cols + .insert(dp_plan.join_set, required.clone()); + + match &dp_plan.plan_type { + PlanType::Leaf { .. } => Ok(()), + PlanType::Join { + left_set, + right_set, + edge_indices, + } => { + let left_dp_plan = self.dp_table.get(left_set).ok_or_else(|| { + DataFusionError::Internal("Left subplan not found in DP table".to_string()) + })?; + let right_dp_plan = self.dp_table.get(right_set).ok_or_else(|| { + DataFusionError::Internal("Right subplan not found in DP table".to_string()) + })?; + + let mut req_left = self.filter_required_by_join_set(required, left_set); + let mut req_right = self.filter_required_by_join_set(required, right_set); + + // Add columns needed to evaluate the join predicates at this join node. 
+ for &edge_index in edge_indices { + let edge = self.query_graph.edges.get(edge_index).ok_or_else(|| { + DataFusionError::Internal(format!( + "Edge with index {} not found", + edge_index + )) + })?; + + for (a, b) in &edge.equi_pairs { + if Self::join_set_contains_relation(left_set, a.relation_id) { + req_left.insert((a.relation_id, a.column_index)); + } else if Self::join_set_contains_relation(right_set, a.relation_id) { + req_right.insert((a.relation_id, a.column_index)); + } + if Self::join_set_contains_relation(left_set, b.relation_id) { + req_left.insert((b.relation_id, b.column_index)); + } else if Self::join_set_contains_relation(right_set, b.relation_id) { + req_right.insert((b.relation_id, b.column_index)); + } + } + + // Best-effort: add stable columns referenced in the filter expression. + for c in collect_columns(&edge.filter) { + if let Some((rel, col_idx)) = StableColumn::parse_stable_name(c.name()) { + if Self::join_set_contains_relation(left_set, rel) { + req_left.insert((rel, col_idx)); + } else if Self::join_set_contains_relation(right_set, rel) { + req_right.insert((rel, col_idx)); + } + } + } + } + + self.populate_required_for_plan(left_dp_plan, &req_left)?; + self.populate_required_for_plan(right_dp_plan, &req_right)?; + Ok(()) + } } - let dot = name.find('.')?; - let rel_str = &name[1..dot]; - if !name[dot + 1..].starts_with('C') { + } + + fn filter_required_by_join_set( + &self, + required: &HashSet<(usize, usize)>, + join_set: &JoinSet, + ) -> HashSet<(usize, usize)> { + required + .iter() + .filter(|(rel, _)| Self::join_set_contains_relation(join_set, *rel)) + .cloned() + .collect() + } + + #[inline] + fn relation_bit(relation_id: usize) -> u64 { + debug_assert!( + relation_id < u64::BITS as usize, + "relation_id {} must be < {} to fit in JoinSet bitmap", + relation_id, + u64::BITS + ); + 1u64 << relation_id + } + + #[inline] + fn join_set_contains_relation(join_set: &JoinSet, relation_id: usize) -> bool { + (join_set.bits() & 
Self::relation_bit(relation_id)) != 0 + } + + fn compute_required_projection( + &self, + join_set: JoinSet, + column_map: &ColumnMap, + ) -> Option> { + let required = self.required_output_cols.get(&join_set)?; + let keep: Vec = column_map + .iter() + .enumerate() + .filter_map(|(i, e)| match e { + ColumnMapEntry::Stable { + relation_id, + column_index, + } => required + .contains(&(*relation_id, *column_index)) + .then_some(i), + ColumnMapEntry::Expression { .. } => None, + }) + .collect(); + + if keep.is_empty() { + if !required.is_empty() && !column_map.is_empty() { + warn!( + "JoinReorder: required columns resolved to empty projection for join_set bits={:#x} (required={}, columns={})", + join_set.bits(), + required.len(), + column_map.len() + ); + debug_assert!( + required.is_empty() || column_map.is_empty(), + "non-empty required columns should not produce empty projection indices" + ); + } return None; } - let col_str = &name[dot + 2..]; - let rel = rel_str.parse::().ok()?; - let col = col_str.parse::().ok()?; - Some((rel, col)) + + (keep.len() < column_map.len()).then_some(keep) } /// Main entry point: recursively reconstruct ExecutionPlan from DPPlan @@ -79,7 +274,9 @@ impl<'a> PlanReconstructor<'a> { } let mut result = match &dp_plan.plan_type { - PlanType::Leaf { relation_id } => self.reconstruct_leaf(*relation_id)?, + PlanType::Leaf { relation_id } => { + self.reconstruct_leaf(*relation_id, dp_plan.join_set)? + } PlanType::Join { left_set, right_set, @@ -120,21 +317,39 @@ impl<'a> PlanReconstructor<'a> { } /// Reconstruct leaf node (single relation). 
- fn reconstruct_leaf(&self, relation_id: usize) -> Result<(Arc, ColumnMap)> { + fn reconstruct_leaf( + &self, + relation_id: usize, + join_set: JoinSet, + ) -> Result<(Arc, ColumnMap)> { let relation_node = self.query_graph.get_relation(relation_id).ok_or_else(|| { DataFusionError::Internal(format!("Relation {} not found in query graph", relation_id)) })?; - let plan = relation_node.plan.clone(); + let mut plan = relation_node.plan.clone(); // Create a fresh ColumnMap for this base relation - let column_map = (0..plan.schema().fields().len()) + let mut column_map: ColumnMap = (0..plan.schema().fields().len()) .map(|i| ColumnMapEntry::Stable { relation_id, column_index: i, }) .collect(); + // Apply leaf projection if required columns were precomputed. + if let Some(keep) = self.compute_required_projection(join_set, &column_map) { + let exprs: Vec = keep + .iter() + .map(|&i| ProjectionExpr { + expr: Arc::new(Column::new(plan.schema().field(i).name(), i)), + alias: plan.schema().field(i).name().to_string(), + }) + .collect(); + let proj = ProjectionExec::try_new(exprs, plan)?; + plan = Arc::new(proj); + column_map = keep.into_iter().map(|i| column_map[i].clone()).collect(); + } + Ok((plan, column_map)) } @@ -145,6 +360,7 @@ impl<'a> PlanReconstructor<'a> { right_set: JoinSet, edge_indices: &[usize], ) -> Result<(Arc, ColumnMap)> { + let current_join_set = left_set | right_set; // Find left and right subplans from DP table let left_dp_plan = self.dp_table.get(&left_set).ok_or_else(|| { DataFusionError::Internal("Left subplan not found in DP table".to_string()) @@ -153,36 +369,52 @@ impl<'a> PlanReconstructor<'a> { DataFusionError::Internal("Right subplan not found in DP table".to_string()) })?; - // Recursively reconstruct left and right subplans - let (left_plan, left_map) = self.reconstruct(left_dp_plan)?; - let (right_plan, right_map) = self.reconstruct(right_dp_plan)?; + // Determine join type from edge information before deciding whether we can swap sides. 
+ let join_type = self.determine_join_type(edge_indices)?; + let null_equality = self.determine_null_equality(edge_indices)?; + + // Build/probe side reordering is semantics-preserving only for inner joins. + let should_swap_for_build = + join_type == JoinType::Inner && left_dp_plan.cardinality > right_dp_plan.cardinality; + let (build_set, probe_set, build_dp, probe_dp) = if should_swap_for_build { + (right_set, left_set, right_dp_plan, left_dp_plan) + } else { + (left_set, right_set, left_dp_plan, right_dp_plan) + }; + + // Recursively reconstruct build and probe subplans + let (build_plan, build_map) = self.reconstruct(build_dp)?; + let (probe_plan, probe_map) = self.reconstruct(probe_dp)?; // Build physical join conditions let on_conditions = self.build_join_conditions( edge_indices, - &left_map, - &right_map, - &left_plan, - &right_plan, + &build_map, + &probe_map, + &build_plan, + &probe_plan, )?; - // Determine join type from edge information - let join_type = self.determine_join_type(edge_indices)?; + debug_assert!( + !should_swap_for_build || join_type == JoinType::Inner, + "JoinReorder must only swap build/probe sides for inner joins" + ); // Build join filter for non-equi conditions let join_filter = self.build_join_filter( edge_indices, - &left_map, - &right_map, - &left_plan, - &right_plan, - left_set, - right_set, + &build_map, + &probe_map, + &build_plan, + &probe_plan, + build_set, + probe_set, )?; // Merge left and right ColumnMap to create output ColumnMap for new Join plan - let mut join_output_map = left_map; - join_output_map.extend(right_map); + let mut join_output_map = build_map; + join_output_map.extend(probe_map); + let projection = self.compute_required_projection(current_join_set, &join_output_map); // If there are no connecting edges, this is a cartesian product. HashJoinExec does not // support empty join keys; use CrossJoinExec instead to avoid optimizer-stage crashes. 
@@ -191,31 +423,61 @@ impl<'a> PlanReconstructor<'a> { // Theta join: no equi-join pairs were reconstructed, but we have a join predicate. // Use NestedLoopJoinExec which supports joins without equi-keys. let join_plan = Arc::new(NestedLoopJoinExec::try_new( - left_plan, - right_plan, + build_plan, + probe_plan, Some(join_filter), &join_type, - None, // projection + projection.clone(), // projection )?); + + if let Some(projection) = projection { + join_output_map = projection + .into_iter() + .map(|i| join_output_map[i].clone()) + .collect(); + } return Ok((join_plan, join_output_map)); } - let join_plan = Arc::new(CrossJoinExec::new(left_plan, right_plan)); + let join_plan = Arc::new(CrossJoinExec::new(build_plan, probe_plan)); + // CrossJoinExec does not support a built-in projection; wrap with ProjectionExec if needed. + if let Some(keep) = projection.clone() { + let exprs: Vec = keep + .iter() + .map(|&i| ProjectionExpr { + expr: Arc::new(Column::new(join_plan.schema().field(i).name(), i)), + alias: join_plan.schema().field(i).name().to_string(), + }) + .collect(); + let proj = Arc::new(ProjectionExec::try_new(exprs, join_plan)?); + join_output_map = keep + .into_iter() + .map(|i| join_output_map[i].clone()) + .collect(); + return Ok((proj, join_output_map)); + } return Ok((join_plan, join_output_map)); } // Otherwise, create HashJoinExec let join_plan = Arc::new(HashJoinExec::try_new( - left_plan, - right_plan, + build_plan, + probe_plan, on_conditions, join_filter, // Use JoinEdge.filter for non-equi conditions &join_type, // Use determined join type - None, // projection + projection.clone(), // projection PartitionMode::Auto, // partition_mode - NullEquality::NullEqualsNothing, // TODO: Skip the optimizer completely - // if NullEquality is something else in the input region. 
+ null_equality, + false, // null_aware )?); + if let Some(projection) = projection { + join_output_map = projection + .into_iter() + .map(|i| join_output_map[i].clone()) + .collect(); + } + Ok((join_plan, join_output_map)) } @@ -304,6 +566,34 @@ impl<'a> PlanReconstructor<'a> { Ok(JoinType::Inner) } + /// Determines null-equality semantics from edge information. + /// + /// If multiple edges participate in one reconstructed join, they must agree on + /// null-equality behavior. Mixed semantics are rejected because a single + /// HashJoinExec has one null-equality mode. + fn determine_null_equality(&self, edge_indices: &[usize]) -> Result { + let mut selected: Option = None; + + for &edge_index in edge_indices { + let edge = self.query_graph.edges.get(edge_index).ok_or_else(|| { + DataFusionError::Internal(format!("Edge with index {} not found", edge_index)) + })?; + + if let Some(existing) = selected { + if existing != edge.null_equality { + return Err(DataFusionError::Internal(format!( + "Inconsistent null_equality across join edges: {:?} vs {:?}", + existing, edge.null_equality + ))); + } + } else { + selected = Some(edge.null_equality); + } + } + + Ok(selected.unwrap_or(NullEquality::NullEqualsNothing)) + } + /// Builds join filter for non-equi conditions from edge information. 
fn build_join_filter( &mut self, @@ -409,7 +699,7 @@ impl<'a> PlanReconstructor<'a> { // Find side and base index for a column, supporting stable names and schema field names let find_side_and_index = |col: &Column| -> Result> { - if let Some((rel, cidx)) = Self::parse_stable_name(col.name()) { + if let Some((rel, cidx)) = StableColumn::parse_stable_name(col.name()) { // Look up in left_map by stable, else right_map if let Some(pos) = left_map.iter().position(|e| matches!(e, ColumnMapEntry::Stable{ relation_id, column_index } if *relation_id==rel && *column_index==cidx)) { return Ok(Some((JoinSide::Left, pos))); @@ -419,19 +709,39 @@ impl<'a> PlanReconstructor<'a> { } } // Fallback by matching current plan schema names. - // If the name exists on both sides, it's ambiguous and unsafe to guess. - let left_match = left_plan + // If there are duplicate matches on one side or matches on both sides, + // fallback-by-name is unsafe and we fail fast. + let left_matches: Vec = left_plan .schema() .fields() .iter() - .position(|f| f.name() == col.name()); - let right_match = right_plan + .enumerate() + .filter_map(|(idx, field)| (field.name() == col.name()).then_some(idx)) + .collect(); + let right_matches: Vec = right_plan .schema() .fields() .iter() - .position(|f| f.name() == col.name()); + .enumerate() + .filter_map(|(idx, field)| (field.name() == col.name()).then_some(idx)) + .collect(); - match (left_match, right_match) { + if left_matches.len() > 1 { + return Err(DataFusionError::Internal(format!( + "Ambiguous column reference '{}' found {} times in left join input during reconstruction", + col.name(), + left_matches.len() + ))); + } + if right_matches.len() > 1 { + return Err(DataFusionError::Internal(format!( + "Ambiguous column reference '{}' found {} times in right join input during reconstruction", + col.name(), + right_matches.len() + ))); + } + + match (left_matches.first().copied(), right_matches.first().copied()) { (Some(_), Some(_)) => 
Err(DataFusionError::Internal(format!( "Ambiguous column reference '{}' found in both left and right join inputs during reconstruction", col.name() @@ -542,7 +852,7 @@ impl<'a> PlanReconstructor<'a> { let transformed = expr_arc.transform(|node| { if let Some(col) = node.as_any().downcast_ref::() { // Prefer stable name mapping first - if let Some((rel, cidx)) = self.parse_stable_column_name(col.name()) { + if let Some((rel, cidx)) = StableColumn::parse_stable_name(col.name()) { if let Some(pos) = output_map.iter().position(|e| { matches!( e, @@ -559,13 +869,23 @@ impl<'a> PlanReconstructor<'a> { } } - // Fallback: try to match by current schema field name - if let Some(pos) = plan + // Fallback: try to match by current schema field name. + // A non-unique match is unsafe to rewrite. + let matches: Vec = plan .schema() .fields() .iter() - .position(|f| f.name() == col.name()) - { + .enumerate() + .filter_map(|(idx, field)| (field.name() == col.name()).then_some(idx)) + .collect(); + if matches.len() > 1 { + return Err(DataFusionError::Internal(format!( + "Ambiguous output column '{}' found {} times while rewriting expression", + col.name(), + matches.len() + ))); + } + if let Some(pos) = matches.first().copied() { let new_col = Column::new(col.name(), pos); return Ok(Transformed::yes(Arc::new(new_col))); } @@ -601,8 +921,8 @@ impl<'a> PlanReconstructor<'a> { let cols = collect_columns(expr); for c in &cols { // If the expression already uses stable names, prefer that. - if let Some((rel, _)) = Self::parse_stable_name(c.name()) { - *bits |= 1u64 << rel; + if let Some((rel, _)) = StableColumn::parse_stable_name(c.name()) { + *bits |= Self::relation_bit(rel); continue; } @@ -616,7 +936,7 @@ impl<'a> PlanReconstructor<'a> { match entry { ColumnMapEntry::Stable { relation_id, .. 
} => { - *bits |= 1u64 << *relation_id; + *bits |= Self::relation_bit(*relation_id); } ColumnMapEntry::Expression { expr, input_map } => { self.add_relation_bits_from_expr(expr, input_map, bits)?; @@ -634,52 +954,64 @@ impl<'a> PlanReconstructor<'a> { left_plan: &Arc, right_plan: &Arc, ) -> Result { + let unique_field_index = |plan: &Arc, name: &str, side_name: &str| { + let matches: Vec = plan + .schema() + .fields() + .iter() + .enumerate() + .filter_map(|(idx, field)| (field.name() == name).then_some(idx)) + .collect(); + + match matches.as_slice() { + [] => Ok(None), + [idx] => Ok(Some(*idx)), + _ => Err(DataFusionError::Internal(format!( + "Ambiguous column '{}' matched {} fields on {} side while analyzing predicate dependencies", + name, + matches.len(), + side_name + ))), + } + }; + let mut bits: u64 = 0; let cols = collect_columns(predicate); for c in &cols { - if let Some((rel, _)) = Self::parse_stable_name(c.name()) { - bits |= 1u64 << rel; + if let Some((rel, _)) = StableColumn::parse_stable_name(c.name()) { + bits |= Self::relation_bit(rel); continue; } - let mut matched = false; - for (i, f) in left_plan.schema().fields().iter().enumerate() { - if f.name() == c.name() { - match left_map.get(i) { - Some(ColumnMapEntry::Stable { relation_id, .. 
}) => { - bits |= 1u64 << *relation_id; - matched = true; - break; - } - Some(ColumnMapEntry::Expression { expr, input_map }) => { - self.add_relation_bits_from_expr(expr, input_map, &mut bits)?; - matched = true; - break; - } - None => {} - } + let left_match = unique_field_index(left_plan, c.name(), "left")?; + let right_match = unique_field_index(right_plan, c.name(), "right")?; + + match (left_match, right_match) { + (Some(_), Some(_)) => { + return Err(DataFusionError::Internal(format!( + "Ambiguous column '{}' found in both left and right join inputs while analyzing predicate dependencies", + c.name() + ))); } - } - if matched { - continue; - } - for (i, f) in right_plan.schema().fields().iter().enumerate() { - if f.name() == c.name() { - match right_map.get(i) { - Some(ColumnMapEntry::Stable { relation_id, .. }) => { - bits |= 1u64 << *relation_id; - matched = true; - break; - } - Some(ColumnMapEntry::Expression { expr, input_map }) => { - self.add_relation_bits_from_expr(expr, input_map, &mut bits)?; - matched = true; - break; - } - None => {} + (Some(i), None) => match left_map.get(i) { + Some(ColumnMapEntry::Stable { relation_id, .. }) => { + bits |= Self::relation_bit(*relation_id); } - } + Some(ColumnMapEntry::Expression { expr, input_map }) => { + self.add_relation_bits_from_expr(expr, input_map, &mut bits)?; + } + None => {} + }, + (None, Some(i)) => match right_map.get(i) { + Some(ColumnMapEntry::Stable { relation_id, .. 
}) => { + bits |= Self::relation_bit(*relation_id); + } + Some(ColumnMapEntry::Expression { expr, input_map }) => { + self.add_relation_bits_from_expr(expr, input_map, &mut bits)?; + } + None => {} + }, + (None, None) => {} } - let _ = matched; } Ok(JoinSet::from_bits(bits)) @@ -809,7 +1141,7 @@ impl<'a> PlanReconstructor<'a> { column_map: &ColumnMap, ) -> bool { // Check by stable column name format (R{rel}.C{col}) - if let Some((rel, cidx)) = self.parse_stable_column_name(col.name()) { + if let Some((rel, cidx)) = StableColumn::parse_stable_name(col.name()) { return column_map.iter().any(|entry| { matches!(entry, ColumnMapEntry::Stable { relation_id, column_index } if *relation_id == rel && *column_index == cidx) @@ -823,22 +1155,6 @@ impl<'a> PlanReconstructor<'a> { .any(|f| f.name() == col.name()) } - /// Parse stable column name format "R{rel}.C{col}" -> (rel, col) - fn parse_stable_column_name(&self, name: &str) -> Option<(usize, usize)> { - if !name.starts_with('R') { - return None; - } - let dot = name.find('.')?; - let rel_str = &name[1..dot]; - if !name[dot + 1..].starts_with('C') { - return None; - } - let col_str = &name[dot + 2..]; - let rel = rel_str.parse::().ok()?; - let col = col_str.parse::().ok()?; - Some((rel, col)) - } - /// Combine multiple filter expressions with AND logic. 
fn combine_filters_with_and( &self, @@ -866,11 +1182,13 @@ impl<'a> PlanReconstructor<'a> { #[expect(clippy::unwrap_used)] mod tests { use datafusion::arrow::datatypes::{DataType, Field, Schema}; - use datafusion::common::Statistics; + use datafusion::common::{NullEquality, Statistics}; + use datafusion::logical_expr::{JoinType, Operator}; + use datafusion::physical_expr::expressions::{BinaryExpr, Column}; use datafusion::physical_plan::empty::EmptyExec; use super::*; - use crate::join_reorder::graph::{QueryGraph, RelationNode}; + use crate::join_reorder::graph::{JoinEdge, QueryGraph, RelationNode}; use crate::join_reorder::join_set::JoinSet; fn create_test_graph() -> QueryGraph { @@ -882,7 +1200,7 @@ mod tests { )])); let plan = Arc::new(EmptyExec::new(schema.clone())); - let relation = RelationNode::new(plan, 0, 1000.0, Statistics::new_unknown(&schema)); + let relation = RelationNode::new(plan, 0, 1000.0, 1000.0, Statistics::new_unknown(&schema)); graph.add_relation(relation); graph @@ -910,6 +1228,243 @@ mod tests { Ok(()) } + #[test] + fn test_hash_join_build_side_prefers_smaller_cardinality() -> Result<()> { + // Two relations with different estimated cardinalities. HashJoinExec hashes the LEFT side + // (build side), so we want the smaller input on the left. 
+ + let schema_a = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let schema_b = Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, false)])); + + let plan_a: Arc = Arc::new(EmptyExec::new(schema_a.clone())); + let plan_b: Arc = Arc::new(EmptyExec::new(schema_b.clone())); + + let mut graph = QueryGraph::new(); + graph.add_relation(RelationNode::new( + plan_a, + 0, + 1_000_000.0, + 1_000_000.0, + Statistics::new_unknown(&schema_a), + )); + graph.add_relation(RelationNode::new( + plan_b, + 1, + 10.0, + 10.0, + Statistics::new_unknown(&schema_b), + )); + + let join_set = JoinSet::from_iter([0usize, 1usize].into_iter())?; + let filter: Arc = Arc::new(BinaryExpr::new( + Arc::new(Column::new("R0.C0", 0)), + Operator::Eq, + Arc::new(Column::new("R1.C0", 0)), + )); + graph.add_edge(JoinEdge::new( + join_set, + filter, + JoinType::Inner, + vec![( + StableColumn { + relation_id: 0, + column_index: 0, + name: "R0.C0".to_string(), + }, + StableColumn { + relation_id: 1, + column_index: 0, + name: "R1.C0".to_string(), + }, + )], + ))?; + + // Build a DP table where the solver chose left={0}, right={1}. + // Reconstructor should swap to build on the smaller input ({1}). 
+ let mut dp_table: HashMap> = HashMap::new(); + let leaf0 = Arc::new(DPPlan::new_leaf(0, 1_000_000.0)?); + let leaf1 = Arc::new(DPPlan::new_leaf(1, 10.0)?); + dp_table.insert(leaf0.join_set, leaf0); + dp_table.insert(leaf1.join_set, leaf1); + + let root = Arc::new(DPPlan::new_join( + JoinSet::new_singleton(0)?, + JoinSet::new_singleton(1)?, + vec![0], + 0.0, + 10.0, + )); + dp_table.insert(root.join_set, root.clone()); + + let mut reconstructor = PlanReconstructor::new(&dp_table, &graph); + let (plan, _map) = reconstructor.reconstruct(&root)?; + #[expect(clippy::expect_used)] + let hj = plan + .as_any() + .downcast_ref::() + .expect("expected HashJoinExec"); + + assert_eq!(hj.left.schema().fields()[0].name(), "b"); + assert_eq!(hj.right.schema().fields()[0].name(), "a"); + + Ok(()) + } + + #[test] + fn test_hash_join_build_side_keeps_order_when_cardinality_equal() -> Result<()> { + // When cardinalities are equal, keep the DP plan's original left/right assignment. + let schema_a = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)])); + let schema_b = Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, false)])); + + let plan_a: Arc = Arc::new(EmptyExec::new(schema_a.clone())); + let plan_b: Arc = Arc::new(EmptyExec::new(schema_b.clone())); + + let mut graph = QueryGraph::new(); + graph.add_relation(RelationNode::new( + plan_a, + 0, + 100.0, + 100.0, + Statistics::new_unknown(&schema_a), + )); + graph.add_relation(RelationNode::new( + plan_b, + 1, + 100.0, + 100.0, + Statistics::new_unknown(&schema_b), + )); + + let join_set = JoinSet::from_iter([0usize, 1usize].into_iter())?; + let filter: Arc = Arc::new(BinaryExpr::new( + Arc::new(Column::new("R0.C0", 0)), + Operator::Eq, + Arc::new(Column::new("R1.C0", 0)), + )); + graph.add_edge(JoinEdge::new( + join_set, + filter, + JoinType::Inner, + vec![( + StableColumn { + relation_id: 0, + column_index: 0, + name: "R0.C0".to_string(), + }, + StableColumn { + relation_id: 1, + column_index: 0, + 
name: "R1.C0".to_string(), + }, + )], + ))?; + + let mut dp_table: HashMap> = HashMap::new(); + let leaf0 = Arc::new(DPPlan::new_leaf(0, 100.0)?); + let leaf1 = Arc::new(DPPlan::new_leaf(1, 100.0)?); + dp_table.insert(leaf0.join_set, leaf0); + dp_table.insert(leaf1.join_set, leaf1); + + let root = Arc::new(DPPlan::new_join( + JoinSet::new_singleton(0)?, + JoinSet::new_singleton(1)?, + vec![0], + 0.0, + 100.0, + )); + dp_table.insert(root.join_set, root.clone()); + + let mut reconstructor = PlanReconstructor::new(&dp_table, &graph); + let (plan, _map) = reconstructor.reconstruct(&root)?; + #[expect(clippy::expect_used)] + let hj = plan + .as_any() + .downcast_ref::() + .expect("expected HashJoinExec"); + + assert_eq!(hj.left.schema().fields()[0].name(), "a"); + assert_eq!(hj.right.schema().fields()[0].name(), "b"); + + Ok(()) + } + + #[test] + fn test_hash_join_preserves_null_equality() -> Result<()> { + let schema_a = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])); + let schema_b = Arc::new(Schema::new(vec![Field::new("b", DataType::Int32, true)])); + + let plan_a: Arc = Arc::new(EmptyExec::new(schema_a.clone())); + let plan_b: Arc = Arc::new(EmptyExec::new(schema_b.clone())); + + let mut graph = QueryGraph::new(); + graph.add_relation(RelationNode::new( + plan_a, + 0, + 100.0, + 100.0, + Statistics::new_unknown(&schema_a), + )); + graph.add_relation(RelationNode::new( + plan_b, + 1, + 100.0, + 100.0, + Statistics::new_unknown(&schema_b), + )); + + let join_set = JoinSet::from_iter([0usize, 1usize].into_iter())?; + let filter: Arc = Arc::new(BinaryExpr::new( + Arc::new(Column::new("R0.C0", 0)), + Operator::Eq, + Arc::new(Column::new("R1.C0", 0)), + )); + let mut edge = JoinEdge::new( + join_set, + filter, + JoinType::Inner, + vec![( + StableColumn { + relation_id: 0, + column_index: 0, + name: "R0.C0".to_string(), + }, + StableColumn { + relation_id: 1, + column_index: 0, + name: "R1.C0".to_string(), + }, + )], + ); + edge.null_equality = 
NullEquality::NullEqualsNull; + graph.add_edge(edge)?; + + let mut dp_table: HashMap> = HashMap::new(); + let leaf0 = Arc::new(DPPlan::new_leaf(0, 100.0)?); + let leaf1 = Arc::new(DPPlan::new_leaf(1, 100.0)?); + dp_table.insert(leaf0.join_set, leaf0); + dp_table.insert(leaf1.join_set, leaf1); + + let root = Arc::new(DPPlan::new_join( + JoinSet::new_singleton(0)?, + JoinSet::new_singleton(1)?, + vec![0], + 0.0, + 100.0, + )); + dp_table.insert(root.join_set, root.clone()); + + let mut reconstructor = PlanReconstructor::new(&dp_table, &graph); + let (plan, _) = reconstructor.reconstruct(&root)?; + #[expect(clippy::expect_used)] + let hj = plan + .as_any() + .downcast_ref::() + .expect("expected HashJoinExec"); + + assert_eq!(hj.null_equality(), NullEquality::NullEqualsNull); + Ok(()) + } + #[test] fn test_reconstruct_join_missing_subplans() { let dp_table = HashMap::new(); // Empty table @@ -1026,6 +1581,54 @@ mod tests { Ok(()) } + #[test] + fn test_analyze_predicate_dependencies_ambiguous_left_name_fails() -> Result<()> { + let left_schema = Arc::new(Schema::new(vec![ + Field::new("dup", DataType::Int32, false), + Field::new("dup", DataType::Int32, false), + ])); + let right_schema = Arc::new(Schema::new(vec![Field::new("r", DataType::Int32, false)])); + let left_plan: Arc = Arc::new(EmptyExec::new(left_schema.clone())); + let right_plan: Arc = Arc::new(EmptyExec::new(right_schema.clone())); + + let left_map: ColumnMap = vec![ + ColumnMapEntry::Stable { + relation_id: 0, + column_index: 0, + }, + ColumnMapEntry::Stable { + relation_id: 0, + column_index: 1, + }, + ]; + let right_map: ColumnMap = vec![ColumnMapEntry::Stable { + relation_id: 1, + column_index: 0, + }]; + + let pred: Arc = Arc::new(BinaryExpr::new( + Arc::new(Column::new("dup", 0)), + Operator::Gt, + Arc::new(Column::new("R1.C0", 0)), + )); + + let dp_table = HashMap::new(); + let graph = QueryGraph::new(); + let reconstructor = PlanReconstructor::new(&dp_table, &graph); + let res = 
reconstructor.analyze_predicate_dependencies( + &pred, + &left_map, + &right_map, + &left_plan, + &right_plan, + ); + assert!( + res.is_err(), + "expected duplicate left-name dependency to error" + ); + Ok(()) + } + #[test] fn test_build_join_filter_ambiguous_name_fails() -> Result<()> { // Regression test: name-based fallback in join filter reconstruction must not silently @@ -1060,12 +1663,14 @@ mod tests { dummy_plan.clone(), 0, 1000.0, + 1000.0, Statistics::new_unknown(&dummy_schema), )); graph.add_relation(RelationNode::new( dummy_plan, 1, 1000.0, + 1000.0, Statistics::new_unknown(&dummy_schema), )); @@ -1121,6 +1726,36 @@ mod tests { Ok(()) } + #[test] + fn test_rewrite_expr_to_output_schema_ambiguous_output_name_fails() -> Result<()> { + let schema = Arc::new(Schema::new(vec![ + Field::new("dup", DataType::Int32, false), + Field::new("dup", DataType::Int32, false), + ])); + let plan: Arc = Arc::new(EmptyExec::new(schema.clone())); + let output_map: ColumnMap = vec![ + ColumnMapEntry::Stable { + relation_id: 0, + column_index: 0, + }, + ColumnMapEntry::Stable { + relation_id: 1, + column_index: 0, + }, + ]; + + let expr: Arc = Arc::new(Column::new("dup", 0)); + let dp_table = HashMap::new(); + let graph = QueryGraph::new(); + let reconstructor = PlanReconstructor::new(&dp_table, &graph); + let res = reconstructor.rewrite_expr_to_output_schema(&expr, &plan, &output_map); + assert!( + res.is_err(), + "expected ambiguous output-name rewrite to error" + ); + Ok(()) + } + #[test] fn test_reconstruct_cartesian_product_uses_cross_join_exec() -> Result<()> { // Build a graph with 3 relations, but only one join edge between (0,1). 
@@ -1141,7 +1776,8 @@ mod tests { let mut graph = QueryGraph::new(); for i in 0..3 { let plan = Arc::new(EmptyExec::new(schema.clone())); - let relation = RelationNode::new(plan, i, 1000.0, Statistics::new_unknown(&schema)); + let relation = + RelationNode::new(plan, i, 1000.0, 1000.0, Statistics::new_unknown(&schema)); graph.add_relation(relation); } @@ -1227,7 +1863,8 @@ mod tests { let mut graph = QueryGraph::new(); for i in 0..2 { let plan = Arc::new(EmptyExec::new(schema.clone())); - let relation = RelationNode::new(plan, i, 1000.0, Statistics::new_unknown(&schema)); + let relation = + RelationNode::new(plan, i, 1000.0, 1000.0, Statistics::new_unknown(&schema)); graph.add_relation(relation); } diff --git a/crates/sail-physical-optimizer/src/lib.rs b/crates/sail-physical-optimizer/src/lib.rs index d5b7119a11..04b56c0fc7 100644 --- a/crates/sail-physical-optimizer/src/lib.rs +++ b/crates/sail-physical-optimizer/src/lib.rs @@ -1,7 +1,6 @@ use std::sync::Arc; use datafusion::physical_optimizer::aggregate_statistics::AggregateStatistics; -use datafusion::physical_optimizer::coalesce_batches::CoalesceBatches; use datafusion::physical_optimizer::combine_partial_final_agg::CombinePartialFinalAggregate; use datafusion::physical_optimizer::enforce_distribution::EnforceDistribution; use datafusion::physical_optimizer::enforce_sorting::EnforceSorting; @@ -19,9 +18,13 @@ use datafusion::physical_optimizer::topk_aggregation::TopKAggregation; use datafusion::physical_optimizer::update_aggr_exprs::OptimizeAggregateOrder; use datafusion::physical_optimizer::PhysicalOptimizerRule; +use crate::barrier::EnforceBarrierPartitioning; +use crate::collect_left::RewriteCollectLeftHashJoin; use crate::explicit_repartition::RewriteExplicitRepartition; use crate::join_reorder::JoinReorder; +mod barrier; +mod collect_left; mod explicit_repartition; mod join_reorder; @@ -48,7 +51,6 @@ pub fn get_physical_optimizers( rules.push(Arc::new(EnforceSorting::new())); 
rules.push(Arc::new(OptimizeAggregateOrder::new())); rules.push(Arc::new(ProjectionPushdown::new())); - rules.push(Arc::new(CoalesceBatches::new())); rules.push(Arc::new(OutputRequirements::new_remove_mode())); rules.push(Arc::new(TopKAggregation::new())); rules.push(Arc::new(LimitPushPastWindows::new())); @@ -58,6 +60,8 @@ pub fn get_physical_optimizers( rules.push(Arc::new(EnsureCooperative::new())); rules.push(Arc::new(FilterPushdown::new_post_optimization())); rules.push(Arc::new(RewriteExplicitRepartition::new())); + rules.push(Arc::new(RewriteCollectLeftHashJoin::new())); + rules.push(Arc::new(EnforceBarrierPartitioning::new())); rules.push(Arc::new(SanityCheckPlan::new())); rules diff --git a/crates/sail-physical-plan/Cargo.toml b/crates/sail-physical-plan/Cargo.toml index 4d54fff480..3ece989d89 100644 --- a/crates/sail-physical-plan/Cargo.toml +++ b/crates/sail-physical-plan/Cargo.toml @@ -7,6 +7,7 @@ edition = { workspace = true } workspace = true [dependencies] +sail-catalog = { path = "../sail-catalog" } sail-common-datafusion = { path = "../sail-common-datafusion" } sail-logical-plan = { path = "../sail-logical-plan" } diff --git a/crates/sail-physical-plan/src/barrier.rs b/crates/sail-physical-plan/src/barrier.rs new file mode 100644 index 0000000000..5f3913c015 --- /dev/null +++ b/crates/sail-physical-plan/src/barrier.rs @@ -0,0 +1,138 @@ +use std::any::Any; +use std::sync::Arc; + +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; +use datafusion_common::{exec_err, Result}; +use futures::{StreamExt, TryStreamExt}; + +/// A physical plan node that enforces a barrier between preconditions and the actual plan. +/// +/// This node has `n` children (`n >= 1`), where the first `n - 1` children (possibly none) +/// are preconditions and the last child is the actual plan. 
+/// +/// When `execute(partition, context)` is called, it exhausts partition `p` of each precondition +/// child sequentially before executing and returning partition `p` of the actual plan. This +/// guarantees that the side effects of the preconditions (e.g. catalog commands) are complete +/// before the actual plan starts producing output. +/// +/// **Partition matching**: The `EnforceBarrierPartitioning` physical optimizer rule (which +/// runs at the end of the optimizer pipeline) ensures that all preconditions are wrapped with +/// `RepartitionExec` (round-robin) or `CoalescePartitionsExec` to match the partition count +/// of the actual plan. By wrapping the preconditions this way, the actual plan will not start +/// until all partitions of the preconditions are completed, even if we only call `execute()` for +/// one partition of the precondition. +/// +/// **Distributed execution note**: In distributed processing, `BarrierExec` does not prevent +/// tasks in dependent stages from being _scheduled_. The barrier is only meaningful within a +/// single stage: the write node and `BarrierExec` belong to the same stage, so the actual write +/// only starts after preconditions for the corresponding partition have been exhausted. 
+#[derive(Debug, Clone)] +pub struct BarrierExec { + preconditions: Vec>, + plan: Arc, + properties: Arc, +} + +impl BarrierExec { + pub fn new(preconditions: Vec>, plan: Arc) -> Self { + let properties = Arc::new(plan.properties().as_ref().clone()); + Self { + preconditions, + plan, + properties, + } + } + + pub fn preconditions(&self) -> &[Arc] { + &self.preconditions + } + + pub fn plan(&self) -> &Arc { + &self.plan + } +} + +impl DisplayAs for BarrierExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "BarrierExec") + } +} + +impl ExecutionPlan for BarrierExec { + fn name(&self) -> &'static str { + Self::static_name() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn benefits_from_input_partitioning(&self) -> Vec { + // Never repartition children based on input partitioning heuristics; + // partition alignment is handled explicitly by `EnforceBarrierPartitioning`. + vec![false; self.children().len()] + } + + fn children(&self) -> Vec<&Arc> { + self.preconditions + .iter() + .chain(std::iter::once(&self.plan)) + .collect() + } + + fn with_new_children( + self: Arc, + mut children: Vec>, + ) -> Result> { + let plan = children.pop().ok_or_else(|| { + datafusion_common::DataFusionError::Internal(format!( + "{} requires at least 1 child (the actual plan)", + self.name() + )) + })?; + Ok(Arc::new(Self::new(children, plan))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + let num_partitions = self.properties.output_partitioning().partition_count(); + if partition >= num_partitions { + return exec_err!( + "{}: partition index {} out of range ({})", + self.name(), + partition, + num_partitions + ); + } + // Collect precondition streams to exhaust before running the actual plan. 
+ let streams: Vec = self + .preconditions + .iter() + .map(|precondition| precondition.execute(partition, context.clone())) + .collect::>()?; + let plan = self.plan.clone(); + let schema = self.schema(); + // Exhaust each precondition stream sequentially, then run the actual plan. + // We use a once-stream that resolves to the actual plan stream, then flatten. + let outer = futures::stream::once(async move { + for mut stream in streams { + while let Some(batch) = stream.next().await { + // Discard the batch; we only care about side effects. + batch?; + } + } + plan.execute(partition, context) + }) + .try_flatten(); + Ok(Box::pin(RecordBatchStreamAdapter::new(schema, outer))) + } +} diff --git a/crates/sail-physical-plan/src/catalog_command.rs b/crates/sail-physical-plan/src/catalog_command.rs new file mode 100644 index 0000000000..f8dc7c6208 --- /dev/null +++ b/crates/sail-physical-plan/src/catalog_command.rs @@ -0,0 +1,104 @@ +use std::any::Any; +use std::sync::Arc; + +use datafusion::arrow::datatypes::SchemaRef; +use datafusion::execution::{SendableRecordBatchStream, TaskContext}; +use datafusion::physical_expr::{EquivalenceProperties, Partitioning}; +use datafusion::physical_plan::execution_plan::{Boundedness, EmissionType}; +use datafusion::physical_plan::stream::RecordBatchStreamAdapter; +use datafusion::physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties}; +use datafusion_common::{exec_err, internal_err, Result}; +use sail_catalog::command::CatalogCommand; +use sail_catalog::manager::CatalogManager; +use sail_common_datafusion::extension::SessionExtensionAccessor; + +/// A physical plan node that executes a [`CatalogCommand`]. +/// +/// This node has a single output partition and no children. +/// When executed, it delegates to [`CatalogCommand::execute()`] using the [`TaskContext`] +/// to obtain both the [`CatalogManager`] and any session-level services. 
+#[derive(Debug, Clone)] +pub struct CatalogCommandExec { + command: CatalogCommand, + schema: SchemaRef, + properties: Arc, +} + +impl CatalogCommandExec { + pub fn new(command: CatalogCommand, schema: SchemaRef) -> Self { + let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(schema.clone()), + Partitioning::UnknownPartitioning(1), + EmissionType::Final, + Boundedness::Bounded, + )); + Self { + command, + schema, + properties, + } + } + + pub fn command(&self) -> &CatalogCommand { + &self.command + } +} + +impl DisplayAs for CatalogCommandExec { + fn fmt_as(&self, _t: DisplayFormatType, f: &mut std::fmt::Formatter) -> std::fmt::Result { + write!(f, "CatalogCommandExec: {}", self.command.name()) + } +} + +impl ExecutionPlan for CatalogCommandExec { + fn name(&self) -> &'static str { + Self::static_name() + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> Result> { + if !children.is_empty() { + return internal_err!("{} should not have children", self.name()); + } + Ok(self) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> Result { + if partition != 0 { + return exec_err!( + "{} expects only partition 0 but got {}", + self.name(), + partition + ); + } + let command = self.command.clone(); + let schema = self.schema.clone(); + let stream = futures::stream::once(async move { + let manager = context.extension::()?; + let batch = command + .execute(context.as_ref(), manager.as_ref()) + .await + .map_err(|e| datafusion_common::exec_datafusion_err!("{e}"))?; + Ok(batch) + }); + Ok(Box::pin(RecordBatchStreamAdapter::new(schema, stream))) + } +} diff --git a/crates/sail-physical-plan/src/file_write.rs b/crates/sail-physical-plan/src/file_write.rs index 9edef478ef..a38c18b13d 100644 --- a/crates/sail-physical-plan/src/file_write.rs +++ 
b/crates/sail-physical-plan/src/file_write.rs @@ -19,12 +19,12 @@ pub async fn create_file_write_physical_plan( options: FileWriteOptions, ) -> Result> { let FileWriteOptions { - path, format, mode, partition_by, sort_by, bucket_by, + table_properties, options, } = options; let mode = match mode { @@ -44,11 +44,11 @@ pub async fn create_file_write_physical_plan( let sort_order = create_sort_order(ctx, sort_by, logical_input.schema())?; let info = SinkInfo { input: physical_input, - path, mode, partition_by, bucket_by, sort_order, + table_properties: table_properties.into_iter().collect(), // TODO: detect duplicated keys in each set of options options: options .into_iter() diff --git a/crates/sail-physical-plan/src/lib.rs b/crates/sail-physical-plan/src/lib.rs index 89c60c2cf2..a521aa62bf 100644 --- a/crates/sail-physical-plan/src/lib.rs +++ b/crates/sail-physical-plan/src/lib.rs @@ -1,3 +1,5 @@ +pub mod barrier; +pub mod catalog_command; pub mod file_delete; pub mod file_write; pub mod format_tag; diff --git a/crates/sail-physical-plan/src/map_partitions.rs b/crates/sail-physical-plan/src/map_partitions.rs index 564a4e6618..d83926de4d 100644 --- a/crates/sail-physical-plan/src/map_partitions.rs +++ b/crates/sail-physical-plan/src/map_partitions.rs @@ -18,19 +18,19 @@ use tokio_stream::StreamExt; pub struct MapPartitionsExec { input: Arc, udf: Arc, - properties: PlanProperties, + properties: Arc, } impl MapPartitionsExec { pub fn new(input: Arc, udf: Arc, schema: SchemaRef) -> Self { // The plan output schema can be different from the output schema of the UDF // due to field renaming. 
- let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema.clone()), input.output_partitioning().clone(), input.pipeline_behavior(), input.boundedness(), - ); + )); Self { input, udf, @@ -66,7 +66,7 @@ impl ExecutionPlan for MapPartitionsExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-physical-plan/src/merge_cardinality_check.rs b/crates/sail-physical-plan/src/merge_cardinality_check.rs index f13c866b8d..085ae13960 100644 --- a/crates/sail-physical-plan/src/merge_cardinality_check.rs +++ b/crates/sail-physical-plan/src/merge_cardinality_check.rs @@ -21,7 +21,7 @@ pub struct MergeCardinalityCheckExec { target_present_col: String, source_present_col: String, schema: SchemaRef, - properties: PlanProperties, + properties: Arc, } impl MergeCardinalityCheckExec { @@ -79,7 +79,7 @@ impl ExecutionPlan for MergeCardinalityCheckExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-physical-plan/src/monotonic_id.rs b/crates/sail-physical-plan/src/monotonic_id.rs index 1995d5a7de..3f683f87e9 100644 --- a/crates/sail-physical-plan/src/monotonic_id.rs +++ b/crates/sail-physical-plan/src/monotonic_id.rs @@ -21,7 +21,7 @@ pub struct MonotonicIdExec { input: Arc, column_name: String, schema: SchemaRef, - properties: PlanProperties, + properties: Arc, } impl MonotonicIdExec { @@ -42,12 +42,12 @@ impl MonotonicIdExec { ); } - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema.clone()), input.output_partitioning().clone(), input.pipeline_behavior(), input.boundedness(), - ); + )); Ok(Self { input, column_name, @@ -80,7 +80,7 @@ impl ExecutionPlan for MonotonicIdExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git 
a/crates/sail-physical-plan/src/range.rs b/crates/sail-physical-plan/src/range.rs index 3636fc394e..e4270395dc 100644 --- a/crates/sail-physical-plan/src/range.rs +++ b/crates/sail-physical-plan/src/range.rs @@ -21,7 +21,7 @@ pub struct RangeExec { original_schema: SchemaRef, projected_schema: SchemaRef, projection: Vec, - properties: PlanProperties, + properties: Arc, } impl RangeExec { @@ -34,12 +34,12 @@ impl RangeExec { projection: Vec, ) -> Result { let projected_schema = Arc::new(schema.project(&projection)?); - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(projected_schema.clone()), Partitioning::RoundRobinBatch(num_partitions), EmissionType::Both, Boundedness::Bounded, - ); + )); Ok(Self { range, num_partitions, @@ -86,7 +86,7 @@ impl ExecutionPlan for RangeExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-physical-plan/src/repartition.rs b/crates/sail-physical-plan/src/repartition.rs index daec8751dd..7a5a988c0f 100644 --- a/crates/sail-physical-plan/src/repartition.rs +++ b/crates/sail-physical-plan/src/repartition.rs @@ -16,7 +16,7 @@ use datafusion_common::{internal_err, plan_err, Result, Statistics}; #[derive(Debug)] pub struct ExplicitRepartitionExec { input: Arc, - properties: PlanProperties, + properties: Arc, } impl ExplicitRepartitionExec { @@ -26,14 +26,16 @@ impl ExplicitRepartitionExec { eq_properties.clear_orderings(); eq_properties.clear_per_partition_constants(); } - let properties = PlanProperties::new( - eq_properties, - partitioning, - input.pipeline_behavior(), - input.boundedness(), - ) - .with_scheduling_type(SchedulingType::Cooperative) - .with_evaluation_type(EvaluationType::Eager); + let properties = Arc::new( + PlanProperties::new( + eq_properties, + partitioning, + input.pipeline_behavior(), + input.boundedness(), + ) + .with_scheduling_type(SchedulingType::Cooperative) + 
.with_evaluation_type(EvaluationType::Eager), + ); Self { input, properties } } @@ -61,7 +63,7 @@ impl ExecutionPlan for ExplicitRepartitionExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } @@ -97,10 +99,6 @@ impl ExecutionPlan for ExplicitRepartitionExec { ) } - fn statistics(&self) -> Result { - self.input.partition_statistics(None) - } - fn partition_statistics(&self, partition: Option) -> Result { if partition.is_none() { self.input.partition_statistics(None) diff --git a/crates/sail-physical-plan/src/schema_pivot.rs b/crates/sail-physical-plan/src/schema_pivot.rs index dd525dd0ec..19facf4bde 100644 --- a/crates/sail-physical-plan/src/schema_pivot.rs +++ b/crates/sail-physical-plan/src/schema_pivot.rs @@ -20,7 +20,7 @@ pub struct SchemaPivotExec { input: Arc, names: Vec, schema: SchemaRef, - properties: PlanProperties, + properties: Arc, } impl SchemaPivotExec { @@ -30,12 +30,12 @@ impl SchemaPivotExec { Partitioning::Hash(_phy_exprs, size) => Partitioning::UnknownPartitioning(*size), Partitioning::UnknownPartitioning(size) => Partitioning::UnknownPartitioning(*size), }; - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema.clone()), partitioning, input.pipeline_behavior(), input.boundedness(), - ); + )); Self { input, names, @@ -80,7 +80,7 @@ impl ExecutionPlan for SchemaPivotExec { Arc::clone(&self.schema) } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-physical-plan/src/show_string.rs b/crates/sail-physical-plan/src/show_string.rs index d72708a432..0f986cb6b1 100644 --- a/crates/sail-physical-plan/src/show_string.rs +++ b/crates/sail-physical-plan/src/show_string.rs @@ -25,7 +25,7 @@ pub struct ShowStringExec { limit: usize, format: ShowStringFormat, schema: SchemaRef, - properties: PlanProperties, + properties: Arc, } impl ShowStringExec { @@ -36,12 +36,12 
@@ impl ShowStringExec { format: ShowStringFormat, schema: SchemaRef, ) -> Self { - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema.clone()), Partitioning::RoundRobinBatch(1), EmissionType::Final, Boundedness::Bounded, - ); + )); Self { input, names, @@ -88,7 +88,7 @@ impl ExecutionPlan for ShowStringExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-physical-plan/src/streaming/collector.rs b/crates/sail-physical-plan/src/streaming/collector.rs index 3dd71701d3..8459501c2f 100644 --- a/crates/sail-physical-plan/src/streaming/collector.rs +++ b/crates/sail-physical-plan/src/streaming/collector.rs @@ -21,7 +21,7 @@ use sail_common_datafusion::streaming::event::FlowEvent; #[derive(Debug)] pub struct StreamCollectorExec { input: Arc, - properties: PlanProperties, + properties: Arc, } impl StreamCollectorExec { @@ -30,13 +30,13 @@ impl StreamCollectorExec { return plan_err!("stream collector requires bounded input"); } let schema = Arc::new(try_from_flow_event_schema(&input.schema())?); - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), Partitioning::UnknownPartitioning(1), // We emit data at the end since we need to handle retractions. 
EmissionType::Final, Boundedness::Bounded, - ); + )); Ok(Self { input, properties }) } @@ -64,7 +64,7 @@ impl ExecutionPlan for StreamCollectorExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-physical-plan/src/streaming/filter.rs b/crates/sail-physical-plan/src/streaming/filter.rs index 19f2178ab8..a983318dcb 100644 --- a/crates/sail-physical-plan/src/streaming/filter.rs +++ b/crates/sail-physical-plan/src/streaming/filter.rs @@ -20,7 +20,7 @@ use futures::StreamExt; pub struct StreamFilterExec { input: Arc, predicate: Arc, - properties: PlanProperties, + properties: Arc, } impl StreamFilterExec { @@ -28,13 +28,13 @@ impl StreamFilterExec { input: Arc, predicate: Arc, ) -> Result { - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(input.schema()), input.output_partitioning().clone(), // Filtering preserves pipeline behavior of input input.pipeline_behavior(), input.boundedness(), - ); + )); Ok(Self { input, predicate, @@ -70,7 +70,7 @@ impl ExecutionPlan for StreamFilterExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } @@ -121,10 +121,6 @@ impl ExecutionPlan for StreamFilterExec { ))) } - fn statistics(&self) -> Result { - self.partition_statistics(None) - } - fn partition_statistics(&self, partition: Option) -> Result { self.input.partition_statistics(partition) } diff --git a/crates/sail-physical-plan/src/streaming/limit.rs b/crates/sail-physical-plan/src/streaming/limit.rs index 7aa6c326ab..a20e1329f4 100644 --- a/crates/sail-physical-plan/src/streaming/limit.rs +++ b/crates/sail-physical-plan/src/streaming/limit.rs @@ -28,7 +28,7 @@ pub struct StreamLimitExec { skip: usize, fetch: Option, data_schema: SchemaRef, - properties: PlanProperties, + properties: Arc, } impl StreamLimitExec { @@ -43,12 +43,12 @@ impl StreamLimitExec { } else { 
input.boundedness() }; - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( input.equivalence_properties().clone(), Partitioning::UnknownPartitioning(1), input.pipeline_behavior(), boundedness, - ); + )); Ok(Self { input, data_schema, @@ -90,7 +90,7 @@ impl ExecutionPlan for StreamLimitExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } @@ -153,10 +153,6 @@ impl ExecutionPlan for StreamLimitExec { Ok(Box::pin(EncodedFlowEventStream::new(stream))) } - fn statistics(&self) -> Result { - self.partition_statistics(None) - } - fn partition_statistics(&self, partition: Option) -> Result { self.input .partition_statistics(partition)? diff --git a/crates/sail-physical-plan/src/streaming/source_adapter.rs b/crates/sail-physical-plan/src/streaming/source_adapter.rs index b5bece3db0..eab121b734 100644 --- a/crates/sail-physical-plan/src/streaming/source_adapter.rs +++ b/crates/sail-physical-plan/src/streaming/source_adapter.rs @@ -20,18 +20,18 @@ use sail_common_datafusion::streaming::event::FlowEvent; #[derive(Debug)] pub struct StreamSourceAdapterExec { input: Arc, - properties: PlanProperties, + properties: Arc, } impl StreamSourceAdapterExec { pub fn new(input: Arc) -> Self { let schema = Arc::new(to_flow_event_schema(&input.schema())); - let properties = PlanProperties::new( + let properties = Arc::new(PlanProperties::new( EquivalenceProperties::new(schema), input.output_partitioning().clone(), input.pipeline_behavior(), input.boundedness(), - ); + )); Self { input, properties } } @@ -59,7 +59,7 @@ impl ExecutionPlan for StreamSourceAdapterExec { self } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { &self.properties } diff --git a/crates/sail-plan-lakehouse/src/lib.rs b/crates/sail-plan-lakehouse/src/lib.rs index 679a946cd4..aac66554ea 100644 --- a/crates/sail-plan-lakehouse/src/lib.rs +++ b/crates/sail-plan-lakehouse/src/lib.rs @@ -8,7 +8,6 @@ use 
datafusion::physical_planner::{ExtensionPlanner, PhysicalPlanner}; use datafusion_common::{internal_err, plan_err, DFSchemaRef, DataFusionError, Result, ToDFSchema}; use datafusion_expr::{LogicalPlan, UserDefinedLogicalNode}; use sail_data_source::resolve_listing_urls; -use sail_delta_lake::datasource::schema::DataFusionMixins; use sail_delta_lake::table::open_table_with_object_store; use sail_logical_plan::file_delete::FileDeleteNode; use sail_logical_plan::file_write::FileWriteNode; diff --git a/crates/sail-plan-lakehouse/src/optimizer.rs b/crates/sail-plan-lakehouse/src/optimizer.rs index 08867c9d5d..94a42a189c 100644 --- a/crates/sail-plan-lakehouse/src/optimizer.rs +++ b/crates/sail-plan-lakehouse/src/optimizer.rs @@ -129,11 +129,15 @@ fn ensure_file_column(plan: LogicalPlan) -> Result { let mut new_config = delta_source.config().clone(); new_config.file_column_name = Some(PATH_COLUMN.to_string()); - let new_source = Arc::new(delta_source.try_with_config(new_config)?); + let new_source = Arc::new(DeltaTableSource::try_new( + Arc::clone(delta_source.snapshot()), + delta_source.log_store().clone(), + new_config, + )?); let schema = new_source.schema(); let file_idx = schema.column_with_name(PATH_COLUMN).map(|(idx, _)| idx); - let mut projection = scan.projection.clone(); + let mut projection: Option> = scan.projection.clone(); if projection.is_none() { projection = Some((0..schema.fields().len()).collect::>()); } diff --git a/crates/sail-plan/Cargo.toml b/crates/sail-plan/Cargo.toml index 3aba1bbfaf..1b08f16fa6 100644 --- a/crates/sail-plan/Cargo.toml +++ b/crates/sail-plan/Cargo.toml @@ -12,6 +12,7 @@ sail-common-datafusion = { path = "../sail-common-datafusion" } sail-python-udf = { path = "../sail-python-udf" } sail-sql-analyzer = { path = "../sail-sql-analyzer" } sail-catalog = { path = "../sail-catalog" } +sail-catalog-memory = { path = "../sail-catalog-memory" } sail-function = { path = "../sail-function" } sail-logical-plan = { path = 
"../sail-logical-plan" } sail-physical-plan = { path = "../sail-physical-plan" } @@ -34,12 +35,12 @@ ryu = { workspace = true } either = { workspace = true } tokio = { workspace = true } rand = { workspace = true } +uuid = { workspace = true } object_store = { workspace = true } half = { workspace = true } arrow = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } -uuid = { workspace = true } indexmap = { workspace = true } log = { workspace = true } regex = { workspace = true } diff --git a/crates/sail-plan/src/catalog.rs b/crates/sail-plan/src/catalog.rs index e58475e641..bc54e200cb 100644 --- a/crates/sail-plan/src/catalog.rs +++ b/crates/sail-plan/src/catalog.rs @@ -1,20 +1,14 @@ use std::fmt::Formatter; -use std::sync::Arc; -use datafusion::catalog::MemTable; use datafusion::common::{DFSchemaRef, Result}; -use datafusion::datasource::provider_as_source; use datafusion::logical_expr::{Expr, LogicalPlan, UserDefinedLogicalNodeCore}; use datafusion::prelude::SessionContext; -use datafusion_common::{exec_datafusion_err, internal_datafusion_err, DFSchema}; -use datafusion_expr::{TableScan, UNNAMED_TABLE}; +use datafusion_common::{internal_datafusion_err, DFSchema}; use educe::Educe; use sail_catalog::command::CatalogCommand; -use sail_catalog::manager::CatalogManager; use sail_catalog::utils::quote_names_if_needed; use sail_common_datafusion::catalog::display::CatalogObjectDisplay; use sail_common_datafusion::catalog::{DatabaseStatus, TableColumnStatus, TableKind, TableStatus}; -use sail_common_datafusion::extension::SessionExtensionAccessor; use sail_common_datafusion::session::plan::PlanFormatter; use sail_common_datafusion::utils::items::ItemTaker; @@ -22,7 +16,7 @@ use crate::formatter::SparkPlanFormatter; #[derive(Clone, Debug, PartialEq, Eq, Hash, Educe)] #[educe(PartialOrd)] -pub(crate) struct CatalogCommandNode { +pub struct CatalogCommandNode { name: String, #[educe(PartialOrd(ignore))] schema: DFSchemaRef, @@ -40,25 +34,9 
@@ impl CatalogCommandNode { command, }) } -} -impl CatalogCommandNode { - pub(crate) async fn execute(&self, ctx: &SessionContext) -> Result { - let manager = ctx.extension::()?; - let batch = self - .command - .clone() - .execute(ctx, manager.as_ref()) - .await - .map_err(|e| exec_datafusion_err!("{e}"))?; - let provider = MemTable::try_new(batch.schema(), vec![vec![batch]])?; - Ok(LogicalPlan::TableScan(TableScan::try_new( - UNNAMED_TABLE, - provider_as_source(Arc::new(provider)), - None, - vec![], - None, - )?)) + pub fn command(&self) -> &CatalogCommand { + &self.command } } diff --git a/crates/sail-plan/src/explain.rs b/crates/sail-plan/src/explain.rs index 3bbb1a97d7..a300a1c5dd 100644 --- a/crates/sail-plan/src/explain.rs +++ b/crates/sail-plan/src/explain.rs @@ -6,14 +6,11 @@ use datafusion::physical_plan::display::DisplayableExecutionPlan; use datafusion::physical_plan::{collect, displayable, ExecutionPlan}; use datafusion::prelude::SessionContext; use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; -use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{DataFusionError, Result}; -use datafusion_expr::{EmptyRelation, Extension, LogicalPlan, UserDefinedLogicalNodeCore}; +use datafusion_expr::LogicalPlan; use sail_common::spec; use sail_common_datafusion::rename::physical_plan::rename_physical_plan; -use sail_logical_plan::precondition::WithPreconditionsNode; -use crate::catalog::CatalogCommandNode; use crate::config::PlanConfig; use crate::error::{PlanError, PlanResult}; use crate::resolver::plan::NamedPlan; @@ -171,15 +168,12 @@ async fn collect_plan_with( let initial_logical = plan.clone(); let mut stringified = vec![initial_logical.to_stringified(PlanType::InitialLogicalPlan)]; - // NOTE: Do NOT call `execute_logical_plan` from EXPLAIN. - // It would execute command nodes and trigger side effects (e.g. CREATE TABLE / INSERT). 
let session_state = ctx.state(); - let logical_plan = strip_explain_side_effect_nodes(plan)?; let config_options = session_state.config_options(); let explain_config = &config_options.explain; let analyzed_logical = session_state.analyzer().execute_and_check( - logical_plan, + plan, config_options.as_ref(), |analyzed_plan, analyzer| { let plan_type = PlanType::AnalyzedLogicalPlan { @@ -319,36 +313,6 @@ async fn collect_plan_with( }) } -/// Remove/neutralize logical nodes that can trigger side effects during EXPLAIN. -/// -/// - `WithPreconditionsNode` is stripped (preconditions are not executed). -/// - `CatalogCommandNode` is replaced with an empty relation with the same schema. -/// -/// This is intentionally EXPLAIN-only: normal execution goes through `execute_logical_plan`, -/// which performs the required side effects. -pub(crate) fn strip_explain_side_effect_nodes(plan: LogicalPlan) -> PlanResult { - Ok(plan - .transform_up(|plan| match &plan { - LogicalPlan::Extension(Extension { node }) => { - if let Some(n) = node.as_any().downcast_ref::() { - Ok(Transformed::yes(n.plan().clone())) - } else if let Some(n) = node.as_any().downcast_ref::() { - Ok(Transformed::yes(LogicalPlan::EmptyRelation( - EmptyRelation { - produce_one_row: false, - schema: n.schema().clone(), - }, - ))) - } else { - Ok(Transformed::no(plan)) - } - } - _ => Ok(Transformed::no(plan)), - }) - .map_err(PlanError::from)? 
- .data) -} - fn render_section(title: &str, body: &str) -> String { format!("== {title} ==\n{body}") } diff --git a/crates/sail-plan/src/formatter.rs b/crates/sail-plan/src/formatter.rs index bdcca62f25..313e66d3a6 100644 --- a/crates/sail-plan/src/formatter.rs +++ b/crates/sail-plan/src/formatter.rs @@ -410,6 +410,9 @@ impl PlanFormatter for SparkPlanFormatter { } None => Ok("NULL".to_string()), }, + ScalarValue::RunEndEncoded(_, _, _) => { + not_impl_err!("RunEndEncoded scalar value is not supported in SQL") + } } } @@ -606,6 +609,7 @@ impl PlanFormatter for SparkPlanFormatter { // SELECT count(`*`) FROM VALUES 1 AS t(`*`) // ``` "count" => { + let name = name.to_lowercase(); let arguments = arguments.join(", "); if is_distinct { Ok(format!("{name}(DISTINCT {arguments})")) diff --git a/crates/sail-plan/src/function/aggregate.rs b/crates/sail-plan/src/function/aggregate.rs index 47f7513fb3..d3b8b6a35f 100644 --- a/crates/sail-plan/src/function/aggregate.rs +++ b/crates/sail-plan/src/function/aggregate.rs @@ -261,14 +261,34 @@ fn count(input: AggFunctionInput) -> PlanResult { function_context: _, } = input; let null_treatment = get_null_treatment(ignore_nulls); + // For COUNT(DISTINCT *), the resolver already expanded the wildcard to column references + // (with hidden-column filtering). For COUNT(*), convert to COUNT(1). let args = transform_count_star_wildcard_expr(arguments); // TODO: remove StructFunction call when count distinct from multiple arguments is implemented // https://github.com/apache/datafusion/blob/58ddf0d4390c770bc571f3ac2727c7de77aa25ab/datafusion/functions-aggregate/src/count.rs#L333 let args = if distinct && (args.len() > 1) { - vec![ScalarUDF::from(StructFunction::new( + // In Spark, COUNT(DISTINCT col1, col2, ...) skips rows where ANY column is NULL. + // Since we wrap multiple columns into a struct for DataFusion, a struct with NULL + // fields is still a non-NULL value and would be counted. 
To match Spark semantics, + // return NULL (instead of a struct with NULL fields) when any argument is NULL. + // Compute any_null first (borrowing args), then move args into .call() to avoid cloning. + let any_null = args + .iter() + .map(|arg| arg.clone().is_null()) + .reduce(|a, b| a.or(b)); + let struct_expr = ScalarUDF::from(StructFunction::new( (0..args.len()).map(|i| format!("col{i}")).collect(), )) - .call(args)] + .call(args); + // `any_null` is always `Some` here since `args.len() > 1` guarantees `reduce` succeeds. + match any_null { + Some(any_null) => vec![expr::Expr::Case(expr::Case { + expr: None, + when_then_expr: vec![(Box::new(any_null), Box::new(lit(ScalarValue::Null)))], + else_expr: Some(Box::new(struct_expr)), + })], + None => vec![struct_expr], + } } else { args }; diff --git a/crates/sail-plan/src/function/scalar/datetime.rs b/crates/sail-plan/src/function/scalar/datetime.rs index a89823f0c6..457c52a751 100644 --- a/crates/sail-plan/src/function/scalar/datetime.rs +++ b/crates/sail-plan/src/function/scalar/datetime.rs @@ -5,6 +5,7 @@ use datafusion::functions::expr_fn; use datafusion_common::ScalarValue; use datafusion_expr::expr::{self, Expr}; use datafusion_expr::{cast, lit, try_cast, when, BinaryExpr, ExprSchemable, Operator, ScalarUDF}; +use datafusion_functions::expr_fn::to_time; use datafusion_spark::function::datetime::make_dt_interval::SparkMakeDtInterval; use datafusion_spark::function::datetime::make_interval::SparkMakeInterval; use sail_common::datetime::time_unit_to_multiplier; @@ -12,9 +13,12 @@ use sail_common_datafusion::utils::items::ItemTaker; use sail_function::scalar::datetime::convert_tz::ConvertTz; use sail_function::scalar::datetime::spark_date_part::SparkDatePart; use sail_function::scalar::datetime::spark_last_day::SparkLastDay; +use sail_function::scalar::datetime::spark_make_time::SparkMakeTime; use sail_function::scalar::datetime::spark_make_timestamp::SparkMakeTimestampNtz; use 
sail_function::scalar::datetime::spark_make_ym_interval::SparkMakeYmInterval; use sail_function::scalar::datetime::spark_next_day::SparkNextDay; +use sail_function::scalar::datetime::spark_time_diff::SparkTimeDiff; +use sail_function::scalar::datetime::spark_time_trunc::SparkTimeTrunc; use sail_function::scalar::datetime::spark_to_chrono_fmt::SparkToChronoFmt; use sail_function::scalar::datetime::spark_try_make_timestamp_ntz::SparkTryMakeTimestampNtz; use sail_function::scalar::datetime::spark_try_to_timestamp::SparkTryToTimestamp; @@ -31,6 +35,10 @@ fn integer_part(expr: Expr, part: &str) -> Expr { ) } +fn years(arg: Expr) -> Expr { + integer_part(arg, "YEAR") +} + fn trunc_part_conversion(part: Expr) -> Expr { Expr::Case(expr::Case { expr: None, @@ -666,6 +674,7 @@ pub(super) fn list_built_in_datetime_functions() -> Vec<(&'static str, ScalarFun ("convert_timezone", F::custom(convert_timezone)), ("curdate", F::nullary(expr_fn::current_date)), ("current_date", F::nullary(expr_fn::current_date)), + ("current_time", F::nullary(expr_fn::current_time)), ( "current_timestamp", F::custom(current_timestamp_microseconds), @@ -710,6 +719,7 @@ pub(super) fn list_built_in_datetime_functions() -> Vec<(&'static str, ScalarFun ("make_date", F::ternary(make_date)), ("make_dt_interval", F::udf(SparkMakeDtInterval::new())), ("make_interval", F::udf(SparkMakeInterval::new())), + ("make_time", F::udf(SparkMakeTime::new())), ("make_timestamp", F::custom(make_timestamp)), ("make_timestamp_ltz", F::custom(make_timestamp_ltz)), ("make_timestamp_ntz", F::custom(make_timestamp_ntz)), @@ -752,6 +762,7 @@ pub(super) fn list_built_in_datetime_functions() -> Vec<(&'static str, ScalarFun }), ), ("to_date", F::custom(to_date)), + ("to_time", F::var_arg(to_time)), ("to_timestamp", F::var_arg(to_timestamp)), // The description for `to_timestamp_ltz` and `to_timestamp_ntz` are the same: // "Parses the timestamp with the format to a timestamp without time zone. Returns null with invalid input." 
@@ -767,6 +778,8 @@ pub(super) fn list_built_in_datetime_functions() -> Vec<(&'static str, ScalarFun ("try_make_timestamp_ltz", F::custom(try_make_timestamp_ltz)), ("try_make_timestamp_ntz", F::custom(try_make_timestamp_ntz)), ("try_to_timestamp", F::custom(try_to_timestamp)), + ("time_diff", F::udf(SparkTimeDiff::new())), + ("time_trunc", F::udf(SparkTimeTrunc::new())), ( "unix_date", F::unary(|arg| cast(cast(arg, DataType::Date32), DataType::Int32)), @@ -792,5 +805,6 @@ pub(super) fn list_built_in_datetime_functions() -> Vec<(&'static str, ScalarFun ("window", F::unknown("window")), ("window_time", F::unknown("window_time")), ("year", F::unary(|arg| integer_part(arg, "YEAR"))), + ("years", F::unary(years)), ] } diff --git a/crates/sail-plan/src/function/scalar/geo.rs b/crates/sail-plan/src/function/scalar/geo.rs new file mode 100644 index 0000000000..bb3aa075ee --- /dev/null +++ b/crates/sail-plan/src/function/scalar/geo.rs @@ -0,0 +1,56 @@ +use sail_common_datafusion::utils::items::ItemTaker; +use sail_function::scalar::geo::st_asbinary::StAsBinary; +use sail_function::scalar::geo::st_geogfromwkb::StGeogFromWKB; +use sail_function::scalar::geo::st_geomfromwkb::StGeomFromWKB; + +use crate::function::common::{ScalarFunction, ScalarFunctionBuilder as F, ScalarFunctionInput}; + +pub(super) fn list_built_in_geo_functions() -> Vec<(&'static str, ScalarFunction)> { + vec![ + ("st_asbinary", F::custom(st_asbinary)), + ("st_geomfromwkb", F::custom(st_geomfromwkb)), + ("st_geogfromwkb", F::custom(st_geogfromwkb)), + ] +} + +fn st_geomfromwkb(input: ScalarFunctionInput) -> crate::error::PlanResult { + use datafusion_expr::{Expr, ScalarUDF}; + + let arg = input.arguments.one()?; + + let func = StGeomFromWKB::new(); + Ok(Expr::ScalarFunction( + datafusion_expr::expr::ScalarFunction { + func: std::sync::Arc::new(ScalarUDF::from(func)), + args: vec![arg], + }, + )) +} + +fn st_geogfromwkb(input: ScalarFunctionInput) -> crate::error::PlanResult { + use datafusion_expr::{Expr, 
ScalarUDF}; + + let arg = input.arguments.one()?; + + let func = StGeogFromWKB::new(); + Ok(Expr::ScalarFunction( + datafusion_expr::expr::ScalarFunction { + func: std::sync::Arc::new(ScalarUDF::from(func)), + args: vec![arg], + }, + )) +} + +fn st_asbinary(input: ScalarFunctionInput) -> crate::error::PlanResult { + use datafusion_expr::{Expr, ScalarUDF}; + + let arg = input.arguments.one()?; + + let func = StAsBinary::new(); + Ok(Expr::ScalarFunction( + datafusion_expr::expr::ScalarFunction { + func: std::sync::Arc::new(ScalarUDF::from(func)), + args: vec![arg], + }, + )) +} diff --git a/crates/sail-plan/src/function/scalar/mod.rs b/crates/sail-plan/src/function/scalar/mod.rs index 21cac3ca8e..6c6afcfad0 100644 --- a/crates/sail-plan/src/function/scalar/mod.rs +++ b/crates/sail-plan/src/function/scalar/mod.rs @@ -7,6 +7,7 @@ mod conditional; mod conversion; mod csv; mod datetime; +mod geo; mod hash; mod json; mod lambda; @@ -29,6 +30,7 @@ pub(super) fn list_built_in_scalar_functions() -> Vec<(&'static str, ScalarFunct output.extend(conversion::list_built_in_conversion_functions()); output.extend(csv::list_built_in_csv_functions()); output.extend(datetime::list_built_in_datetime_functions()); + output.extend(geo::list_built_in_geo_functions()); output.extend(hash::list_built_in_hash_functions()); output.extend(json::list_built_in_json_functions()); output.extend(lambda::list_built_in_lambda_functions()); diff --git a/crates/sail-plan/src/lib.rs b/crates/sail-plan/src/lib.rs index ae721a8088..c7e4f51a9a 100644 --- a/crates/sail-plan/src/lib.rs +++ b/crates/sail-plan/src/lib.rs @@ -1,17 +1,14 @@ use std::sync::Arc; -use async_recursion::async_recursion; use datafusion::dataframe::DataFrame; use datafusion::physical_plan::{displayable, ExecutionPlan}; use datafusion::prelude::SessionContext; use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; use datafusion_common::Result; -use datafusion_expr::{Extension, LogicalPlan}; +use 
datafusion_expr::LogicalPlan; use sail_common::spec; use sail_common_datafusion::rename::physical_plan::rename_physical_plan; -use sail_logical_plan::precondition::WithPreconditionsNode; -use crate::catalog::CatalogCommandNode; use crate::config::PlanConfig; use crate::error::PlanResult; use crate::resolver::plan::NamedPlan; @@ -28,25 +25,8 @@ pub mod resolver; mod streaming; /// Executes a logical plan. -/// This replaces DDL statements and catalog operations with the execution results. -/// Logical plan nodes with corresponding physical plan nodes remain unchanged. -#[async_recursion] +/// Catalog commands and barrier nodes are handled by the physical planner. pub async fn execute_logical_plan(ctx: &SessionContext, plan: LogicalPlan) -> Result { - let plan = match plan { - LogicalPlan::Extension(Extension { node }) => { - if let Some(n) = node.as_any().downcast_ref::() { - n.execute(ctx).await? - } else if let Some(n) = node.as_any().downcast_ref::() { - for plan in n.preconditions() { - let _ = execute_logical_plan(ctx, plan.as_ref().clone()).await?; - } - n.plan().clone() - } else { - LogicalPlan::Extension(Extension { node }) - } - } - x => x, - }; let df = ctx.execute_logical_plan(plan).await?; Ok(df) } diff --git a/crates/sail-plan/src/resolver/command/catalog/table.rs b/crates/sail-plan/src/resolver/command/catalog/table.rs index 7cf8c41965..052e23a2c9 100644 --- a/crates/sail-plan/src/resolver/command/catalog/table.rs +++ b/crates/sail-plan/src/resolver/command/catalog/table.rs @@ -1,13 +1,14 @@ use datafusion_expr::LogicalPlan; use sail_catalog::command::CatalogCommand; -use sail_catalog::provider::{CatalogPartitionField, CreateTableColumnOptions, CreateTableOptions}; +use sail_catalog::manager::CatalogManager; +use sail_catalog::provider::{CreateTableColumnOptions, CreateTableOptions}; use sail_common::spec; use sail_common_datafusion::catalog::{ CatalogTableBucketBy, CatalogTableConstraint, CatalogTableSort, }; +use 
sail_common_datafusion::extension::SessionExtensionAccessor; use sail_common_datafusion::rename::logical_plan::rename_logical_plan; use sail_common_datafusion::utils::items::ItemTaker; -use uuid::Uuid; use crate::error::{PlanError, PlanResult}; use crate::resolver::state::PlanResolverState; @@ -48,16 +49,10 @@ impl PlanResolver<'_> { let location = if let Some(location) = location { location } else { - self.resolve_default_table_location(&table)? + self.resolve_default_table_location(&table).await? }; let format = self.resolve_catalog_table_format(file_format)?; - let partition_by = partition_by - .into_iter() - .map(|x| CatalogPartitionField { - column: x.into(), - transform: None, - }) - .collect(); + let partition_by = self.resolve_write_partition_by_expressions(partition_by)?; let sort_by = self.resolve_catalog_table_sort(sort_by)?; let bucket_by = self.resolve_catalog_table_bucket_by(bucket_by)?; @@ -88,7 +83,7 @@ impl PlanResolver<'_> { query: spec::QueryPlan, state: &mut PlanResolverState, ) -> PlanResult { - use super::super::write::{WriteMode, WritePlanBuilder, WriteTableAction, WriteTarget}; + use super::super::write::{WriteColumnMatch, WriteMode, WritePlanBuilder, WriteTarget}; let spec::TableDefinition { columns, comment, @@ -120,12 +115,6 @@ impl PlanResolver<'_> { "REPLACE in CREATE TABLE AS SELECT statement", )); } - if !properties.is_empty() { - return Err(PlanError::todo( - "PROPERTIES in CREATE TABLE AS SELECT statement", - )); - } - if !sort_by.is_empty() { return Err(PlanError::todo( "SORT_BY in CREATE TABLE AS SELECT statement", @@ -160,63 +149,85 @@ impl PlanResolver<'_> { let column_names = PlanResolver::get_field_names(input.schema(), state)?; let input = rename_logical_plan(input, &column_names)?; let format = self.resolve_catalog_table_format(file_format)?; - // Handle location: add to options if specified let mut write_options = options; if let Some(location) = location { - write_options.push(("location".to_string(), location)); + 
write_options.push(("path".to_string(), location)); } - // Set write mode and action based on if_not_exists + // Set write mode based on if_not_exists let write_mode = if if_not_exists { WriteMode::IgnoreIfExists } else { WriteMode::ErrorIfExists }; - let action = if if_not_exists { - WriteTableAction::CreateIfNotExists - } else { - WriteTableAction::Create - }; - let partition_by = partition_by - .into_iter() - .map(|c| CatalogPartitionField { - column: c.into(), - transform: None, - }) - .collect(); + let partition_by = self.resolve_write_partition_by_expressions(partition_by)?; let builder = WritePlanBuilder::new() - .with_target(WriteTarget::NewTable { table, action }) + .with_target(WriteTarget::Table { + table, + column_match: WriteColumnMatch::ByName, + }) .with_mode(write_mode) .with_format(format) .with_partition_by(partition_by) + .with_table_properties(properties) .with_options(write_options); self.resolve_write_with_builder(input, builder, state).await } - pub(in super::super) fn resolve_default_table_location( + pub(in super::super) async fn resolve_default_table_location( &self, table: &spec::ObjectName, ) -> PlanResult { - let name: String = table - .parts() - .last() - .ok_or_else(|| PlanError::invalid("missing table name"))? - .clone() - .into(); - let name = name - .replace(|c: char| !c.is_alphanumeric() && c != '-', "-") - .to_lowercase(); + let [qualifier @ .., last] = table.parts() else { + return Err(PlanError::invalid("missing table name")); + }; + let name: String = last.clone().into(); + // For characters in the table name that are not alphanumeric, `-`, or `_`, + // replace with a fixed-width hex encoding of the Unicode code point: + // `u+XXXX` for U+0000..U+FFFF and `U+XXXXXXXX` for U+10000..U+10FFFF. 
+ let name: String = name + .chars() + .map(|c| { + if c.is_alphanumeric() || c == '-' || c == '_' { + c.to_string() + } else { + let v = c as u32; + if v <= 0xFFFF { + format!("u+{v:04X}") + } else { + format!("U+{v:08X}") + } + } + }) + .collect(); // We use our own logic to map tables to locations. This avoids conflicts // and avoids issues with special characters in table names. // Note that this is different from how Spark handles table locations // for the default catalog. + let catalog_manager = self.ctx.extension::()?; + let location = catalog_manager + .get_database_by_qualifier(qualifier) + .await? + .location; + let (base, suffix) = match &location { + Some(loc) => ( + loc.trim_end_matches(object_store::path::DELIMITER), + String::new(), + ), + None => ( + self.config + .default_warehouse_directory + .trim_end_matches(object_store::path::DELIMITER), + format!("-{}", uuid::Uuid::new_v4()), + ), + }; Ok(format!( - "{}{}{}-{}", - self.config.default_warehouse_directory, + "{}{}{}{}", + base, object_store::path::DELIMITER, name, - Uuid::new_v4() + suffix, )) } diff --git a/crates/sail-plan/src/resolver/command/catalog/view.rs b/crates/sail-plan/src/resolver/command/catalog/view.rs index 0e5fd6a656..bccd07ab69 100644 --- a/crates/sail-plan/src/resolver/command/catalog/view.rs +++ b/crates/sail-plan/src/resolver/command/catalog/view.rs @@ -4,11 +4,14 @@ use arrow::datatypes::DataType; use datafusion_common::TableReference; use datafusion_expr::{LogicalPlan, SubqueryAlias}; use sail_catalog::command::CatalogCommand; +use sail_catalog::manager::tracker::CatalogLogicalPlanId; +use sail_catalog::manager::CatalogManager; use sail_catalog::provider::{ CreateTemporaryViewColumnOptions, CreateTemporaryViewOptions, CreateViewColumnOptions, CreateViewOptions, DropTemporaryViewOptions, DropViewOptions, }; use sail_common::spec; +use sail_common_datafusion::extension::SessionExtensionAccessor; use sail_common_datafusion::rename::logical_plan::rename_logical_plan; use 
crate::error::PlanResult; @@ -96,11 +99,13 @@ impl PlanResolver<'_> { None => (Self::get_field_names(input.schema(), state)?, vec![]), }; let input = rename_logical_plan(input, &fields)?; + let manager = self.ctx.extension::()?; + let input: CatalogLogicalPlanId = manager.track_logical_plan(Arc::new(input))?; let command = CatalogCommand::CreateTemporaryView { view: view.into(), is_global, options: CreateTemporaryViewOptions { - input: Arc::new(input), + input, columns, if_not_exists, replace, diff --git a/crates/sail-plan/src/resolver/command/function.rs b/crates/sail-plan/src/resolver/command/function.rs index 439e624788..506901545b 100644 --- a/crates/sail-plan/src/resolver/command/function.rs +++ b/crates/sail-plan/src/resolver/command/function.rs @@ -1,6 +1,8 @@ -use datafusion_expr::{LogicalPlan, ScalarUDF}; +use datafusion_expr::LogicalPlan; use sail_catalog::command::CatalogCommand; +use sail_catalog::manager::CatalogManager; use sail_common::spec; +use sail_common_datafusion::extension::SessionExtensionAccessor; use sail_python_udf::udf::pyspark_unresolved_udf::PySparkUnresolvedUDF; use crate::error::PlanResult; @@ -36,9 +38,9 @@ impl PlanResolver<'_> { deterministic, ); - let command = CatalogCommand::RegisterFunction { - udf: ScalarUDF::from(udf), - }; + let manager = self.ctx.extension::()?; + let udf = manager.track_function(datafusion_expr::ScalarUDF::from(udf))?; + let command = CatalogCommand::RegisterFunction { udf }; self.resolve_catalog_command(command) } @@ -69,9 +71,9 @@ impl PlanResolver<'_> { ); // PySpark UDTF is registered as a scalar UDF since it will be used as a stream UDF // in the `MapPartitions` plan. 
- let command = CatalogCommand::RegisterFunction { - udf: ScalarUDF::from(udtf), - }; + let manager = self.ctx.extension::()?; + let udf = manager.track_function(datafusion_expr::ScalarUDF::from(udtf))?; + let command = CatalogCommand::RegisterFunction { udf }; self.resolve_catalog_command(command) } } diff --git a/crates/sail-plan/src/resolver/command/insert.rs b/crates/sail-plan/src/resolver/command/insert.rs index 7e43e047fe..c3eefc06fb 100644 --- a/crates/sail-plan/src/resolver/command/insert.rs +++ b/crates/sail-plan/src/resolver/command/insert.rs @@ -42,10 +42,13 @@ impl PlanResolver<'_> { } }; let builder = WritePlanBuilder::new() - .with_mode(WriteMode::Overwrite) - .with_target(WriteTarget::Path { location }) + .with_mode(WriteMode::Replace { + error_if_absent: false, + }) + .with_target(WriteTarget::DataSource) .with_format(format) - .with_options(options); + .with_options(options) + .with_options(vec![("path".to_string(), location)]); let input = self.resolve_write_input(input, state).await?; self.resolve_write_with_builder(input, builder, state).await } @@ -69,51 +72,54 @@ impl PlanResolver<'_> { let input = self.resolve_write_input(input, state).await?; - let mut builder = WritePlanBuilder::new().with_partition(partition); + if !partition.is_empty() { + return Err(PlanError::todo("PARTITION for write")); + } + let mut builder = WritePlanBuilder::new(); match mode { InsertMode::InsertByPosition { overwrite } => { let mode = if overwrite { - WriteMode::Overwrite + WriteMode::Truncate } else { - WriteMode::Append + WriteMode::Append { + error_if_absent: true, + } }; - builder = builder - .with_mode(mode) - .with_target(WriteTarget::ExistingTable { - table, - column_match: WriteColumnMatch::ByPosition, - }); + builder = builder.with_mode(mode).with_target(WriteTarget::Table { + table, + column_match: WriteColumnMatch::ByPosition, + }); } InsertMode::InsertByName { overwrite } => { let mode = if overwrite { - WriteMode::Overwrite + WriteMode::Truncate } 
else { - WriteMode::Append + WriteMode::Append { + error_if_absent: true, + } }; - builder = builder - .with_mode(mode) - .with_target(WriteTarget::ExistingTable { - table, - column_match: WriteColumnMatch::ByName, - }); + builder = builder.with_mode(mode).with_target(WriteTarget::Table { + table, + column_match: WriteColumnMatch::ByName, + }); } InsertMode::InsertByColumns { columns, overwrite } => { let mode = if overwrite { - WriteMode::Overwrite + WriteMode::Truncate } else { - WriteMode::Append + WriteMode::Append { + error_if_absent: true, + } }; - builder = builder - .with_mode(mode) - .with_target(WriteTarget::ExistingTable { - table, - column_match: WriteColumnMatch::ByColumns { columns }, - }); + builder = builder.with_mode(mode).with_target(WriteTarget::Table { + table, + column_match: WriteColumnMatch::ByColumns { columns }, + }); } InsertMode::Replace { condition } => { builder = builder - .with_mode(WriteMode::OverwriteIf { condition }) - .with_target(WriteTarget::ExistingTable { + .with_mode(WriteMode::TruncateIf { condition }) + .with_target(WriteTarget::Table { table, column_match: WriteColumnMatch::ByPosition, }); diff --git a/crates/sail-plan/src/resolver/command/merge.rs b/crates/sail-plan/src/resolver/command/merge.rs index e85884d23a..c2915b8900 100644 --- a/crates/sail-plan/src/resolver/command/merge.rs +++ b/crates/sail-plan/src/resolver/command/merge.rs @@ -163,7 +163,7 @@ impl PlanResolver<'_> { options: vec![], }; let plan = spec::QueryPlan::new(spec::QueryNode::Read { - read_type: spec::ReadType::NamedTable(read), + read_type: spec::ReadType::NamedTable(Box::new(read)), is_streaming: false, }); self.resolve_query_plan(plan, state).await @@ -478,7 +478,7 @@ impl PlanResolver<'_> { table_name: table.clone().into(), format, location, - partition_by, + partition_by: partition_by.into_iter().map(|field| field.column).collect(), options: vec![options], }) } @@ -883,6 +883,14 @@ fn merge_disambiguate_unqualified_plan_ids( }, 
Expr::UnresolvedDate { .. } => expr, Expr::UnresolvedTimestamp { .. } => expr, + Expr::IdentifierClause { expr: inner } => Expr::IdentifierClause { + expr: Box::new(merge_disambiguate_unqualified_plan_ids( + *inner, + state, + target_schema, + source_schema, + )), + }, Expr::Subquery { plan_id, subquery_type, diff --git a/crates/sail-plan/src/resolver/command/mod.rs b/crates/sail-plan/src/resolver/command/mod.rs index d65316a476..2c158d1518 100644 --- a/crates/sail-plan/src/resolver/command/mod.rs +++ b/crates/sail-plan/src/resolver/command/mod.rs @@ -24,6 +24,7 @@ mod write_v1; mod write_v2; impl PlanResolver<'_> { + /// Resolves a command plan into a logical plan. pub(super) async fn resolve_command_plan( &self, plan: spec::CommandPlan, diff --git a/crates/sail-plan/src/resolver/command/write.rs b/crates/sail-plan/src/resolver/command/write.rs index 69381017c5..f329083f71 100644 --- a/crates/sail-plan/src/resolver/command/write.rs +++ b/crates/sail-plan/src/resolver/command/write.rs @@ -7,48 +7,86 @@ use datafusion_expr::{col, Expr, ExprSchemable, Extension, LogicalPlan, LogicalP use sail_catalog::command::CatalogCommand; use sail_catalog::error::CatalogError; use sail_catalog::manager::CatalogManager; -use sail_catalog::provider::{CatalogPartitionField, CreateTableColumnOptions, CreateTableOptions}; +use sail_catalog::provider::{ + CatalogPartitionField, CreateTableColumnOptions, CreateTableOptions, PartitionTransform, +}; use sail_common::spec; use sail_common_datafusion::catalog::{ CatalogTableBucketBy, CatalogTableSort, TableColumnStatus, TableKind, }; -use sail_common_datafusion::datasource::{BucketBy, SinkMode}; +use sail_common_datafusion::datasource::{ + find_option, BucketBy, SinkMode, SourceInfo, TableFormatRegistry, +}; use sail_common_datafusion::extension::SessionExtensionAccessor; use sail_common_datafusion::logical_expr::ExprWithSource; use sail_common_datafusion::rename::logical_plan::rename_logical_plan; use 
sail_common_datafusion::rename::schema::rename_schema; use sail_common_datafusion::utils::items::ItemTaker; +use sail_logical_plan::barrier::BarrierNode; use sail_logical_plan::file_write::{FileWriteNode, FileWriteOptions}; -use sail_logical_plan::precondition::WithPreconditionsNode; use crate::error::{PlanError, PlanResult}; use crate::resolver::state::PlanResolverState; use crate::resolver::PlanResolver; +/// The write modes for all targets. +/// +/// The modes are classified based on the action to take when the target exists or +/// does not exist. More modes can be added if additional actions are required. +/// If the target does not exist, the action is usually either "creating the target +/// and writing the data" or "returning an error". If the target exists, the actions +/// are more diverse. +/// +/// We avoid using terms such as "overwrite" since it has different semantics +/// in different APIs, so we introduce more specific terms such as "replace" +/// and "truncate" instead. pub(super) enum WriteMode { + /// If the target exists, return an error. + /// If the target does not exist, create the target and write the data. ErrorIfExists, + /// If the target exists, skip the write operation. + /// If the target does not exist, create the target and write the data. IgnoreIfExists, - Append, - Overwrite, - OverwriteIf { + /// If the target exists, add the data to the target. + /// If the target does not exist, return an error if `error_if_absent` is true, + /// or create the target and write the data if `error_if_absent` is false. + /// + /// The data must have compatible schema if the target exists. + Append { error_if_absent: bool }, + /// If the target exists, remove and recreate the target and write the data. + /// If the target does not exist, return an error if `error_if_absent` is true, + /// or create the target and write the data if `error_if_absent` is false. + /// + /// The data can have incompatible schema even if the target exists. 
+ Replace { error_if_absent: bool }, + /// If the target exists, remove all data and write the data. + /// If the target does not exist, return an error. + /// + /// This is different from [`Self::Replace`] since the existing target is not removed and + /// recreated. + /// The data must have compatible schema if the target exists. + Truncate, + /// If the target exists, remove all data matching the condition and add the data. + /// If the target does not exist, return an error. + /// + /// The data must have compatible schema if the target exists. + TruncateIf { condition: Box, }, - OverwritePartitions, + /// If the target exists, remove all data from partitions that overlap with the data to write, + /// and then add the data. + /// If the target does not exist, return an error. + /// + /// The data must have compatible schema if the target exists. + TruncatePartitions, } pub(super) enum WriteTarget { - Path { - location: String, - }, - Sink, - ExistingTable { + DataSource, + Table { table: spec::ObjectName, column_match: WriteColumnMatch, }, - NewTable { - table: spec::ObjectName, - action: WriteTableAction, - }, } #[expect(clippy::enum_variant_names)] @@ -58,24 +96,16 @@ pub(super) enum WriteColumnMatch { ByColumns { columns: Vec }, } -pub(super) enum WriteTableAction { - Create, - CreateIfNotExists, - CreateOrReplace, - Replace, -} - /// A unified logical plan builder for all write or insert operations. 
pub(super) struct WritePlanBuilder { target: Option, mode: Option, format: Option, - partition: Vec<(spec::Identifier, Option)>, partition_by: Vec, bucket_by: Option, sort_by: Vec, cluster_by: Vec, - options: Vec<(String, String)>, + options: Vec>, table_properties: Vec<(String, String)>, } @@ -85,7 +115,6 @@ impl WritePlanBuilder { target: None, mode: None, format: None, - partition: vec![], partition_by: vec![], bucket_by: None, sort_by: vec![], @@ -110,14 +139,6 @@ impl WritePlanBuilder { self } - pub fn with_partition( - mut self, - partition: Vec<(spec::Identifier, Option)>, - ) -> Self { - self.partition = partition; - self - } - pub fn with_partition_by(mut self, partition_by: Vec) -> Self { self.partition_by = partition_by; self @@ -139,7 +160,7 @@ impl WritePlanBuilder { } pub fn with_options(mut self, options: Vec<(String, String)>) -> Self { - self.options = options; + self.options.push(options); self } @@ -160,7 +181,6 @@ impl PlanResolver<'_> { mode, target, format, - partition, partition_by, bucket_by, sort_by, @@ -175,189 +195,212 @@ impl PlanResolver<'_> { let Some(target) = target else { return Err(PlanError::internal("target is required for write builder")); }; - if !partition.is_empty() { - return Err(PlanError::todo("PARTITION for write")); - } if !cluster_by.is_empty() { return Err(PlanError::todo("CLUSTER BY for write")); } let input_schema = input.schema().inner().clone(); - let options_map = options - .clone() - .into_iter() - .collect::>(); let mut file_write_options = FileWriteOptions { - path: String::new(), // The mode will be set later so the value here is just a placeholder. 
mode: SinkMode::ErrorIfExists, format: format.unwrap_or_default(), - partition_by: self.resolve_write_partition_by(partition_by.clone())?, + partition_by: partition_by.clone(), sort_by: self .resolve_sort_orders(sort_by.clone(), true, input.schema(), state) .await?, bucket_by: self.resolve_write_bucket_by(bucket_by.clone())?, - options: vec![options], + table_properties: vec![], + options, }; let mut preconditions = vec![]; match target { - WriteTarget::Path { location } => { + WriteTarget::DataSource => { if !table_properties.is_empty() { return Err(PlanError::invalid( - "table properties are not supported for writing to a path", + "table properties are not supported for writing to a data source", )); } if file_write_options.format.is_empty() { file_write_options.format = self.config.default_table_file_format.clone(); } - file_write_options.path = location; let schema_for_cond = - matches!(mode, WriteMode::OverwriteIf { .. }).then_some(input_schema.as_ref()); + matches!(mode, WriteMode::TruncateIf { .. }).then_some(input_schema.as_ref()); file_write_options.mode = self .resolve_write_mode(mode, schema_for_cond, state) .await?; } - WriteTarget::Sink => { - if !table_properties.is_empty() { - return Err(PlanError::invalid( - "table properties are not supported for writing to a sink", - )); - } - if file_write_options.format.is_empty() { - file_write_options.format = self.config.default_table_file_format.clone(); - } - let schema_for_cond = - matches!(mode, WriteMode::OverwriteIf { .. 
}).then_some(input_schema.as_ref()); - file_write_options.mode = self - .resolve_write_mode(mode, schema_for_cond, state) - .await?; - } - WriteTarget::ExistingTable { + WriteTarget::Table { table, column_match, } => { - if !table_properties.is_empty() { - return Err(PlanError::invalid( - "cannot specify table properties when writing to an existing table", - )); + let info = self.resolve_table_info(&table).await?; + + // Return early if the target exists and the mode says to skip + if matches!(mode, WriteMode::IgnoreIfExists) && info.is_some() { + return Ok(LogicalPlanBuilder::empty(false).build()?); } - let Some(info) = self.resolve_table_info(&table).await? else { + + // Error if the mode requires an existing target but it does not exist + let requires_existing = matches!( + mode, + WriteMode::Append { + error_if_absent: true + } | WriteMode::Replace { + error_if_absent: true + } | WriteMode::Truncate + | WriteMode::TruncateIf { .. } + | WriteMode::TruncatePartitions + ); + if requires_existing && info.is_none() { return Err(PlanError::invalid(format!( "table does not exist: {table:?}" ))); - }; - if matches!(mode, WriteMode::IgnoreIfExists) { - return Ok(LogicalPlanBuilder::empty(false).build()?); - } - info.validate_file_write_options(&file_write_options)?; - input = Self::rewrite_write_input(input, column_match, &info)?; - file_write_options.mode = self - .resolve_write_mode(mode, Some(&info.schema()), state) - .await?; - file_write_options.partition_by = info.partition_by; - file_write_options.sort_by = info.sort_by.into_iter().map(|x| x.into()).collect(); - file_write_options.bucket_by = info.bucket_by.map(|x| x.into()); - file_write_options.path = info.location.ok_or_else(|| { - PlanError::invalid(format!("table does not have a location: {table:?}")) - })?; - file_write_options.format = info.format; - file_write_options.options.insert(0, info.options); - if !info.properties.is_empty() { - file_write_options.options.insert(0, info.properties); - } - } - 
WriteTarget::NewTable { table, action } => { - let info = self.resolve_table_info(&table).await?; - if matches!(mode, WriteMode::IgnoreIfExists) && info.is_some() { - return Ok(LogicalPlanBuilder::empty(false).build()?); } - if matches!(action, WriteTableAction::CreateIfNotExists) { - if let Some(ref info) = info { - info.validate_file_write_options(&file_write_options)?; - input = Self::rewrite_write_input(input, WriteColumnMatch::ByName, info)?; + + // Compute the schema for conditional truncation before potentially consuming info + let schema_for_cond = if matches!(mode, WriteMode::TruncateIf { .. }) { + info.as_ref().map(|i| i.schema()) + } else { + None + }; + + // Use the existing table metadata when the table exists and the mode is + // "append or truncate" (as opposed to "replace or create"). + let use_existing = matches!( + mode, + WriteMode::Append { .. } + | WriteMode::Truncate + | WriteMode::TruncateIf { .. } + | WriteMode::TruncatePartitions + ); + if let Some(info) = info.as_ref().filter(|_| use_existing) { + if !table_properties.is_empty() { + return Err(PlanError::invalid( + "cannot specify table properties when writing to an existing table", + )); } - } - file_write_options.mode = self.resolve_write_mode(mode, None, state).await?; - if file_write_options.format.is_empty() { - if let Some(format) = info.as_ref().map(|x| &x.format) { - file_write_options.format = format.clone(); - } else { - file_write_options.format = self.config.default_table_file_format.clone(); + info.validate_file_write_options(&file_write_options)?; + input = Self::rewrite_write_input(input, column_match, info)?; + if file_write_options.partition_by.is_empty() + || !info.format.eq_ignore_ascii_case("iceberg") + { + file_write_options.partition_by = info.partition_by.clone(); } - } - if let Some(location) = info.as_ref().and_then(|x| x.location.as_ref()) { - file_write_options.path = location.clone(); - } else if let Some(location) = options_map.get("location") { - 
file_write_options.path = location.to_string(); - } else if let Some(path) = options_map.get("path") { - file_write_options.path = path.to_string(); - } else { - file_write_options.path = self.resolve_default_table_location(&table)?; - } - if !table_properties.is_empty() { + file_write_options.sort_by = + info.sort_by.iter().cloned().map(|x| x.into()).collect(); + file_write_options.bucket_by = info.bucket_by.clone().map(|x| x.into()); + let location = info.location.clone().ok_or_else(|| { + PlanError::invalid(format!("table does not have a location: {table:?}")) + })?; file_write_options .options - .insert(0, table_properties.clone()); - } - let (if_not_exists, replace) = match action { - WriteTableAction::Create => (false, false), - WriteTableAction::CreateIfNotExists => (true, false), - WriteTableAction::CreateOrReplace => (false, true), - WriteTableAction::Replace => { - if info.is_none() { - return Err(PlanError::invalid(format!( - "table does not exist: {table:?}" - ))); + .push(vec![("path".to_string(), location)]); + file_write_options.format = info.format.clone(); + file_write_options.options.insert(0, info.options.clone()); + file_write_options.table_properties = info.properties.clone(); + } else { + // Create or replace the table + file_write_options.table_properties = table_properties.clone(); + if file_write_options.format.is_empty() { + if let Some(format) = info.as_ref().map(|x| &x.format) { + file_write_options.format = format.clone(); + } else { + file_write_options.format = + self.config.default_table_file_format.clone(); } - (false, true) } - }; - let columns = input - .schema() - .inner() - .fields() - .iter() - .map(|f| CreateTableColumnOptions { - name: f.name().clone(), - data_type: f.data_type().clone(), - nullable: f.is_nullable(), - comment: None, - default: None, - generated_always_as: None, - }) - .collect(); - let sort_by = self.resolve_catalog_table_sort(sort_by)?; - let bucket_by = self.resolve_catalog_table_bucket_by(bucket_by)?; - 
let command = CatalogCommand::CreateTable { - table: table.into(), - options: CreateTableOptions { - columns, - comment: None, - constraints: vec![], - location: Some(file_write_options.path.clone()), - format: file_write_options.format.clone(), - partition_by, - sort_by, - bucket_by, - if_not_exists, - replace, - options: file_write_options + if let Some(location) = info.as_ref().and_then(|x| x.location.as_ref()) { + file_write_options .options - .last() - .cloned() - .into_iter() - .flatten() - .collect(), - properties: table_properties, - }, - }; - preconditions.push(Arc::new(self.resolve_catalog_command(command)?)); + .push(vec![("path".to_string(), location.clone())]); + } else { + let default_location = self.resolve_default_table_location(&table).await?; + file_write_options + .options + .insert(0, vec![("path".to_string(), default_location)]); + }; + if file_write_options + .partition_by + .iter() + .any(|field| field.transform.is_some()) + && !file_write_options.format.eq_ignore_ascii_case("iceberg") + { + return Err(PlanError::unsupported( + "partition transforms are only supported for Iceberg tables", + )); + } + let all_options: Vec> = + file_write_options + .options + .iter() + .map(|set| set.iter().cloned().collect()) + .collect(); + let table_location = find_option(&all_options, "path") + .or_else(|| find_option(&all_options, "location")) + .unwrap_or_default(); + let (if_not_exists, replace) = if matches!(mode, WriteMode::Append { .. }) { + (true, false) + } else if matches!(mode, WriteMode::Replace { .. }) { + (false, true) + } else { + // ErrorIfExists or IgnoreIfExists + (false, false) + }; + let columns = input + .schema() + .inner() + .fields() + .iter() + .map(|f| CreateTableColumnOptions { + name: f.name().clone(), + data_type: f.data_type().clone(), + nullable: f.is_nullable(), + comment: None, + default: None, + generated_always_as: None, + }) + .collect(); + // TODO: Revisit passing write options to CreateTableOptions. 
+ let create_table_options: Vec<(String, String)> = file_write_options + .options + .iter() + .flatten() + .filter(|(k, _)| { + !k.eq_ignore_ascii_case("path") && !k.eq_ignore_ascii_case("location") + }) + .cloned() + .collect(); + let sort_by = self.resolve_catalog_table_sort(sort_by)?; + let bucket_by = self.resolve_catalog_table_bucket_by(bucket_by)?; + let command = CatalogCommand::CreateTable { + table: table.into(), + options: CreateTableOptions { + columns, + comment: None, + constraints: vec![], + location: Some(table_location), + format: file_write_options.format.clone(), + partition_by, + sort_by, + bucket_by, + if_not_exists, + replace, + options: create_table_options, + properties: table_properties, + }, + }; + preconditions.push(Arc::new(self.resolve_catalog_command(command)?)); + } + + file_write_options.mode = self + .resolve_write_mode(mode, schema_for_cond.as_ref(), state) + .await?; } }; let plan = LogicalPlan::Extension(Extension { node: Arc::new(FileWriteNode::new(Arc::new(input), file_write_options)), }); Ok(LogicalPlan::Extension(Extension { - node: Arc::new(WithPreconditionsNode::new(preconditions, Arc::new(plan))), + node: Arc::new(BarrierNode::new(preconditions, Arc::new(plan))), })) } @@ -389,9 +432,9 @@ impl PlanResolver<'_> { match mode { WriteMode::ErrorIfExists => Ok(SinkMode::ErrorIfExists), WriteMode::IgnoreIfExists => Ok(SinkMode::IgnoreIfExists), - WriteMode::Append => Ok(SinkMode::Append), - WriteMode::Overwrite => Ok(SinkMode::Overwrite), - WriteMode::OverwriteIf { condition } => { + WriteMode::Append { .. } => Ok(SinkMode::Append), + WriteMode::Replace { .. 
} | WriteMode::Truncate => Ok(SinkMode::Overwrite), + WriteMode::TruncateIf { condition } => { let Some(schema) = schema else { return Err(PlanError::internal( "conditional overwrite is not allowed without a table schema", @@ -408,7 +451,7 @@ impl PlanResolver<'_> { condition: Box::new(ExprWithSource::new(expr, condition.source)), }) } - WriteMode::OverwritePartitions => Ok(SinkMode::OverwritePartitions), + WriteMode::TruncatePartitions => Ok(SinkMode::OverwritePartitions), } } @@ -425,7 +468,7 @@ impl PlanResolver<'_> { }; match status.kind { TableKind::Table { - columns, + mut columns, comment: _, constraints: _, location, @@ -435,16 +478,67 @@ impl PlanResolver<'_> { bucket_by, options, properties, - } => Ok(Some(TableInfo { - columns, - location, - format, - partition_by, - sort_by, - bucket_by, - options, - properties, - })), + } => { + // When a table is created without column definitions + // (e.g. `CREATE TABLE t USING fmt`), the catalog stores an empty column list. + // Discover the schema from the table format so that write operations + // (INSERT INTO) can validate the input schema correctly. 
+ if columns.is_empty() { + let registry = self.ctx.extension::().map_err(|e| { + PlanError::invalid(format!( + "failed to access table format registry for table `{table:?}`: {e}", + )) + })?; + let table_format = registry.get(&format).map_err(|e| { + PlanError::invalid(format!( + "failed to resolve table format `{format}` for table `{table:?}`: {e}", + )) + })?; + let info = SourceInfo { + paths: location.iter().cloned().collect(), + schema: None, + constraints: Default::default(), + partition_by: vec![], + bucket_by: None, + sort_order: vec![], + options: vec![options.iter().cloned().collect()], + }; + let provider = table_format + .create_provider(&self.ctx.state(), info) + .await + .map_err(|e| { + PlanError::invalid(format!( + "failed to infer schema for table `{table:?}` from format `{format}`: {e}", + )) + })?; + columns = provider + .schema() + .fields() + .iter() + .map(|f| TableColumnStatus { + name: f.name().clone(), + data_type: f.data_type().clone(), + nullable: f.is_nullable(), + comment: None, + default: None, + generated_always_as: None, + is_partition: false, + is_bucket: false, + is_cluster: false, + }) + .collect(); + } + Ok(Some(TableInfo { + columns, + location, + format, + partition_by, + sort_by, + bucket_by, + options, + properties, + })) + } _ => Ok(None), } } @@ -527,11 +621,30 @@ impl PlanResolver<'_> { Ok(plan) } - fn resolve_write_partition_by( + pub(super) fn resolve_write_partition_by_expressions( &self, - partition_by: Vec, - ) -> PlanResult> { - Ok(partition_by.into_iter().map(|x| x.column).collect()) + partition_by: Vec, + ) -> PlanResult> { + partition_by + .into_iter() + .map(|x| match x { + spec::Expr::UnresolvedAttribute { + name, + plan_id: None, + is_metadata_column: false, + } => { + let name: Vec = name.into(); + Ok(CatalogPartitionField { + column: name.one()?, + transform: None, + }) + } + spec::Expr::UnresolvedFunction(f) => resolve_partition_transform_function(f), + _ => Err(PlanError::invalid( + "partitioning column 
must be a column reference or transform function", + )), + }) + .collect() } fn resolve_write_bucket_by( @@ -551,11 +664,133 @@ impl PlanResolver<'_> { } } +fn resolve_partition_transform_function( + func: spec::UnresolvedFunction, +) -> PlanResult { + let function_name: Vec = func.function_name.into(); + let function_name = function_name.one()?; + let function_name_lower = function_name.to_lowercase(); + + match function_name_lower.as_str() { + "years" | "months" | "days" | "hours" => { + let transform = match function_name_lower.as_str() { + "years" => PartitionTransform::Year, + "months" => PartitionTransform::Month, + "days" => PartitionTransform::Day, + "hours" => PartitionTransform::Hour, + _ => unreachable!(), + }; + let column = extract_partition_column_from_args(&func.arguments, 0)?; + Ok(CatalogPartitionField { + column, + transform: Some(transform), + }) + } + "bucket" => { + let num_buckets = extract_partition_int_arg(&func.arguments, 0, "bucket count")?; + let column = extract_partition_column_from_args(&func.arguments, 1)?; + Ok(CatalogPartitionField { + column, + transform: Some(PartitionTransform::Bucket(num_buckets)), + }) + } + "truncate" => { + let (column, width) = extract_partition_truncate_args(&func.arguments)?; + Ok(CatalogPartitionField { + column, + transform: Some(PartitionTransform::Truncate(width)), + }) + } + _ => Err(PlanError::invalid(format!( + "unsupported partition transform function: {function_name}" + ))), + } +} + +fn extract_partition_truncate_args(args: &[spec::Expr]) -> PlanResult<(String, u32)> { + if let (Ok(column), Ok(width)) = ( + extract_partition_column_from_args(args, 0), + extract_partition_int_arg(args, 1, "truncate width"), + ) { + return Ok((column, width)); + } + if let (Ok(width), Ok(column)) = ( + extract_partition_int_arg(args, 0, "truncate width"), + extract_partition_column_from_args(args, 1), + ) { + return Ok((column, width)); + } + Err(PlanError::invalid( + "truncate() expects a column reference and an 
integer literal width", + )) +} + +fn extract_partition_column_from_args(args: &[spec::Expr], index: usize) -> PlanResult { + let arg = args.get(index).ok_or_else(|| { + PlanError::invalid(format!( + "partition transform function requires argument at index {index}" + )) + })?; + match arg { + spec::Expr::UnresolvedAttribute { + name, + plan_id: None, + is_metadata_column: false, + } => { + let name: Vec = name.clone().into(); + Ok(name.one()?) + } + _ => Err(PlanError::invalid( + "partition transform function argument must be a column reference", + )), + } +} + +fn extract_partition_int_arg( + args: &[spec::Expr], + index: usize, + description: &str, +) -> PlanResult { + let arg = args.get(index).ok_or_else(|| { + PlanError::invalid(format!( + "partition transform function requires {description} at index {index}" + )) + })?; + match arg { + spec::Expr::Literal(lit) => match lit { + spec::Literal::Int8 { value: Some(v) } => u32::try_from(*v).map_err(|_| { + PlanError::invalid(format!("{description} must be a positive integer")) + }), + spec::Literal::Int16 { value: Some(v) } => u32::try_from(*v).map_err(|_| { + PlanError::invalid(format!("{description} must be a positive integer")) + }), + spec::Literal::Int32 { value: Some(v) } => u32::try_from(*v).map_err(|_| { + PlanError::invalid(format!("{description} must be a positive integer")) + }), + spec::Literal::Int64 { value: Some(v) } => u32::try_from(*v).map_err(|_| { + PlanError::invalid(format!("{description} must be a positive integer")) + }), + spec::Literal::UInt8 { value: Some(v) } => Ok(u32::from(*v)), + spec::Literal::UInt16 { value: Some(v) } => Ok(u32::from(*v)), + spec::Literal::UInt32 { value: Some(v) } => Ok(*v), + spec::Literal::UInt64 { value: Some(v) } => u32::try_from(*v).map_err(|_| { + PlanError::invalid(format!("{description} must fit in a 32-bit integer")) + }), + _ => Err(PlanError::invalid(format!( + "{description} must be an integer literal" + ))), + }, + _ => Err(PlanError::invalid(format!( + 
"{description} must be an integer literal" + ))), + } +} + struct TableInfo { columns: Vec, location: Option, format: String, - partition_by: Vec, + partition_by: Vec, sort_by: Vec, bucket_by: Option, options: Vec<(String, String)>, @@ -573,7 +808,9 @@ impl TableInfo { } fn validate_file_write_options(&self, options: &FileWriteOptions) -> PlanResult<()> { - if !self.is_empty_or_equivalent_partitioning(&options.partition_by) { + if !self.format.eq_ignore_ascii_case("iceberg") + && !self.is_empty_or_equivalent_partitioning(&options.partition_by) + { return Err(PlanError::invalid( "cannot specify a different partitioning when writing to an existing table", )); @@ -592,13 +829,27 @@ impl TableInfo { Ok(()) } - fn is_empty_or_equivalent_partitioning(&self, partition_by: &[String]) -> bool { + fn is_empty_or_equivalent_partitioning(&self, partition_by: &[CatalogPartitionField]) -> bool { partition_by.is_empty() || (partition_by.len() == self.partition_by.len() && partition_by .iter() .zip(self.partition_by.iter()) - .all(|(a, b)| a.eq_ignore_ascii_case(b))) + .all(|(a, b)| Self::partition_fields_match(a, b))) + } + + fn partition_fields_match(a: &CatalogPartitionField, b: &CatalogPartitionField) -> bool { + fn normalize_transform( + transform: Option, + ) -> Option { + match transform { + None | Some(PartitionTransform::Identity) => None, + Some(transform) => Some(transform), + } + } + + a.column.eq_ignore_ascii_case(&b.column) + && normalize_transform(a.transform) == normalize_transform(b.transform) } fn is_empty_or_equivalent_bucketing( diff --git a/crates/sail-plan/src/resolver/command/write_stream.rs b/crates/sail-plan/src/resolver/command/write_stream.rs index 190055041c..afb4189769 100644 --- a/crates/sail-plan/src/resolver/command/write_stream.rs +++ b/crates/sail-plan/src/resolver/command/write_stream.rs @@ -3,7 +3,7 @@ use sail_catalog::provider::CatalogPartitionField; use sail_common::spec; use crate::error::{PlanError, PlanResult}; -use 
crate::resolver::command::write::{WriteMode, WritePlanBuilder, WriteTableAction, WriteTarget}; +use crate::resolver::command::write::{WriteColumnMatch, WriteMode, WritePlanBuilder, WriteTarget}; use crate::resolver::state::PlanResolverState; use crate::resolver::PlanResolver; @@ -49,18 +49,22 @@ impl PlanResolver<'_> { .with_cluster_by(clustering_columns) .with_format(format) .with_options(options) - .with_mode(WriteMode::Append); + .with_mode(WriteMode::Append { + error_if_absent: false, + }); match sink_destination { None => { - builder = builder.with_target(WriteTarget::Sink); + builder = builder.with_target(WriteTarget::DataSource); } Some(WriteStreamSinkDestination::Path { path }) => { - builder = builder.with_target(WriteTarget::Path { location: path }); + builder = builder + .with_target(WriteTarget::DataSource) + .with_options(vec![("path".to_string(), path)]); } Some(WriteStreamSinkDestination::Table { table }) => { - builder = builder.with_target(WriteTarget::NewTable { + builder = builder.with_target(WriteTarget::Table { table, - action: WriteTableAction::CreateIfNotExists, + column_match: WriteColumnMatch::ByName, }) } } diff --git a/crates/sail-plan/src/resolver/command/write_v1.rs b/crates/sail-plan/src/resolver/command/write_v1.rs index 1f471cc6a8..3298784e5d 100644 --- a/crates/sail-plan/src/resolver/command/write_v1.rs +++ b/crates/sail-plan/src/resolver/command/write_v1.rs @@ -1,11 +1,8 @@ use datafusion_expr::LogicalPlan; -use sail_catalog::provider::CatalogPartitionField; use sail_common::spec; use crate::error::{PlanError, PlanResult}; -use crate::resolver::command::write::{ - WriteColumnMatch, WriteMode, WritePlanBuilder, WriteTableAction, WriteTarget, -}; +use crate::resolver::command::write::{WriteColumnMatch, WriteMode, WritePlanBuilder, WriteTarget}; use crate::resolver::state::PlanResolverState; use crate::resolver::PlanResolver; @@ -41,13 +38,7 @@ impl PlanResolver<'_> { let input = self.resolve_write_input(*input, state).await?; let 
clustering_columns = self.resolve_write_cluster_by_columns(clustering_columns)?; - let partition_by = partitioning_columns - .into_iter() - .map(|c| CatalogPartitionField { - column: c.into(), - transform: None, - }) - .collect(); + let partition_by = self.resolve_write_partition_by_expressions(partitioning_columns)?; let mut builder = WritePlanBuilder::new() .with_partition_by(partition_by) .with_bucket_by(bucket_by) @@ -61,7 +52,9 @@ impl PlanResolver<'_> { let write_mode = match mode { Some(SaveMode::ErrorIfExists) | None => WriteMode::ErrorIfExists, Some(SaveMode::IgnoreIfExists) => WriteMode::IgnoreIfExists, - Some(SaveMode::Append) => WriteMode::Append, + Some(SaveMode::Append) => WriteMode::Append { + error_if_absent: false, + }, Some(SaveMode::Overwrite) => match replace_where { Some(ref replace_where) => { let ast_expr = @@ -79,14 +72,16 @@ impl PlanResolver<'_> { "invalid replaceWhere expression: {replace_where} ({e})" )) })?; - WriteMode::OverwriteIf { + WriteMode::TruncateIf { condition: Box::new(spec::ExprWithSource { expr: spec_expr, source: Some(replace_where.clone()), }), } } - None => WriteMode::Overwrite, + None => WriteMode::Replace { + error_if_absent: false, + }, }, }; Ok(write_mode) @@ -96,12 +91,16 @@ impl PlanResolver<'_> { SaveType::Path(location) => { let mode = to_write_mode(mode)?; builder = builder - .with_target(WriteTarget::Path { location }) - .with_mode(mode); + .with_target(WriteTarget::DataSource) + .with_mode(mode) + .with_options(vec![("path".to_string(), location)]); } SaveType::Sink => { let mode = to_write_mode(mode)?; - builder = builder.with_target(WriteTarget::Sink).with_mode(mode); + // Any "path" option supplied by the user (e.g. via + // df.write.format(...).option("path", path).save()) remains in + // options and is picked up by WriteTarget::DataSource. 
+ builder = builder.with_target(WriteTarget::DataSource).with_mode(mode); } SaveType::Table { table, @@ -109,35 +108,39 @@ impl PlanResolver<'_> { } => match mode { Some(SaveMode::ErrorIfExists) | None => { builder = builder - .with_target(WriteTarget::NewTable { + .with_target(WriteTarget::Table { table, - action: WriteTableAction::Create, + column_match: WriteColumnMatch::ByName, }) .with_mode(WriteMode::ErrorIfExists); } Some(SaveMode::IgnoreIfExists) => { builder = builder - .with_target(WriteTarget::NewTable { + .with_target(WriteTarget::Table { table, - action: WriteTableAction::Create, + column_match: WriteColumnMatch::ByName, }) .with_mode(WriteMode::IgnoreIfExists); } Some(SaveMode::Append) => { builder = builder - .with_target(WriteTarget::ExistingTable { + .with_target(WriteTarget::Table { table, column_match: WriteColumnMatch::ByName, }) - .with_mode(WriteMode::Append); + .with_mode(WriteMode::Append { + error_if_absent: false, + }); } Some(SaveMode::Overwrite) => { builder = builder - .with_target(WriteTarget::NewTable { + .with_target(WriteTarget::Table { table, - action: WriteTableAction::CreateOrReplace, + column_match: WriteColumnMatch::ByName, }) - .with_mode(WriteMode::Overwrite); + .with_mode(WriteMode::Replace { + error_if_absent: false, + }); } }, SaveType::Table { @@ -145,11 +148,13 @@ impl PlanResolver<'_> { save_method: TableSaveMethod::InsertInto, } => { let mode = match mode { - Some(SaveMode::Overwrite) => WriteMode::Overwrite, - _ => WriteMode::Append, + Some(SaveMode::Overwrite) => WriteMode::Truncate, + _ => WriteMode::Append { + error_if_absent: true, + }, }; builder = builder - .with_target(WriteTarget::ExistingTable { + .with_target(WriteTarget::Table { table, column_match: WriteColumnMatch::ByPosition, }) diff --git a/crates/sail-plan/src/resolver/command/write_v2.rs b/crates/sail-plan/src/resolver/command/write_v2.rs index 92aabccd4f..11f67f8816 100644 --- a/crates/sail-plan/src/resolver/command/write_v2.rs +++ 
b/crates/sail-plan/src/resolver/command/write_v2.rs @@ -1,12 +1,8 @@ use datafusion_expr::LogicalPlan; -use sail_catalog::provider::{CatalogPartitionField, PartitionTransform}; use sail_common::spec; -use sail_common_datafusion::utils::items::ItemTaker; -use crate::error::{PlanError, PlanResult}; -use crate::resolver::command::write::{ - WriteColumnMatch, WriteMode, WritePlanBuilder, WriteTableAction, WriteTarget, -}; +use crate::error::PlanResult; +use crate::resolver::command::write::{WriteColumnMatch, WriteMode, WritePlanBuilder, WriteTarget}; use crate::resolver::state::PlanResolverState; use crate::resolver::PlanResolver; @@ -47,35 +43,39 @@ impl PlanResolver<'_> { match mode { WriteToMode::Append => { builder = builder - .with_target(WriteTarget::ExistingTable { + .with_target(WriteTarget::Table { table, column_match: WriteColumnMatch::ByName, }) - .with_mode(WriteMode::Append); + .with_mode(WriteMode::Append { + error_if_absent: true, + }); } WriteToMode::Create => { builder = builder - .with_target(WriteTarget::NewTable { + .with_target(WriteTarget::Table { table, - action: WriteTableAction::Create, + column_match: WriteColumnMatch::ByName, }) - .with_mode(WriteMode::Overwrite); + .with_mode(WriteMode::ErrorIfExists); } WriteToMode::CreateOrReplace => { builder = builder - .with_target(WriteTarget::NewTable { + .with_target(WriteTarget::Table { table, - action: WriteTableAction::CreateOrReplace, + column_match: WriteColumnMatch::ByName, }) - .with_mode(WriteMode::Overwrite); + .with_mode(WriteMode::Replace { + error_if_absent: false, + }); } WriteToMode::Overwrite { condition } => { builder = builder - .with_target(WriteTarget::ExistingTable { + .with_target(WriteTarget::Table { table, column_match: WriteColumnMatch::ByName, }) - .with_mode(WriteMode::OverwriteIf { + .with_mode(WriteMode::TruncateIf { condition: Box::new(spec::ExprWithSource { expr: *condition, source: None, @@ -84,159 +84,23 @@ impl PlanResolver<'_> { } WriteToMode::OverwritePartitions => 
{ builder = builder - .with_target(WriteTarget::ExistingTable { + .with_target(WriteTarget::Table { table, column_match: WriteColumnMatch::ByName, }) - .with_mode(WriteMode::OverwritePartitions); + .with_mode(WriteMode::TruncatePartitions); } WriteToMode::Replace => { builder = builder - .with_target(WriteTarget::NewTable { + .with_target(WriteTarget::Table { table, - action: WriteTableAction::Replace, + column_match: WriteColumnMatch::ByName, }) - .with_mode(WriteMode::Overwrite); + .with_mode(WriteMode::Replace { + error_if_absent: true, + }); } }; self.resolve_write_with_builder(input, builder, state).await } - - fn resolve_write_partition_by_expressions( - &self, - partition_by: Vec, - ) -> PlanResult> { - partition_by - .into_iter() - .map(|x| match x { - spec::Expr::UnresolvedAttribute { - name, - plan_id: None, - is_metadata_column: false, - } => { - let name: Vec = name.into(); - Ok(CatalogPartitionField { - column: name.one()?, - transform: None, - }) - } - spec::Expr::UnresolvedFunction(f) => self.resolve_partition_transform_function(f), - _ => Err(PlanError::invalid( - "partitioning column must be a column reference or transform function", - )), - }) - .collect() - } - - fn resolve_partition_transform_function( - &self, - func: spec::UnresolvedFunction, - ) -> PlanResult { - let function_name: Vec = func.function_name.into(); - let function_name = function_name.one()?; - let function_name_lower = function_name.to_lowercase(); - - match function_name_lower.as_str() { - "years" | "months" | "days" | "hours" => { - let transform = match function_name_lower.as_str() { - "years" => PartitionTransform::Year, - "months" => PartitionTransform::Month, - "days" => PartitionTransform::Day, - "hours" => PartitionTransform::Hour, - _ => unreachable!(), - }; - let column = self.extract_partition_column_from_args(&func.arguments, 0)?; - Ok(CatalogPartitionField { - column, - transform: Some(transform), - }) - } - "bucket" => { - let num_buckets = - 
self.extract_partition_int_arg(&func.arguments, 0, "bucket count")?; - let column = self.extract_partition_column_from_args(&func.arguments, 1)?; - Ok(CatalogPartitionField { - column, - transform: Some(PartitionTransform::Bucket(num_buckets)), - }) - } - "truncate" => { - let width = self.extract_partition_int_arg(&func.arguments, 0, "truncate width")?; - let column = self.extract_partition_column_from_args(&func.arguments, 1)?; - Ok(CatalogPartitionField { - column, - transform: Some(PartitionTransform::Truncate(width)), - }) - } - _ => Err(PlanError::invalid(format!( - "unsupported partition transform function: {function_name}" - ))), - } - } - - fn extract_partition_column_from_args( - &self, - args: &[spec::Expr], - index: usize, - ) -> PlanResult { - let arg = args.get(index).ok_or_else(|| { - PlanError::invalid(format!( - "partition transform function requires argument at index {index}" - )) - })?; - match arg { - spec::Expr::UnresolvedAttribute { - name, - plan_id: None, - is_metadata_column: false, - } => { - let name: Vec = name.clone().into(); - Ok(name.one()?) 
- } - _ => Err(PlanError::invalid( - "partition transform function argument must be a column reference", - )), - } - } - - fn extract_partition_int_arg( - &self, - args: &[spec::Expr], - index: usize, - description: &str, - ) -> PlanResult { - let arg = args.get(index).ok_or_else(|| { - PlanError::invalid(format!( - "partition transform function requires {description} at index {index}" - )) - })?; - match arg { - spec::Expr::Literal(lit) => match lit { - spec::Literal::Int8 { value: Some(v) } => u32::try_from(*v).map_err(|_| { - PlanError::invalid(format!("{description} must be a positive integer")) - }), - spec::Literal::Int16 { value: Some(v) } => u32::try_from(*v).map_err(|_| { - PlanError::invalid(format!("{description} must be a positive integer")) - }), - spec::Literal::Int32 { value: Some(v) } => u32::try_from(*v).map_err(|_| { - PlanError::invalid(format!("{description} must be a positive integer")) - }), - spec::Literal::Int64 { value: Some(v) } => u32::try_from(*v).map_err(|_| { - PlanError::invalid(format!("{description} must be a positive integer")) - }), - spec::Literal::UInt8 { value: Some(v) } => Ok(u32::from(*v)), - spec::Literal::UInt16 { value: Some(v) } => Ok(u32::from(*v)), - spec::Literal::UInt32 { value: Some(v) } => Ok(*v), - spec::Literal::UInt64 { value: Some(v) } => u32::try_from(*v).map_err(|_| { - PlanError::invalid(format!("{description} must fit in a 32-bit integer")) - }), - _ => Err(PlanError::invalid(format!( - "{description} must be an integer literal" - ))), - }, - _ => Err(PlanError::invalid(format!( - "{description} must be an integer literal" - ))), - } - } } diff --git a/crates/sail-plan/src/resolver/data_type.rs b/crates/sail-plan/src/resolver/data_type.rs index 49855c4a1a..21bb8648b3 100644 --- a/crates/sail-plan/src/resolver/data_type.rs +++ b/crates/sail-plan/src/resolver/data_type.rs @@ -129,14 +129,20 @@ impl PlanResolver<'_> { interval_unit, start_field: _, end_field: _, - } => { - // TODO: Currently `start_field` and 
`end_field` is lost in translation. - // This does not impact computation accuracy, - // This may affect the display string in the `data_type_to_simple_string` function. - Ok(adt::DataType::Interval(Self::resolve_interval_unit( - interval_unit, - ))) - } + } => match interval_unit { + spec::IntervalUnit::YearMonth => { + Ok(adt::DataType::Interval(adt::IntervalUnit::YearMonth)) + } + // Spark's DayTimeInterval has microsecond precision. + // Arrow's IntervalUnit::DayTime has millisecond precision. + // Use Duration to preserve microsecond precision. + spec::IntervalUnit::DayTime => { + Ok(adt::DataType::Duration(adt::TimeUnit::Microsecond)) + } + spec::IntervalUnit::MonthDayNano => { + Ok(adt::DataType::Interval(adt::IntervalUnit::MonthDayNano)) + } + }, DataType::Binary => Ok(adt::DataType::Binary), DataType::FixedSizeBinary { size } => Ok(adt::DataType::FixedSizeBinary(*size)), DataType::LargeBinary => Ok(adt::DataType::LargeBinary), @@ -380,14 +386,6 @@ impl PlanResolver<'_> { } } - pub fn resolve_interval_unit(interval_unit: &spec::IntervalUnit) -> adt::IntervalUnit { - match interval_unit { - spec::IntervalUnit::YearMonth => adt::IntervalUnit::YearMonth, - spec::IntervalUnit::DayTime => adt::IntervalUnit::DayTime, - spec::IntervalUnit::MonthDayNano => adt::IntervalUnit::MonthDayNano, - } - } - pub fn resolve_union_mode(union_mode: &spec::UnionMode) -> adt::UnionMode { match union_mode { spec::UnionMode::Sparse => adt::UnionMode::Sparse, diff --git a/crates/sail-plan/src/resolver/expression/cast.rs b/crates/sail-plan/src/resolver/expression/cast.rs index e697f0b32b..1953d776dc 100644 --- a/crates/sail-plan/src/resolver/expression/cast.rs +++ b/crates/sail-plan/src/resolver/expression/cast.rs @@ -31,6 +31,19 @@ impl PlanResolver<'_> { schema: &DFSchemaRef, state: &mut PlanResolverState, ) -> PlanResult { + // Extract the DayTimeInterval field unit before resolving to Arrow type, + // since it determines the multiplier for numeric-to-interval casts. 
+ // Spark uses the end field (or start field for single-field intervals) + // to interpret the numeric value: e.g. DayTimeIntervalType(DAY, DAY) treats + // the value as days, while DayTimeIntervalType(DAY, SECOND) treats it as seconds. + let day_time_interval_field = match &cast_to_type { + spec::DataType::Interval { + interval_unit: spec::IntervalUnit::DayTime, + start_field, + end_field, + } => end_field.or(*start_field), + _ => None, + }; let cast_to_type = self.resolve_data_type(&cast_to_type, state)?; let NamedExpr { expr, name, .. } = self.resolve_named_expression(expr, schema, state).await?; @@ -70,10 +83,11 @@ impl PlanResolver<'_> { (from, DataType::Timestamp(time_unit, _) | DataType::Duration(time_unit), _) if from.is_numeric() => { - cast( - expr.mul(lit(time_unit_to_multiplier(&time_unit))), - cast_to_type, - ) + let multiplier = match (day_time_interval_field, &cast_to_type) { + (Some(field), DataType::Duration(_)) => day_time_field_to_microseconds(field), + _ => time_unit_to_multiplier(&time_unit), + }; + cast(expr.mul(lit(multiplier)), cast_to_type) } (DataType::Timestamp(time_unit, _) | DataType::Duration(time_unit), to, _) if to.is_numeric() => @@ -137,6 +151,16 @@ impl PlanResolver<'_> { } } +fn day_time_field_to_microseconds(field: spec::IntervalFieldType) -> i64 { + match field { + spec::IntervalFieldType::Day => 86_400_000_000, + spec::IntervalFieldType::Hour => 3_600_000_000, + spec::IntervalFieldType::Minute => 60_000_000, + // Second, or Year/Month (shouldn't appear for DayTime intervals) + _ => 1_000_000, + } +} + fn need_rename_cast(expr: &expr::Expr) -> bool { match expr { expr::Expr::Alias(_) | expr::Expr::Column(_) | expr::Expr::OuterReferenceColumn(..) 
=> { diff --git a/crates/sail-plan/src/resolver/expression/function.rs b/crates/sail-plan/src/resolver/expression/function.rs index ff8abfa368..668597b3b4 100644 --- a/crates/sail-plan/src/resolver/expression/function.rs +++ b/crates/sail-plan/src/resolver/expression/function.rs @@ -1,8 +1,8 @@ use datafusion_common::DFSchemaRef; use datafusion_expr::expr::ScalarFunction; -use datafusion_expr::registry::FunctionRegistry; use datafusion_expr::utils::{expand_qualified_wildcard, expand_wildcard}; use datafusion_expr::{expr, EmptyRelation, Expr, LogicalPlan}; +use sail_catalog::manager::CatalogManager; use sail_common::spec; use sail_common_datafusion::extension::SessionExtensionAccessor; use sail_common_datafusion::session::plan::PlanService; @@ -46,7 +46,8 @@ impl PlanResolver<'_> { return Err(PlanError::todo("named function arguments")); } let canonical_function_name = function_name.to_ascii_lowercase(); - if let Ok(udf) = self.ctx.udf(&canonical_function_name) { + let catalog_manager = self.ctx.extension::()?; + if let Some(udf) = catalog_manager.get_function(&canonical_function_name)? { if udf.inner().as_any().is::() { state.config_mut().arrow_allow_large_var_types = true; } @@ -67,7 +68,7 @@ impl PlanResolver<'_> { // FIXME: `is_user_defined_function` is always false, // so we need to check UDFs before built-in functions. - let func = if let Ok(udf) = self.ctx.udf(&canonical_function_name) { + let func = if let Some(udf) = catalog_manager.get_function(&canonical_function_name)? { if ignore_nulls.is_some() || filter.is_some() || order_by.is_some() { return Err(PlanError::invalid("invalid scalar function clause")); } @@ -90,7 +91,7 @@ impl PlanResolver<'_> { )? 
} else { expr::Expr::ScalarFunction(ScalarFunction { - func: udf, + func: std::sync::Arc::new(udf), args: arguments, }) } @@ -117,6 +118,32 @@ impl PlanResolver<'_> { Some(x) => self.resolve_sort_orders(x, true, schema, state).await?, None => vec![], }; + // For DISTINCT aggregate functions with a wildcard argument (e.g., COUNT(DISTINCT *)), + // expand the wildcard to visible column references here in the resolver where we have + // access to `state` for hidden-column filtering. This ensures hidden columns (e.g., + // join keys) are excluded from the distinct count. + #[expect(deprecated)] + let arguments = if is_distinct + && matches!( + arguments.as_slice(), + [expr::Expr::Wildcard { + qualifier: None, + options: _ + }] + ) { + schema + .columns() + .into_iter() + .filter(|c| { + state + .get_field_info(&c.name) + .is_ok_and(|info| !info.is_hidden()) + }) + .map(expr::Expr::Column) + .collect() + } else { + arguments + }; let input = AggFunctionInput { arguments, distinct: is_distinct, @@ -137,13 +164,48 @@ impl PlanResolver<'_> { ))); }; + // When `COUNT(DISTINCT *)` is used, expand the wildcard display names + // to individual column names so the output header matches Spark JVM behavior + // (e.g., `count(DISTINCT a, b, c)` instead of `count(DISTINCT *)`). 
+ let argument_display_names = + if is_distinct && argument_display_names.iter().any(|n| n == "*") { + schema + .columns() + .iter() + .filter_map(|c| { + let info = state.get_field_info(&c.name).ok()?; + if info.is_hidden() { + None + } else { + Some(info.name().to_string()) + } + }) + .collect::>() + } else { + argument_display_names + }; let service = self.ctx.extension::()?; let name = service.plan_formatter().function_to_string( &function_name, argument_display_names.iter().map(|x| x.as_str()).collect(), is_distinct, )?; - Ok(NamedExpr::new(vec![name], func)) + + // Extract metadata from UDF if it implements return_field_from_args + let metadata = if let expr::Expr::ScalarFunction(ScalarFunction { + func: udf, args, .. + }) = &func + { + extract_metadata_from_udf(udf, args)? + } else { + vec![] + }; + + if !metadata.is_empty() { + Ok(NamedExpr::new(vec![name], func).with_metadata(metadata)) + } else { + Ok(NamedExpr::new(vec![name], func)) + } } pub(super) async fn resolve_expression_call_function( @@ -277,3 +339,58 @@ impl PlanResolver<'_> { arguments } } + +/// Extract metadata from a UDF by calling return_field_from_args with real argument information +fn extract_metadata_from_udf( + udf: &std::sync::Arc, + args: &[expr::Expr], +) -> PlanResult> { + use std::sync::Arc; + + use datafusion::arrow::datatypes::{DataType, Field}; + use datafusion_common::ScalarValue; + use datafusion_expr::{ExprSchemable, ReturnFieldArgs}; + + // Extract real field types from the resolved expressions + let empty_schema = datafusion_common::DFSchema::empty(); + let arg_fields: Vec> = args + .iter() + .enumerate() + .map(|(i, arg)| { + let data_type = arg.get_type(&empty_schema).unwrap_or(DataType::Null); + Arc::new(Field::new(format!("arg_{}", i), data_type, true)) + }) + .collect(); + + // Extract literal scalar values from expressions + let mut scalar_values = Vec::new(); + + for arg in args { + if let expr::Expr::Literal(scalar_value, _) = arg { + 
scalar_values.push(scalar_value.clone()); + } else { + scalar_values.push(ScalarValue::Null); + } + } + + let scalar_refs: Vec> = scalar_values + .iter() + .map(|v| if v.is_null() { None } else { Some(v) }) + .collect(); + + let return_field_args = ReturnFieldArgs { + arg_fields: &arg_fields, + scalar_arguments: &scalar_refs, + }; + + // Try to extract metadata, but don't fail if it doesn't work + if let Ok(field) = udf.return_field_from_args(return_field_args) { + Ok(field + .metadata() + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect()) + } else { + Ok(vec![]) + } +} diff --git a/crates/sail-plan/src/resolver/expression/misc.rs b/crates/sail-plan/src/resolver/expression/misc.rs index 2eead77c7a..0dacae60a9 100644 --- a/crates/sail-plan/src/resolver/expression/misc.rs +++ b/crates/sail-plan/src/resolver/expression/misc.rs @@ -11,6 +11,7 @@ use datafusion_functions::core::expr_ext::FieldAccessor; use datafusion_functions_nested::expr_fn::{array_element, map_extract}; use sail_common::spec; use sail_common_datafusion::extension::SessionExtensionAccessor; +use sail_common_datafusion::literal::LiteralEvaluator; use sail_common_datafusion::session::plan::PlanService; use sail_common_datafusion::utils::items::ItemTaker; use sail_function::scalar::drop_struct_field::DropStructField; @@ -56,6 +57,81 @@ impl PlanResolver<'_> { Ok(NamedExpr::new(vec![name], expr)) } + pub(super) async fn resolve_expression_identifier_clause( + &self, + expr: spec::Expr, + schema: &DFSchemaRef, + state: &mut PlanResolverState, + ) -> PlanResult { + let resolved = self.resolve_expression(expr, schema, state).await?; + let name = self.evaluate_identifier_expr(resolved, state)?; + let object_name = sail_sql_analyzer::expression::from_ast_object_name( + sail_sql_analyzer::parser::parse_object_name(&name)?, + )?; + self.resolve_expression_attribute(object_name, None, false, schema, state) + } + + /// Evaluates a resolved DataFusion expression as an identifier string. 
+ /// + /// Named parameter placeholders (e.g. `:col`) are substituted from the + /// current parameter scope in `state` before constant-folding, which + /// allows expressions like `IDENTIFIER(:col)` or + /// `IDENTIFIER(:tab || '.' || :col)` to work inside parameterized SQL. + pub(in super::super) fn evaluate_identifier_expr( + &self, + expr: expr::Expr, + state: &PlanResolverState, + ) -> PlanResult { + use datafusion_common::tree_node::{Transformed, TreeNode}; + let expr = expr + .transform(|e| { + if let expr::Expr::Placeholder(expr::Placeholder { id, .. }) = &e { + if id.is_empty() { + return Ok(Transformed::no(e)); + } + // Strip the leading prefix character (e.g. ':' or '$') from the + // placeholder id to get the param key, mirroring DataFusion's own + // `get_placeholders_with_values` which does `id[1..]`. + let key = &id[1..]; + // Try named parameter. + if let Some(scalar) = state.get_param_value(key) { + return Ok(Transformed::yes(expr::Expr::Literal(scalar.clone(), None))); + } + // Try positional parameter (key is a 1-based integer index). + if let Ok(index) = key.parse::() { + if index > 0 { + if let Some(scalar) = state.get_positional_param_value(index - 1) { + return Ok(Transformed::yes(expr::Expr::Literal( + scalar.clone(), + None, + ))); + } + } + } + } + Ok(Transformed::no(e)) + }) + .map_err(|e| { + PlanError::invalid(format!("IDENTIFIER placeholder substitution failed: {e}")) + })? + .data; + let evaluator = LiteralEvaluator::new(); + // Any placeholder that was not substituted above (e.g. because it had no + // matching parameter) will cause the evaluation to fail here, since the + // LiteralEvaluator cannot constant-fold an unresolved placeholder expression. 
+ let scalar = evaluator.evaluate(&expr).map_err(|e| { + PlanError::invalid(format!("IDENTIFIER expression must be a constant: {e}")) + })?; + match scalar { + ScalarValue::Utf8(Some(s)) + | ScalarValue::LargeUtf8(Some(s)) + | ScalarValue::Utf8View(Some(s)) => Ok(s), + _ => Err(PlanError::invalid( + "IDENTIFIER expression must evaluate to a string", + )), + } + } + pub(super) async fn resolve_expression_table( &self, expr: spec::Expr, @@ -68,12 +144,12 @@ impl PlanResolver<'_> { plan_id: None, is_metadata_column: false, } => spec::QueryPlan::new(spec::QueryNode::Read { - read_type: spec::ReadType::NamedTable(spec::ReadNamedTable { + read_type: spec::ReadType::NamedTable(Box::new(spec::ReadNamedTable { name, temporal: None, sample: None, options: vec![], - }), + })), is_streaming: false, }), _ => { diff --git a/crates/sail-plan/src/resolver/expression/mod.rs b/crates/sail-plan/src/resolver/expression/mod.rs index e6307e2f61..d1fd2b09b0 100644 --- a/crates/sail-plan/src/resolver/expression/mod.rs +++ b/crates/sail-plan/src/resolver/expression/mod.rs @@ -320,6 +320,10 @@ impl PlanResolver<'_> { timestamp_type, } => self.resolve_expression_timestamp(value, timestamp_type, state), Expr::UnresolvedTime { value } => self.resolve_expression_time(value, state), + Expr::IdentifierClause { expr } => { + self.resolve_expression_identifier_clause(*expr, schema, state) + .await + } } } @@ -386,6 +390,7 @@ impl PlanResolver<'_> { #[cfg(test)] mod tests { + use std::collections::HashMap; use std::sync::Arc; use datafusion::execution::SessionStateBuilder; @@ -393,6 +398,9 @@ mod tests { use datafusion_common::{DFSchema, ScalarValue}; use datafusion_expr::expr::{Alias, Expr}; use datafusion_expr::{BinaryExpr, Operator}; + use sail_catalog::manager::{CatalogManager, CatalogManagerOptions}; + use sail_catalog::provider::CatalogProvider; + use sail_catalog_memory::MemoryCatalogProvider; use sail_common::spec; use sail_common_datafusion::catalog::display::DefaultCatalogDisplay; use 
sail_common_datafusion::session::plan::PlanService; @@ -405,14 +413,33 @@ mod tests { use crate::resolver::state::PlanResolverState; use crate::resolver::PlanResolver; - #[tokio::test] - async fn test_resolve_expression_with_name() -> PlanResult<()> { + fn create_session() -> PlanResult { let mut state = SessionStateBuilder::new().build(); - state.config_mut().set_extension(Arc::new(PlanService::new( + let catalog_manager = CatalogManager::try_new(CatalogManagerOptions { + catalogs: HashMap::from([( + "sail".to_string(), + Arc::new(MemoryCatalogProvider::new( + "sail".to_string(), + vec![Arc::from("default")].try_into()?, + None, + )) as Arc, + )]), + default_catalog: "sail".to_string(), + default_database: vec!["default".to_string()], + global_temporary_database: vec!["global_temp".to_string()], + })?; + let plan_service = PlanService::new( Box::new(DefaultCatalogDisplay::::default()), Box::new(SparkPlanFormatter), - ))); - let ctx = SessionContext::new_with_state(state); + ); + state.config_mut().set_extension(Arc::new(catalog_manager)); + state.config_mut().set_extension(Arc::new(plan_service)); + Ok(SessionContext::new_with_state(state)) + } + + #[tokio::test] + async fn test_resolve_expression_with_name() -> PlanResult<()> { + let ctx = create_session()?; let resolver = PlanResolver::new(&ctx, Arc::new(PlanConfig::new()?)); async fn resolve(resolver: &PlanResolver<'_>, expr: spec::Expr) -> PlanResult { @@ -520,4 +547,98 @@ mod tests { Ok(()) } + + fn assert_metadata_value( + metadata_map: &HashMap, + key: &str, + expected_value: &str, + ) { + assert_eq!( + metadata_map.get(key), + Some(&expected_value.to_string()), + "Expected {} in metadata, got: {:?}", + key, + metadata_map + ); + } + + #[tokio::test] + async fn test_st_geomfromwkb_returns_geometry_metadata() -> PlanResult<()> { + let ctx = create_session()?; + let resolver = PlanResolver::new(&ctx, Arc::new(PlanConfig::new()?)); + + let result = resolver + .resolve_named_expression( + 
spec::Expr::UnresolvedFunction(spec::UnresolvedFunction { + function_name: spec::ObjectName::bare("st_geomfromwkb"), + arguments: vec![spec::Expr::Literal(spec::Literal::Binary { + value: Some(vec![ + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 240, 255, 63, 0, 0, 0, 0, 0, 0, 64, + ]), + })], + named_arguments: vec![], + is_distinct: false, + is_user_defined_function: false, + is_internal: None, + ignore_nulls: None, + filter: None, + order_by: None, + }), + &Arc::new(DFSchema::empty()), + &mut PlanResolverState::new(), + ) + .await?; + + let metadata: Vec<(String, String)> = result.metadata.iter().as_slice().to_vec(); + let metadata_map: HashMap<_, _> = metadata.clone().into_iter().collect(); + + assert_metadata_value(&metadata_map, "ARROW:extension:name", "geoarrow.wkb"); + assert_metadata_value( + &metadata_map, + "ARROW:extension:metadata", + r#"{"crs":"SRID:0"}"#, + ); + + Ok(()) + } + + #[tokio::test] + async fn test_st_geogfromwkb_returns_geography_metadata() -> PlanResult<()> { + let ctx = create_session()?; + let resolver = PlanResolver::new(&ctx, Arc::new(PlanConfig::new()?)); + + let result = resolver + .resolve_named_expression( + spec::Expr::UnresolvedFunction(spec::UnresolvedFunction { + function_name: spec::ObjectName::bare("st_geogfromwkb"), + arguments: vec![spec::Expr::Literal(spec::Literal::Binary { + value: Some(vec![ + 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 240, 255, 63, 0, 0, 0, 0, 0, 0, 64, + ]), + })], + named_arguments: vec![], + is_distinct: false, + is_user_defined_function: false, + is_internal: None, + ignore_nulls: None, + filter: None, + order_by: None, + }), + &Arc::new(DFSchema::empty()), + &mut PlanResolverState::new(), + ) + .await?; + + let metadata: Vec<(String, String)> = result.metadata.iter().as_slice().to_vec(); + let metadata_map: HashMap<_, _> = metadata.clone().into_iter().collect(); + + assert_metadata_value(&metadata_map, "ARROW:extension:name", "geoarrow.wkb"); + assert_metadata_value( + &metadata_map, + "ARROW:extension:metadata", + 
r#"{"crs":"OGC:CRS84","edges":"spherical"}"#, + ); + + Ok(()) + } } diff --git a/crates/sail-plan/src/resolver/expression/window.rs b/crates/sail-plan/src/resolver/expression/window.rs index 7cefa1e0e7..d535afdfc7 100644 --- a/crates/sail-plan/src/resolver/expression/window.rs +++ b/crates/sail-plan/src/resolver/expression/window.rs @@ -2,13 +2,16 @@ use std::cmp::Ordering; use std::sync::Arc; use arrow::datatypes::DataType; +use datafusion::optimizer::simplify_expressions::ExprSimplifier; use datafusion_common::{DFSchemaRef, DataFusionError, ScalarValue}; use datafusion_expr::expr::WindowFunctionParams; +use datafusion_expr::simplify::SimplifyContext; use datafusion_expr::{ expr, AggregateUDF, ExprSchemable, WindowFrame, WindowFrameBound, WindowFrameUnits, }; use sail_common::spec; use sail_common_datafusion::extension::SessionExtensionAccessor; +use sail_common_datafusion::literal::LiteralEvaluator; use sail_common_datafusion::session::plan::PlanService; use sail_common_datafusion::utils::items::ItemTaker; use sail_python_udf::cereal::pyspark_udf::PySparkUdfPayload; @@ -52,7 +55,8 @@ impl PlanResolver<'_> { .resolve_sort_orders(order_by, false, schema, state) .await?; let window_frame = if let Some(frame) = frame { - self.resolve_window_frame(frame, &sorts, schema, state)? + self.resolve_window_frame(frame, &sorts, schema, state) + .await? 
} else { WindowFrame::new(if sorts.is_empty() { None @@ -184,7 +188,7 @@ impl PlanResolver<'_> { Ok(NamedExpr::new(vec![name], window)) } - fn resolve_window_frame( + async fn resolve_window_frame( &self, frame: spec::WindowFrame, order_by: &[expr::Sort], @@ -205,31 +209,55 @@ impl PlanResolver<'_> { }; let (start, end) = match units { WindowFrameUnits::Rows | WindowFrameUnits::Groups => ( - self.resolve_window_boundary_offset(lower, state)?, - self.resolve_window_boundary_offset(upper, state)?, + self.resolve_window_boundary_offset(lower, schema, state) + .await?, + self.resolve_window_boundary_offset(upper, schema, state) + .await?, ), WindowFrameUnits::Range => ( - self.resolve_window_boundary_value(lower, order_by, schema, state)?, - self.resolve_window_boundary_value(upper, order_by, schema, state)?, + self.resolve_window_boundary_value(lower, order_by, schema, state) + .await?, + self.resolve_window_boundary_value(upper, order_by, schema, state) + .await?, ), }; Ok(WindowFrame::new_bounds(units, start, end)) } - fn resolve_window_boundary( + async fn resolve_window_boundary( &self, expr: spec::Expr, + schema: &DFSchemaRef, state: &mut PlanResolverState, ) -> PlanResult { - let spec::Expr::Literal(value) = expr else { - return Err(PlanError::invalid("window boundary must be a literal")); - }; - self.resolve_literal(value, state) + if let spec::Expr::Literal(value) = expr { + return self.resolve_literal(value, state); + } + let resolved = self.resolve_expression(expr, schema, state).await?; + if let datafusion_expr::Expr::Literal(scalar, _) = resolved { + return Ok(scalar); + } + // Apply type coercion so that expressions like `CAST(0 AS INTERVAL SECOND)` + // have compatible types before physical evaluation. 
+ let context = SimplifyContext::default().with_schema(schema.clone()); + let simplifier = ExprSimplifier::new(context); + let coerced = simplifier.coerce(resolved, schema).map_err(|e| { + PlanError::invalid(format!( + "window boundary must be a constant expression: {e}" + )) + })?; + let evaluator = LiteralEvaluator::new(); + evaluator.evaluate(&coerced).map_err(|e| { + PlanError::invalid(format!( + "window boundary must be a constant expression: {e}" + )) + }) } - fn resolve_window_boundary_offset( + async fn resolve_window_boundary_offset( &self, value: spec::WindowFrameBoundary, + schema: &DFSchemaRef, state: &mut PlanResolverState, ) -> PlanResult { match value { @@ -241,19 +269,19 @@ impl PlanResolver<'_> { Ok(WindowFrameBound::Following(ScalarValue::UInt64(None))) } spec::WindowFrameBoundary::Preceding(expr) => { - let value = self.resolve_window_boundary(*expr, state)?; + let value = self.resolve_window_boundary(*expr, schema, state).await?; Ok(WindowFrameBound::Preceding( value.cast_to(&DataType::UInt64)?, )) } spec::WindowFrameBoundary::Following(expr) => { - let value = self.resolve_window_boundary(*expr, state)?; + let value = self.resolve_window_boundary(*expr, schema, state).await?; Ok(WindowFrameBound::Following( value.cast_to(&DataType::UInt64)?, )) } spec::WindowFrameBoundary::Value(expr) => { - let value = self.resolve_window_boundary(*expr, state)?; + let value = self.resolve_window_boundary(*expr, schema, state).await?; let ScalarValue::Int64(Some(value)) = value.cast_to(&DataType::Int64)? 
else { return Err(PlanError::invalid("invalid window boundary offset")); }; @@ -272,7 +300,7 @@ impl PlanResolver<'_> { } } - fn resolve_window_boundary_value( + async fn resolve_window_boundary_value( &self, value: spec::WindowFrameBoundary, order_by: &[expr::Sort], @@ -298,7 +326,7 @@ impl PlanResolver<'_> { Ok(WindowFrameBound::Following(ScalarValue::Null)) } spec::WindowFrameBoundary::Preceding(expr) => { - let value = self.resolve_window_boundary(*expr, state)?; + let value = self.resolve_window_boundary(*expr, schema, state).await?; // Cast numeric boundaries to match the ORDER BY type. // Non-numeric boundaries (e.g. INTERVAL for TIMESTAMP ORDER BY) are left as-is // since DataFusion handles interval arithmetic directly. @@ -311,7 +339,7 @@ impl PlanResolver<'_> { Ok(WindowFrameBound::Preceding(value)) } spec::WindowFrameBoundary::Following(expr) => { - let value = self.resolve_window_boundary(*expr, state)?; + let value = self.resolve_window_boundary(*expr, schema, state).await?; let data_type = get_order_by_type()?; let value = if data_type.is_numeric() { value.cast_to(&data_type)? @@ -321,7 +349,7 @@ impl PlanResolver<'_> { Ok(WindowFrameBound::Following(value)) } spec::WindowFrameBoundary::Value(expr) => { - let value = self.resolve_window_boundary(*expr, state)?; + let value = self.resolve_window_boundary(*expr, schema, state).await?; if value.is_null() { Err(PlanError::invalid("window boundary value cannot be null")) } else { diff --git a/crates/sail-plan/src/resolver/plan.rs b/crates/sail-plan/src/resolver/plan.rs index a4872863a6..79f32cb426 100644 --- a/crates/sail-plan/src/resolver/plan.rs +++ b/crates/sail-plan/src/resolver/plan.rs @@ -14,6 +14,7 @@ pub struct NamedPlan { } impl PlanResolver<'_> { + /// Resolves a plan into a named plan. 
pub async fn resolve_named_plan(&self, plan: spec::Plan) -> PlanResult { let mut state = PlanResolverState::new(); match plan { diff --git a/crates/sail-plan/src/resolver/query/lateral.rs b/crates/sail-plan/src/resolver/query/lateral.rs index 2ff39ce4b2..3b870810b8 100644 --- a/crates/sail-plan/src/resolver/query/lateral.rs +++ b/crates/sail-plan/src/resolver/query/lateral.rs @@ -1,8 +1,9 @@ use std::sync::Arc; -use datafusion_expr::registry::FunctionRegistry; use datafusion_expr::{Expr, LogicalPlan, Projection}; +use sail_catalog::manager::CatalogManager; use sail_common::spec; +use sail_common_datafusion::extension::SessionExtensionAccessor; use sail_common_datafusion::utils::items::ItemTaker; use sail_python_udf::udf::pyspark_unresolved_udf::PySparkUnresolvedUDF; @@ -33,9 +34,10 @@ impl PlanResolver<'_> { )); }; let canonical_function_name = function_name.to_ascii_lowercase(); + let catalog_manager = self.ctx.extension::()?; let mut scope = state.enter_config_scope(); let state = scope.state(); - if let Ok(f) = self.ctx.udf(&canonical_function_name) { + if let Some(f) = catalog_manager.get_function(&canonical_function_name)? { if f.inner().as_any().is::() { state.config_mut().arrow_allow_large_var_types = true; } @@ -46,7 +48,7 @@ impl PlanResolver<'_> { }; let schema = input.schema().clone(); - if let Ok(f) = self.ctx.udf(&canonical_function_name) { + if let Some(f) = catalog_manager.get_function(&canonical_function_name)? 
{ if let Some(f) = f.inner().as_any().downcast_ref::() { if !f.eval_type().is_table_function() { return Err(PlanError::invalid(format!( diff --git a/crates/sail-plan/src/resolver/query/misc.rs b/crates/sail-plan/src/resolver/query/misc.rs index 27a1c76c00..a49c825d8c 100644 --- a/crates/sail-plan/src/resolver/query/misc.rs +++ b/crates/sail-plan/src/resolver/query/misc.rs @@ -58,38 +58,47 @@ impl PlanResolver<'_> { ) -> PlanResult { let evaluator = LiteralEvaluator::new(); let schema = Arc::new(DFSchema::empty()); + // Evaluate named arguments eagerly so that IDENTIFIER(:col) expressions + // inside the query body can substitute their placeholder values at plan-resolution + // time (before `with_param_values` is applied to the resolved plan). + let named_params = { + let mut params = HashMap::new(); + for (name, arg) in named { + let expr = self.resolve_expression(arg, &schema, state).await?; + let param = evaluator + .evaluate(&expr) + .map_err(|e| PlanError::invalid(e.to_string()))?; + params.insert(name, param); + } + params + }; + // Evaluate positional arguments eagerly for the same reason. + let positional_params = { + let mut params = vec![]; + for arg in positional { + let expr = self.resolve_expression(arg, &schema, state).await?; + let param = evaluator + .evaluate(&expr) + .map_err(|e| PlanError::invalid(e.to_string()))?; + params.push(param); + } + params + }; + // Enter a scope that makes both named and positional parameter values + // available for IDENTIFIER clause evaluation inside the query body. 
+ let mut scope = + state.enter_param_values_scope(named_params.clone(), positional_params.clone()); + let state = scope.state(); let input = self .resolve_query_plan_with_hidden_fields(input, state) .await?; - let input = if !positional.is_empty() { - let params = { - let mut params = vec![]; - for arg in positional { - let expr = self.resolve_expression(arg, &schema, state).await?; - let param = evaluator - .evaluate(&expr) - .map_err(|e| PlanError::invalid(e.to_string()))?; - params.push(param); - } - params - }; - input.with_param_values(ParamValues::from(params))? + let input = if !positional_params.is_empty() { + input.with_param_values(ParamValues::from(positional_params))? } else { input }; - if !named.is_empty() { - let params = { - let mut params = HashMap::new(); - for (name, arg) in named { - let expr = self.resolve_expression(arg, &schema, state).await?; - let param = evaluator - .evaluate(&expr) - .map_err(|e| PlanError::invalid(e.to_string()))?; - params.insert(name, param); - } - params - }; - Ok(input.with_param_values(ParamValues::from(params))?) + if !named_params.is_empty() { + Ok(input.with_param_values(ParamValues::from(named_params))?) } else { Ok(input) } diff --git a/crates/sail-plan/src/resolver/query/mod.rs b/crates/sail-plan/src/resolver/query/mod.rs index 5d393b6489..1355513b67 100644 --- a/crates/sail-plan/src/resolver/query/mod.rs +++ b/crates/sail-plan/src/resolver/query/mod.rs @@ -26,6 +26,7 @@ mod sample; mod set_op; mod sort; mod stat; +mod time_travel; mod udf; mod udtf; mod values; @@ -65,11 +66,14 @@ impl PlanResolver<'_> { is_streaming: _, } => match read_type { spec::ReadType::NamedTable(table) => { - self.resolve_query_read_named_table(table, state).await? + self.resolve_query_read_named_table(*table, state).await? 
} - spec::ReadType::Udtf(udtf) => self.resolve_query_read_udtf(udtf, state).await?, + spec::ReadType::Udtf(udtf) => self.resolve_query_read_udtf(*udtf, state).await?, spec::ReadType::DataSource(source) => { - self.resolve_query_read_data_source(source, state).await? + self.resolve_query_read_data_source(*source, state).await? + } + spec::ReadType::DynamicTable(table) => { + self.resolve_query_read_dynamic_table(*table, state).await? } }, QueryNode::Project { input, expressions } => { diff --git a/crates/sail-plan/src/resolver/query/read.rs b/crates/sail-plan/src/resolver/query/read.rs index dea8d7a9d2..df915925fb 100644 --- a/crates/sail-plan/src/resolver/query/read.rs +++ b/crates/sail-plan/src/resolver/query/read.rs @@ -4,9 +4,8 @@ use std::sync::Arc; use datafusion::arrow::datatypes::{DataType, Schema}; use datafusion::datasource::{provider_as_source, source_as_provider, TableProvider}; use datafusion_common::{DFSchema, ScalarValue, TableReference}; -use datafusion_expr::registry::FunctionRegistry; use datafusion_expr::{Expr, LogicalPlan, TableScan, TableSource, UNNAMED_TABLE}; -use rand::{rng, Rng}; +use rand::{rng, RngExt}; use sail_catalog::manager::CatalogManager; use sail_common::spec; use sail_common_datafusion::catalog::TableKind; @@ -36,12 +35,39 @@ impl PlanResolver<'_> { sample, options, } = table; - if temporal.is_some() { - return Err(PlanError::todo("read table AS OF clause")); + + // Check if the name is in the form `.` where `` is a + // registered table format. In that case, treat it as a direct data source read. 
+ if let [format, path] = name.parts() { + let format = format.as_ref().to_ascii_lowercase(); + let registry = self.ctx.extension::()?; + if registry.get(&format).is_ok() { + let temporal_options = self + .resolve_time_travel_options(&format, temporal, state) + .await?; + let source = spec::ReadDataSource { + format: Some(format), + schema: None, + options: options.into_iter().chain(temporal_options).collect(), + paths: vec![path.as_ref().to_string()], + predicates: vec![], + }; + let plan = self.resolve_query_read_data_source(source, state).await?; + return if let Some(table_sample) = sample { + self.apply_table_sample(plan, table_sample, state).await + } else { + Ok(plan) + }; + } } let table_reference = self.resolve_table_reference(&name)?; if let Some(cte) = state.get_cte(&table_reference) { + if temporal.is_some() { + return Err(PlanError::unsupported( + "SQL time travel is not supported for CTEs", + )); + } let plan = cte.clone(); return if let Some(table_sample) = sample { self.apply_table_sample(plan, table_sample, state).await @@ -71,17 +97,21 @@ impl PlanResolver<'_> { } => { let schema = Schema::new(columns.iter().map(|x| x.field()).collect::>()); let constraints = self.resolve_catalog_table_constraints(constraints, &schema)?; + let temporal_options = self + .resolve_time_travel_options(&format, temporal, state) + .await?; let info = SourceInfo { paths: location.map(|x| vec![x]).unwrap_or_default(), schema: Some(schema), constraints, - partition_by, + partition_by: partition_by.into_iter().map(|field| field.column).collect(), bucket_by: bucket_by.map(|x| x.into()), sort_order: sort_by.into_iter().map(|x| x.into()).collect(), // TODO: detect duplicated keys in each set of options options: vec![ table_options.into_iter().collect(), options.into_iter().collect(), + temporal_options.into_iter().collect(), ], }; let registry = self.ctx.extension::()?; @@ -98,8 +128,20 @@ impl PlanResolver<'_> { state, )? } - TableKind::View { .. 
} => return Err(PlanError::todo("read view")), + TableKind::View { .. } => { + if temporal.is_some() { + return Err(PlanError::unsupported( + "SQL time travel is not supported for views", + )); + } + return Err(PlanError::todo("read view")); + } TableKind::TemporaryView { plan, .. } | TableKind::GlobalTemporaryView { plan, .. } => { + if temporal.is_some() { + return Err(PlanError::unsupported( + "SQL time travel is not supported for temporary views", + )); + } let names = state.register_fields(plan.schema().inner().fields()); rename_logical_plan(plan.as_ref().clone(), &names)? } @@ -112,6 +154,34 @@ impl PlanResolver<'_> { } } + pub(super) async fn resolve_query_read_dynamic_table( + &self, + table: spec::ReadDynamicTable, + state: &mut PlanResolverState, + ) -> PlanResult { + let spec::ReadDynamicTable { + name, + sample, + options, + } = table; + let schema = Arc::new(DFSchema::empty()); + let resolved = self.resolve_expression(name, &schema, state).await?; + let name_str = self.evaluate_identifier_expr(resolved, state)?; + let name = sail_sql_analyzer::expression::from_ast_object_name( + sail_sql_analyzer::parser::parse_object_name(&name_str)?, + )?; + self.resolve_query_read_named_table( + spec::ReadNamedTable { + name, + temporal: None, + sample, + options, + }, + state, + ) + .await + } + /// Apply TABLESAMPLE clause to a LogicalPlan pub(super) async fn apply_table_sample( &self, @@ -223,7 +293,8 @@ impl PlanResolver<'_> { }); self.resolve_query_project(None, vec![expr], state).await } else { - let udf = self.ctx.udf(&canonical_function_name).ok(); + let catalog_manager = self.ctx.extension::()?; + let udf = catalog_manager.get_function(&canonical_function_name)?; if let Some(f) = udf .as_ref() .and_then(|x| x.inner().as_any().downcast_ref::()) diff --git a/crates/sail-plan/src/resolver/query/sample.rs b/crates/sail-plan/src/resolver/query/sample.rs index 2650c422f8..0adff1f88a 100644 --- a/crates/sail-plan/src/resolver/query/sample.rs +++ 
b/crates/sail-plan/src/resolver/query/sample.rs @@ -5,7 +5,7 @@ use datafusion_common::ScalarValue; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::select_expr::SelectExpr; use datafusion_expr::{col, lit, Expr, Extension, LogicalPlan, LogicalPlanBuilder, ScalarUDF}; -use rand::{rng, Rng}; +use rand::{rng, RngExt}; use sail_common::spec; use sail_common::spec::{NullOrdering, SortDirection, SortOrder}; use sail_function::scalar::array::spark_sequence::SparkSequence; diff --git a/crates/sail-plan/src/resolver/query/time_travel.rs b/crates/sail-plan/src/resolver/query/time_travel.rs new file mode 100644 index 0000000000..19e7ca8aa2 --- /dev/null +++ b/crates/sail-plan/src/resolver/query/time_travel.rs @@ -0,0 +1,357 @@ +use std::sync::Arc; + +use chrono::{SecondsFormat, TimeZone, Utc}; +use datafusion::arrow::datatypes::{DataType, TimeUnit}; +use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRecursion}; +use datafusion_common::{DFSchema, DFSchemaRef, ScalarValue}; +use datafusion_expr::{EmptyRelation, Expr, Limit, LogicalPlan, Projection}; +use sail_common::spec; + +use crate::error::{PlanError, PlanResult}; +use crate::resolver::state::PlanResolverState; +use crate::resolver::PlanResolver; + +impl PlanResolver<'_> { + pub(super) async fn resolve_time_travel_options( + &self, + format: &str, + temporal: Option, + state: &mut PlanResolverState, + ) -> PlanResult> { + let Some(temporal) = temporal else { + return Ok(vec![]); + }; + match format.to_ascii_lowercase().as_str() { + "delta" => { + self.resolve_delta_time_travel_options(temporal, state) + .await + } + "iceberg" => { + self.resolve_iceberg_time_travel_options(temporal, state) + .await + } + other => Err(PlanError::unsupported(format!( + "SQL time travel is only supported for Delta and Iceberg tables, got {other}", + ))), + } + } + + async fn resolve_delta_time_travel_options( + &self, + temporal: spec::TableTemporal, + state: &mut PlanResolverState, + ) -> 
PlanResult> { + match temporal { + spec::TableTemporal::Timestamp { value } => Ok(vec![( + "timestampAsOf".to_string(), + self.evaluate_time_travel_timestamp(value, state).await?, + )]), + spec::TableTemporal::Version { value } => Ok(vec![( + "versionAsOf".to_string(), + self.evaluate_time_travel_version_i64(value, "version", state) + .await? + .to_string(), + )]), + } + } + + async fn resolve_iceberg_time_travel_options( + &self, + temporal: spec::TableTemporal, + state: &mut PlanResolverState, + ) -> PlanResult> { + match temporal { + spec::TableTemporal::Timestamp { value } => Ok(vec![( + "timestampAsOf".to_string(), + self.evaluate_time_travel_timestamp(value, state).await?, + )]), + spec::TableTemporal::Version { value } => { + match self + .evaluate_time_travel_version_for_iceberg(value, state) + .await? + { + IcebergVersionAsOf::SnapshotId(snapshot_id) => { + Ok(vec![("snapshotId".to_string(), snapshot_id.to_string())]) + } + IcebergVersionAsOf::Reference(reference) => { + Ok(vec![("ref".to_string(), reference)]) + } + } + } + } + } + + async fn evaluate_time_travel_timestamp( + &self, + expr: spec::Expr, + state: &mut PlanResolverState, + ) -> PlanResult { + let resolved = self + .resolve_time_travel_expression(expr, "timestamp", state) + .await?; + let timestamp = Expr::Cast(datafusion_expr::expr::Cast { + expr: Box::new(resolved), + data_type: DataType::Timestamp( + TimeUnit::Microsecond, + self.resolve_timezone(&spec::TimestampType::Configured)?, + ), + }); + let scalar = self.execute_time_travel_scalar(timestamp).await?; + Self::normalize_time_travel_timestamp_scalar(scalar) + } + + async fn evaluate_time_travel_version_i64( + &self, + expr: spec::Expr, + kind: &str, + state: &mut PlanResolverState, + ) -> PlanResult { + let resolved = self + .resolve_time_travel_expression(expr, kind, state) + .await?; + let scalar = self.execute_time_travel_scalar(resolved).await?; + Self::scalar_to_time_travel_i64(&scalar, kind) + } + + async fn 
evaluate_time_travel_version_for_iceberg( + &self, + expr: spec::Expr, + state: &mut PlanResolverState, + ) -> PlanResult { + let resolved = self + .resolve_time_travel_expression(expr, "version", state) + .await?; + let scalar = self.execute_time_travel_scalar(resolved).await?; + match scalar { + ScalarValue::Utf8(Some(value)) + | ScalarValue::LargeUtf8(Some(value)) + | ScalarValue::Utf8View(Some(value)) => { + let value = value.trim().to_string(); + match value.parse::() { + Ok(snapshot_id) => Ok(IcebergVersionAsOf::SnapshotId(snapshot_id)), + Err(_) => Ok(IcebergVersionAsOf::Reference(value)), + } + } + _ => Ok(IcebergVersionAsOf::SnapshotId( + Self::scalar_to_time_travel_i64(&scalar, "version")?, + )), + } + } + + async fn resolve_time_travel_expression( + &self, + expr: spec::Expr, + kind: &str, + state: &mut PlanResolverState, + ) -> PlanResult { + let schema = Arc::new(DFSchema::empty()); + let resolved = self + .resolve_expression(expr, &schema, state) + .await + .map_err(|e| Self::invalid_time_travel_spec(kind, e))?; + self.validate_time_travel_expression(&resolved, kind)?; + Ok(resolved) + } + + // TODO: Extract as general utilities and incorporated into LiteralEvaluator. 
+ fn validate_time_travel_expression(&self, expr: &Expr, kind: &str) -> PlanResult<()> { + if expr.any_column_refs() { + return Err(Self::invalid_time_travel_message( + kind, + "expression cannot refer to any columns", + )); + } + if expr.contains_outer() { + return Err(Self::invalid_time_travel_message( + kind, + "expression cannot refer to any outer columns", + )); + } + if expr.is_volatile() { + return Err(Self::invalid_time_travel_message( + kind, + "expression must be deterministic", + )); + } + let mut has_correlated_subquery = false; + let mut has_volatile_subquery = false; + expr.apply(|nested| { + if let Expr::ScalarSubquery(subquery) = nested { + if !subquery.outer_ref_columns.is_empty() + || subquery.subquery.contains_outer_reference() + { + has_correlated_subquery = true; + return Ok(TreeNodeRecursion::Stop); + } + if Self::logical_plan_has_volatile_expressions(&subquery.subquery)? { + has_volatile_subquery = true; + return Ok(TreeNodeRecursion::Stop); + } + } + Ok(TreeNodeRecursion::Continue) + })?; + if has_correlated_subquery { + return Err(Self::invalid_time_travel_message( + kind, + "scalar subquery cannot be correlated", + )); + } + if has_volatile_subquery { + return Err(Self::invalid_time_travel_message( + kind, + "expression must be deterministic", + )); + } + Ok(()) + } + + fn logical_plan_has_volatile_expressions( + plan: &LogicalPlan, + ) -> datafusion_common::Result { + let mut contains = false; + plan.apply(|node| { + node.apply_expressions(|expr| { + if expr.is_volatile() { + contains = true; + Ok(TreeNodeRecursion::Stop) + } else { + Ok(TreeNodeRecursion::Continue) + } + })?; + Ok(if contains { + TreeNodeRecursion::Stop + } else { + TreeNodeRecursion::Continue + }) + })?; + Ok(contains) + } + + async fn execute_time_travel_scalar(&self, expr: Expr) -> PlanResult { + let expr = Self::cap_time_travel_scalar_subqueries(expr)?; + let plan = Self::build_time_travel_scalar_plan(expr)?; + let batches = 
self.ctx.execute_logical_plan(plan).await?.collect().await?; + let mut total_rows = 0usize; + let mut value = None; + for batch in batches { + total_rows += batch.num_rows(); + if value.is_none() && batch.num_rows() > 0 { + value = Some(ScalarValue::try_from_array(batch.column(0).as_ref(), 0)?); + } + } + match (total_rows, value) { + (1, Some(value)) => Ok(value), + (rows, _) => Err(PlanError::invalid(format!( + "Invalid time travel spec: expression must evaluate to a single value, got {rows} rows", + ))), + } + } + + fn cap_time_travel_scalar_subqueries(expr: Expr) -> datafusion_common::Result { + expr.transform(|nested| match nested { + Expr::ScalarSubquery(subquery) => Ok(Transformed::yes(Expr::ScalarSubquery( + datafusion_expr::logical_plan::Subquery { + subquery: Arc::new(Self::limit_plan_to_two_rows(Arc::unwrap_or_clone( + subquery.subquery, + ))), + outer_ref_columns: subquery.outer_ref_columns, + spans: subquery.spans, + }, + ))), + _ => Ok(Transformed::no(nested)), + }) + .data() + } + + fn build_time_travel_scalar_plan(expr: Expr) -> PlanResult { + let projection = LogicalPlan::Projection(Projection::try_new( + vec![expr], + Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: true, + schema: DFSchemaRef::new(DFSchema::empty()), + })), + )?); + Ok(Self::limit_plan_to_two_rows(projection)) + } + + fn limit_plan_to_two_rows(input: LogicalPlan) -> LogicalPlan { + LogicalPlan::Limit(Limit { + skip: None, + fetch: Some(Box::new(Expr::Literal(ScalarValue::Int64(Some(2)), None))), + input: Arc::new(input), + }) + } + + fn normalize_time_travel_timestamp_scalar(scalar: ScalarValue) -> PlanResult { + match scalar { + ScalarValue::TimestampMicrosecond(Some(value), Some(_)) => { + let datetime = Utc.timestamp_micros(value).single().ok_or_else(|| { + PlanError::invalid("Invalid time travel spec: timestamp is out of range") + })?; + Ok(datetime.to_rfc3339_opts(SecondsFormat::Micros, true)) + } + ScalarValue::TimestampMicrosecond(Some(value), None) 
=> { + let datetime = Utc.timestamp_micros(value).single().ok_or_else(|| { + PlanError::invalid("Invalid time travel spec: timestamp is out of range") + })?; + Ok(datetime + .naive_utc() + .format("%Y-%m-%d %H:%M:%S%.6f") + .to_string()) + } + ScalarValue::TimestampMicrosecond(None, _) => Err(PlanError::invalid( + "Invalid time travel spec: timestamp expression evaluated to NULL", + )), + other => Err(PlanError::invalid(format!( + "Invalid time travel spec: timestamp expression must evaluate to a timestamp, got {other:?}", + ))), + } + } + + fn scalar_to_time_travel_i64(scalar: &ScalarValue, kind: &str) -> PlanResult { + match scalar { + ScalarValue::Int8(Some(value)) => Ok(i64::from(*value)), + ScalarValue::Int16(Some(value)) => Ok(i64::from(*value)), + ScalarValue::Int32(Some(value)) => Ok(i64::from(*value)), + ScalarValue::Int64(Some(value)) => Ok(*value), + ScalarValue::UInt8(Some(value)) => Ok(i64::from(*value)), + ScalarValue::UInt16(Some(value)) => Ok(i64::from(*value)), + ScalarValue::UInt32(Some(value)) => Ok(i64::from(*value)), + ScalarValue::UInt64(Some(value)) => i64::try_from(*value).map_err(|_| { + Self::invalid_time_travel_message(kind, "integer value is out of range") + }), + ScalarValue::Utf8(Some(value)) + | ScalarValue::LargeUtf8(Some(value)) + | ScalarValue::Utf8View(Some(value)) => value.trim().parse::().map_err(|_| { + Self::invalid_time_travel_message( + kind, + "expression must evaluate to an integer or numeric string", + ) + }), + ScalarValue::Null => Err(Self::invalid_time_travel_message( + kind, + "expression evaluated to NULL", + )), + other => Err(Self::invalid_time_travel_message( + kind, + format!("expression must evaluate to an integer, got {other:?}"), + )), + } + } + + fn invalid_time_travel_spec(kind: &str, error: impl std::fmt::Display) -> PlanError { + Self::invalid_time_travel_message(kind, format!("failed to resolve expression: {error}")) + } + + fn invalid_time_travel_message(kind: &str, message: impl Into) -> PlanError { + 
PlanError::invalid(format!( + "Invalid time travel spec: {kind} {}.", + message.into() + )) + } +} + +enum IcebergVersionAsOf { + SnapshotId(i64), + Reference(String), +} diff --git a/crates/sail-plan/src/resolver/state.rs b/crates/sail-plan/src/resolver/state.rs index d5ee0c3d54..d66fb0790c 100644 --- a/crates/sail-plan/src/resolver/state.rs +++ b/crates/sail-plan/src/resolver/state.rs @@ -2,7 +2,7 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use datafusion_common::arrow::datatypes::Field; -use datafusion_common::{DFSchemaRef, TableReference}; +use datafusion_common::{DFSchemaRef, ScalarValue, TableReference}; use datafusion_expr::LogicalPlan; use sail_common::spec; @@ -78,6 +78,13 @@ pub(super) struct PlanResolverState { /// Unresolved subquery references from a WithRelations node, keyed by plan_id. subquery_references: HashMap, config: PlanResolverStateConfig, + /// Named parameter values available for IDENTIFIER clause evaluation. + /// Set when resolving a `WithParameters` query node so that IDENTIFIER expressions + /// can substitute placeholders before constant-folding them. + param_values: HashMap, + /// Positional parameter values available for IDENTIFIER clause evaluation. + /// Set alongside `param_values` when resolving a `WithParameters` query node. + positional_param_values: Vec, } impl Default for PlanResolverState { @@ -96,6 +103,8 @@ impl PlanResolverState { ctes: HashMap::new(), subquery_references: HashMap::new(), config: PlanResolverStateConfig::default(), + param_values: HashMap::new(), + positional_param_values: Vec::new(), } } @@ -241,6 +250,62 @@ impl PlanResolverState { pub fn config_mut(&mut self) -> &mut PlanResolverStateConfig { &mut self.config } + + /// Returns the named parameter value for the given name, if any. + pub fn get_param_value(&self, name: &str) -> Option<&ScalarValue> { + self.param_values.get(name) + } + + /// Returns the positional parameter value at the given 0-based index, if any. 
+ pub fn get_positional_param_value(&self, index: usize) -> Option<&ScalarValue> { + self.positional_param_values.get(index) + } + + /// Enters a scope where named and positional parameter values are set. + /// The previous parameter values are restored when the scope is dropped. + pub fn enter_param_values_scope( + &mut self, + named: HashMap, + positional: Vec, + ) -> ParamValuesScope<'_> { + ParamValuesScope::new(self, named, positional) + } +} + +/// Scope for parameter values used by IDENTIFIER clause evaluation. +pub(crate) struct ParamValuesScope<'a> { + state: &'a mut PlanResolverState, + previous_param_values: HashMap, + previous_positional_param_values: Vec, +} + +impl<'a> ParamValuesScope<'a> { + fn new( + state: &'a mut PlanResolverState, + named: HashMap, + positional: Vec, + ) -> Self { + let previous_param_values = std::mem::replace(&mut state.param_values, named); + let previous_positional_param_values = + std::mem::replace(&mut state.positional_param_values, positional); + Self { + state, + previous_param_values, + previous_positional_param_values, + } + } + + pub(crate) fn state(&mut self) -> &mut PlanResolverState { + self.state + } +} + +impl Drop for ParamValuesScope<'_> { + fn drop(&mut self) { + self.state.param_values = std::mem::take(&mut self.previous_param_values); + self.state.positional_param_values = + std::mem::take(&mut self.previous_positional_param_values); + } } pub(crate) struct QueryScope<'a> { diff --git a/crates/sail-plan/src/streaming/rewriter.rs b/crates/sail-plan/src/streaming/rewriter.rs index 2e4812a25d..cb3a900d28 100644 --- a/crates/sail-plan/src/streaming/rewriter.rs +++ b/crates/sail-plan/src/streaming/rewriter.rs @@ -14,6 +14,7 @@ use sail_common_datafusion::streaming::event::schema::{ is_flow_event_schema, MARKER_FIELD_NAME, RETRACTED_FIELD_NAME, }; use sail_common_datafusion::streaming::source::{StreamSource, StreamSourceTableProvider}; +use sail_logical_plan::barrier::BarrierNode; use 
sail_logical_plan::file_write::FileWriteNode; use sail_logical_plan::range::RangeNode; use sail_logical_plan::show_string::ShowStringNode; @@ -47,6 +48,9 @@ impl StreamingRewriter { }))) } else if node.as_any().is::() { Ok(Transformed::no(LogicalPlan::Extension(extension))) + } else if node.as_any().is::() { + // TODO: support BarrierNode for streaming properly. + Ok(Transformed::no(LogicalPlan::Extension(extension))) } else { plan_err!("unsupported extension node for streaming: {node:?}") } diff --git a/crates/sail-python-udf/src/config.rs b/crates/sail-python-udf/src/config.rs index 5bf0cba449..6330c0e169 100644 --- a/crates/sail-python-udf/src/config.rs +++ b/crates/sail-python-udf/src/config.rs @@ -1,7 +1,7 @@ use pyo3::pyclass; #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd)] -#[pyclass(frozen)] +#[pyclass(frozen, from_py_object)] pub struct PySparkUdfConfig { #[pyo3(get)] pub session_timezone: String, diff --git a/crates/sail-session/src/catalog.rs b/crates/sail-session/src/catalog.rs index 64627abf2d..de7e81abd2 100644 --- a/crates/sail-session/src/catalog.rs +++ b/crates/sail-session/src/catalog.rs @@ -181,6 +181,6 @@ pub fn create_catalog_manager( default_database: config.catalog.default_database.clone(), global_temporary_database: config.catalog.global_temporary_database.clone(), }; - CatalogManager::new(options) + CatalogManager::try_new(options) .map_err(|e| plan_datafusion_err!("failed to create catalog manager: {e}")) } diff --git a/crates/sail-session/src/optimizer.rs b/crates/sail-session/src/optimizer.rs index 32c484bcb5..39840383f5 100644 --- a/crates/sail-session/src/optimizer.rs +++ b/crates/sail-session/src/optimizer.rs @@ -9,7 +9,11 @@ pub fn default_analyzer_rules() -> Vec> { pub fn default_optimizer_rules() -> Vec> { let Optimizer { rules } = Optimizer::default(); let mut custom = sail_plan_lakehouse::lakehouse_optimizer_rules(); - custom.extend(rules); + custom.extend( + rules + .into_iter() + .filter(|r| r.name() != 
"push_down_leaf_projections"), + ); custom } diff --git a/crates/sail-session/src/planner.rs b/crates/sail-session/src/planner.rs index 1c7abdedbf..961e8f252a 100644 --- a/crates/sail-session/src/planner.rs +++ b/crates/sail-session/src/planner.rs @@ -22,6 +22,7 @@ use sail_common_datafusion::streaming::event::schema::{ to_flow_event_field_names, to_flow_event_projection, }; use sail_delta_lake::logical::RewriteDeltaTableSource; +use sail_logical_plan::barrier::BarrierNode; use sail_logical_plan::file_delete::FileDeleteNode; use sail_logical_plan::file_write::FileWriteNode; use sail_logical_plan::map_partitions::MapPartitionsNode; @@ -37,6 +38,8 @@ use sail_logical_plan::streaming::filter::StreamFilterNode; use sail_logical_plan::streaming::limit::StreamLimitNode; use sail_logical_plan::streaming::source_adapter::StreamSourceAdapterNode; use sail_logical_plan::streaming::source_wrapper::StreamSourceWrapperNode; +use sail_physical_plan::barrier::BarrierExec; +use sail_physical_plan::catalog_command::CatalogCommandExec; use sail_physical_plan::file_delete::create_file_delete_physical_plan; use sail_physical_plan::file_write::create_file_write_physical_plan; use sail_physical_plan::map_partitions::MapPartitionsExec; @@ -49,6 +52,7 @@ use sail_physical_plan::streaming::collector::StreamCollectorExec; use sail_physical_plan::streaming::filter::StreamFilterExec; use sail_physical_plan::streaming::limit::StreamLimitExec; use sail_physical_plan::streaming::source_adapter::StreamSourceAdapterExec; +use sail_plan::catalog::CatalogCommandNode; use sail_plan_lakehouse::new_lakehouse_extension_planners; #[derive(Debug)] @@ -299,6 +303,21 @@ Ensure expand_merge is enabled; MERGE is currently only supported for Delta tabl return internal_err!("StreamCollectorExec requires exactly one physical input"); }; Arc::new(StreamCollectorExec::try_new(input.clone())?) 
+ } else if let Some(node) = node.as_any().downcast_ref::() { + let schema = node.schema().inner().clone(); + Arc::new(CatalogCommandExec::new(node.command().clone(), schema)) + } else if let Some(_node) = node.as_any().downcast_ref::() { + let (plan, preconditions) = physical_inputs.split_last().ok_or_else(|| { + datafusion_common::DataFusionError::Internal(format!( + "{} requires at least one physical input", + BarrierExec::static_name() + )) + })?; + if preconditions.is_empty() { + plan.clone() + } else { + Arc::new(BarrierExec::new(preconditions.to_vec(), plan.clone())) + } } else { return internal_err!("unsupported logical extension node: {:?}", node); }; diff --git a/crates/sail-session/src/session_factory/server.rs b/crates/sail-session/src/session_factory/server.rs index d65cfe8a9b..94b147d034 100644 --- a/crates/sail-session/src/session_factory/server.rs +++ b/crates/sail-session/src/session_factory/server.rs @@ -6,21 +6,21 @@ use datafusion::common::parquet_config::DFParquetWriterVersion; use datafusion::common::{internal_datafusion_err, Result}; use datafusion::execution::runtime_env::RuntimeEnvBuilder; use datafusion::execution::{SessionState, SessionStateBuilder}; +use datafusion::functions_aggregate::first_last::first_value_udaf; use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_expr::registry::FunctionRegistry; use sail_catalog_system::service::SystemTableService; use sail_common::config::{AppConfig, ExecutionMode}; use sail_common::runtime::RuntimeHandle; use sail_common_datafusion::session::activity::ActivityTracker; use sail_common_datafusion::session::job::{JobRunner, JobService}; +use sail_delta_lake::session_extension::DeltaTableCache; use sail_execution::driver::DriverOptions; use sail_execution::job_runner::{ClusterJobRunner, LocalJobRunner}; use sail_execution::worker_manager::{ KubernetesWorkerManager, KubernetesWorkerManagerOptions, LocalWorkerManager, }; use sail_physical_optimizer::{get_physical_optimizers, 
PhysicalOptimizerOptions}; -use sail_plan::function::{ - BUILT_IN_GENERATOR_FUNCTIONS, BUILT_IN_SCALAR_FUNCTIONS, BUILT_IN_TABLE_FUNCTIONS, -}; use sail_server::actor::{ActorHandle, ActorSystem}; use crate::catalog::create_catalog_manager; @@ -87,18 +87,17 @@ impl SessionFactory for ServerSessionFactory { let state = self.create_session_state(&info)?; let context = SessionContext::new_with_state(state); - // TODO: This is a temp workaround to deregister all built-in functions that we define. - // We should deregister all context.udfs() once we have better coverage of functions. - // handler.rs needs to do this - for (&name, _function) in BUILT_IN_SCALAR_FUNCTIONS.iter() { - context.deregister_udf(name); - } - for (&name, _function) in BUILT_IN_GENERATOR_FUNCTIONS.iter() { - context.deregister_udf(name); - } - for (&name, _function) in BUILT_IN_TABLE_FUNCTIONS.iter() { - context.deregister_udtf(name); - } + // Register the `first_value` UDAF since the `replace_distinct_aggregate` optimizer rule + // assumes that this UDAF is available in the function registry. + // This is a hidden assumption made by the optimizer rule. + // We have to do so because we do not add default features (including built-in functions) + // to the session state. 
+ // + // See also: https://github.com/apache/datafusion/issues/10703 + context + .state_ref() + .write() + .register_udaf(first_value_udaf())?; Ok(context) } @@ -118,7 +117,8 @@ impl ServerSessionFactory { )?)) .with_extension(Arc::new(ActivityTracker::new())) .with_extension(Arc::new(JobService::new(job_runner))) - .with_extension(Arc::new(self.create_system_table_service(info)?)); + .with_extension(Arc::new(self.create_system_table_service(info)?)) + .with_extension(Arc::new(DeltaTableCache::default())); self.apply_execution_config(&mut config); self.apply_execution_parquet_config(&mut config); let config = self.mutator.mutate_config(config, info)?; @@ -130,10 +130,11 @@ impl ServerSessionFactory { let runtime = self .runtime_env .create(|builder| self.mutator.mutate_runtime_env(builder, info))?; + // We do not add default features to the session state, + // since we manage table formats and functions ourselves. let builder = SessionStateBuilder::new() .with_config(config) .with_runtime_env(runtime) - .with_default_features() .with_analyzer_rules(default_analyzer_rules()) .with_optimizer_rules(default_optimizer_rules()) .with_physical_optimizer_rules(get_physical_optimizers(PhysicalOptimizerOptions { diff --git a/crates/sail-session/src/session_factory/worker.rs b/crates/sail-session/src/session_factory/worker.rs index 9596383912..5024e43d49 100644 --- a/crates/sail-session/src/session_factory/worker.rs +++ b/crates/sail-session/src/session_factory/worker.rs @@ -5,6 +5,7 @@ use datafusion::execution::SessionStateBuilder; use datafusion::prelude::{SessionConfig, SessionContext}; use sail_common::config::AppConfig; use sail_common::runtime::RuntimeHandle; +use sail_delta_lake::session_extension::DeltaTableCache; use crate::runtime::RuntimeEnvFactory; use crate::session_factory::SessionFactory; @@ -23,7 +24,10 @@ impl WorkerSessionFactory { impl SessionFactory<()> for WorkerSessionFactory { fn create(&mut self, _info: ()) -> Result { let runtime = 
self.runtime_env.create(Ok)?; - let config = SessionConfig::default(); + // We still add default features for the worker session + // since we need built-in functions to be available for the codec + // when decoding the execution plan. + let config = SessionConfig::default().with_extension(Arc::new(DeltaTableCache::default())); let state = SessionStateBuilder::new() .with_config(config) .with_runtime_env(runtime) diff --git a/crates/sail-session/src/session_manager/actor/handler.rs b/crates/sail-session/src/session_manager/actor/handler.rs index 7d331f3535..900bff95d1 100644 --- a/crates/sail-session/src/session_manager/actor/handler.rs +++ b/crates/sail-session/src/session_manager/actor/handler.rs @@ -8,7 +8,7 @@ use log::{info, warn}; use sail_common_datafusion::extension::SessionExtensionAccessor; use sail_common_datafusion::session::activity::ActivityTracker; use sail_common_datafusion::session::job::JobService; -use sail_common_datafusion::system::catalog::SessionRow; +use sail_common_datafusion::system::catalog::{OptionRow, SessionRow}; use sail_common_datafusion::system::observable::{JobRunnerObserver, SessionManagerObserver}; use sail_common_datafusion::system::predicate::PredicateExt; use sail_server::actor::{ActorAction, ActorContext}; @@ -300,6 +300,23 @@ impl SessionManagerActor { let _ = result.send(task.fetch(fetch).collect().await); }); } + SessionManagerObserver::Options { key, fetch, result } => { + let rows = self + .options + .options + .iter() + .predicate_filter_map( + key, + |(key, _)| key, + |(key, value)| OptionRow { + key: key.clone(), + value: value.clone(), + }, + ) + .fetch(fetch) + .collect::, _>>(); + let _ = result.send(rows); + } } ActorAction::Continue } diff --git a/crates/sail-session/src/session_manager/options.rs b/crates/sail-session/src/session_manager/options.rs index e63836d4fe..099f09db26 100644 --- a/crates/sail-session/src/session_manager/options.rs +++ b/crates/sail-session/src/session_manager/options.rs @@ -12,6 +12,9 
@@ pub struct SessionManagerOptions { pub runtime: RuntimeHandle, pub system: Arc>, pub factory: Box Box> + Send>, + /// The application configuration options as key-value pairs, + /// used to populate the `system.session.options` table. + pub options: Vec<(String, String)>, } impl SessionManagerOptions { @@ -25,6 +28,7 @@ impl SessionManagerOptions { runtime, system, factory, + options: Vec::new(), } } @@ -32,4 +36,9 @@ impl SessionManagerOptions { self.session_timeout = timeout; self } + + pub fn with_options(mut self, options: Vec<(String, String)>) -> Self { + self.options = options; + self + } } diff --git a/crates/sail-spark-connect/src/error.rs b/crates/sail-spark-connect/src/error.rs index 5134ff1e81..99a0d6f821 100644 --- a/crates/sail-spark-connect/src/error.rs +++ b/crates/sail-spark-connect/src/error.rs @@ -293,7 +293,24 @@ impl From for Status { // The original Spark Connect server implementation uses the "INTERNAL" status code // for all Spark exceptions, so we do the same here. // Reference: org.apache.spark.sql.connect.utils.ErrorUtils#buildStatusFromThrowable - Status::with_error_details(Code::Internal, throwable.message(), details) + // + // Truncate the message to avoid exceeding the gRPC trailing metadata size limit (~8KB). + // The message appears both in `grpc-message` (percent-encoded) and + // `grpc-status-details-bin` (base64-encoded), so the effective budget per message + // is roughly a quarter of the limit after encoding overhead. 
+ const TRUNCATION_SUFFIX: &str = " (truncated)"; + const MAX_ERROR_MESSAGE_LEN: usize = 1024; + let message = throwable.message(); + let message = if message.len() > MAX_ERROR_MESSAGE_LEN { + let available = MAX_ERROR_MESSAGE_LEN.saturating_sub(TRUNCATION_SUFFIX.len()); + let end = message.floor_char_boundary(available); + let mut truncated = String::from(&message[..end]); + truncated.push_str(TRUNCATION_SUFFIX); + truncated + } else { + message.to_string() + }; + Status::with_error_details(Code::Internal, &message, details) } } diff --git a/crates/sail-spark-connect/src/proto/data_type.rs b/crates/sail-spark-connect/src/proto/data_type.rs index d2f6852a8d..d724c8aad6 100644 --- a/crates/sail-spark-connect/src/proto/data_type.rs +++ b/crates/sail-spark-connect/src/proto/data_type.rs @@ -174,17 +174,26 @@ impl TryFrom for spec::DataType { }) } Kind::DayTimeInterval(sdt::DayTimeInterval { - // FIXME: Currently `start_field` and `end_field` are lost in translation. - // This does not impact computation accuracy. - // This may affect the display string in the `data_type_to_simple_string` function. - start_field: _, - end_field: _, + start_field, + end_field, type_variation_reference: _, }) => { - // Spark's DayTimeInterval has microsecond precision. - // Arrow's IntervalUnit::DayTime has millisecond precision. - Ok(spec::DataType::Duration { - time_unit: spec::TimeUnit::Microsecond, + let start_field = start_field + .map(spec::DayTimeIntervalField::try_from) + .transpose()? + .map(spec::IntervalFieldType::try_from) + .transpose()?; + let end_field = end_field + .map(spec::DayTimeIntervalField::try_from) + .transpose()? 
+ .map(spec::IntervalFieldType::try_from) + .transpose()?; + let start_field = Some(start_field.unwrap_or(spec::IntervalFieldType::Day)); + let end_field = Some(end_field.unwrap_or(spec::IntervalFieldType::Second)); + Ok(spec::DataType::Interval { + interval_unit: spec::IntervalUnit::DayTime, + start_field, + end_field, }) } Kind::Array(array) => { diff --git a/crates/sail-spark-connect/src/proto/plan.rs b/crates/sail-spark-connect/src/proto/plan.rs index b625b21823..470924b28f 100644 --- a/crates/sail-spark-connect/src/proto/plan.rs +++ b/crates/sail-spark-connect/src/proto/plan.rs @@ -141,14 +141,14 @@ impl TryFrom for RelationNode { unparsed_identifier, options, } = x; - spec::ReadType::NamedTable(spec::ReadNamedTable { + spec::ReadType::NamedTable(Box::new(spec::ReadNamedTable { name: from_ast_object_name(parse_object_name( unparsed_identifier.as_str(), )?)?, temporal: None, sample: None, options: options.into_iter().collect(), - }) + })) } ReadType::DataSource(x) => { let DataSource { @@ -172,13 +172,13 @@ impl TryFrom for RelationNode { .into_iter() .map(|x| Ok(from_ast_expression(parse_expression(x.as_str())?)?)) .collect::>>()?; - spec::ReadType::DataSource(spec::ReadDataSource { + spec::ReadType::DataSource(Box::new(spec::ReadDataSource { format, schema, options: options.into_iter().collect(), paths, predicates, - }) + })) } }; Ok(RelationNode::Query(spec::QueryNode::Read { @@ -1578,7 +1578,14 @@ impl TryFrom for spec::Write { null_ordering: spec::NullOrdering::Unspecified, }) .collect(); - let partitioning_columns = partitioning_columns.into_iter().map(|x| x.into()).collect(); + let partitioning_columns = partitioning_columns + .into_iter() + .map(|name| spec::Expr::UnresolvedAttribute { + name: spec::ObjectName::bare(name), + plan_id: None, + is_metadata_column: false, + }) + .collect(); let clustering_columns = clustering_columns.into_iter().map(|x| x.into()).collect(); let bucket_by = match bucket_by { Some(x) => { diff --git 
a/crates/sail-spark-connect/src/session_manager.rs b/crates/sail-spark-connect/src/session_manager.rs index 25025f51f6..b94582dbb4 100644 --- a/crates/sail-spark-connect/src/session_manager.rs +++ b/crates/sail-spark-connect/src/session_manager.rs @@ -17,7 +17,7 @@ use sail_session::session_factory::{ }; use sail_session::session_manager::{SessionManager, SessionManagerOptions}; -use crate::error::SparkResult; +use crate::error::{SparkError, SparkResult}; use crate::session::{SparkSession, SparkSessionOptions}; pub struct SparkSessionMutator { @@ -91,6 +91,7 @@ pub fn create_spark_session_manager( }) }; let options = SessionManagerOptions::new(runtime.clone(), system, factory) - .with_session_timeout(Duration::from_secs(config.spark.session_timeout_secs)); + .with_session_timeout(Duration::from_secs(config.spark.session_timeout_secs)) + .with_options(config.raw().map_err(SparkError::from)?); Ok(SessionManager::try_new(options)?) } diff --git a/crates/sail-spark-connect/tests/gold_data/data_type.json b/crates/sail-spark-connect/tests/gold_data/data_type.json index a38dd94548..0f154a8775 100644 --- a/crates/sail-spark-connect/tests/gold_data/data_type.json +++ b/crates/sail-spark-connect/tests/gold_data/data_type.json @@ -518,8 +518,10 @@ "input": "interval day to second", "output": { "success": { - "duration": { - "timeUnit": "microsecond" + "interval": { + "intervalUnit": "dayTime", + "startField": "day", + "endField": "second" } } } diff --git a/crates/sail-spark-connect/tests/gold_data/function/agg.json b/crates/sail-spark-connect/tests/gold_data/function/agg.json index 9415b0afc9..3d2a6f65df 100644 --- a/crates/sail-spark-connect/tests/gold_data/function/agg.json +++ b/crates/sail-spark-connect/tests/gold_data/function/agg.json @@ -327,7 +327,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Interval(YearMonth), Decimal128(1, 1), Int32 
to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, 
Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 'approx_percentile_cont(Interval(YearMonth), Decimal128(1, 1), Int32)'. 
You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, 
Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, 
UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" + "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Interval(YearMonth), Decimal128(1, 1), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, 
Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float16, Float64]), Exact([Float16, Float64, Int8]), Exact([Float16, Float64, Int16]), Exact([Float16, Float64, Int32]), Exact([Float16, Float64, Int64]), Exact([Float16, Float64, UInt8]), Exact([Float16, Float64, UInt16]), Exact([Float16, Float64, UInt32]), Exact([Float16, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 
'approx_percentile_cont(Interval(YearMonth), Decimal128(1, 1), Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, 
Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float16, Float64)\n\tapprox_percentile_cont(Float16, Float64, Int8)\n\tapprox_percentile_cont(Float16, Float64, Int16)\n\tapprox_percentile_cont(Float16, Float64, Int32)\n\tapprox_percentile_cont(Float16, Float64, Int64)\n\tapprox_percentile_cont(Float16, Float64, UInt8)\n\tapprox_percentile_cont(Float16, Float64, 
UInt16)\n\tapprox_percentile_cont(Float16, Float64, UInt32)\n\tapprox_percentile_cont(Float16, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" } }, { @@ -353,7 +353,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Int32, List(Decimal128(1, 1)), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), 
Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, 
Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 'approx_percentile_cont(Int32, List(Decimal128(1, 1)), Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, 
Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, 
Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" + "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Int32, List(Decimal128(1, 1)), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), 
Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float16, Float64]), Exact([Float16, Float64, Int8]), Exact([Float16, Float64, Int16]), Exact([Float16, Float64, Int32]), Exact([Float16, Float64, Int64]), Exact([Float16, Float64, UInt8]), Exact([Float16, Float64, UInt16]), Exact([Float16, Float64, UInt32]), Exact([Float16, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, 
Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 'approx_percentile_cont(Int32, List(Decimal128(1, 1)), Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, 
UInt8)\n\tapprox_percentile_cont(Int64, Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float16, 
Float64)\n\tapprox_percentile_cont(Float16, Float64, Int8)\n\tapprox_percentile_cont(Float16, Float64, Int16)\n\tapprox_percentile_cont(Float16, Float64, Int32)\n\tapprox_percentile_cont(Float16, Float64, Int64)\n\tapprox_percentile_cont(Float16, Float64, UInt8)\n\tapprox_percentile_cont(Float16, Float64, UInt16)\n\tapprox_percentile_cont(Float16, Float64, UInt32)\n\tapprox_percentile_cont(Float16, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" } }, { @@ -379,7 +379,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Duration(µs), List(Decimal128(1, 1)), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), 
Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, 
Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 'approx_percentile_cont(Duration(µs), List(Decimal128(1, 1)), Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, 
UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, 
Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" + "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Duration(µs), List(Decimal128(1, 1)), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), 
Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float16, Float64]), Exact([Float16, Float64, Int8]), Exact([Float16, Float64, Int16]), Exact([Float16, Float64, Int32]), Exact([Float16, Float64, Int64]), Exact([Float16, Float64, UInt8]), Exact([Float16, Float64, UInt16]), Exact([Float16, Float64, UInt32]), Exact([Float16, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), 
Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 'approx_percentile_cont(Duration(µs), List(Decimal128(1, 1)), Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, 
UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, 
Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float16, Float64)\n\tapprox_percentile_cont(Float16, Float64, Int8)\n\tapprox_percentile_cont(Float16, Float64, Int16)\n\tapprox_percentile_cont(Float16, Float64, Int32)\n\tapprox_percentile_cont(Float16, Float64, Int64)\n\tapprox_percentile_cont(Float16, Float64, UInt8)\n\tapprox_percentile_cont(Float16, Float64, UInt16)\n\tapprox_percentile_cont(Float16, Float64, UInt32)\n\tapprox_percentile_cont(Float16, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" } }, { @@ -2337,7 +2337,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Interval(YearMonth), Decimal128(1, 1), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, 
Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, 
Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 'approx_percentile_cont(Interval(YearMonth), Decimal128(1, 1), Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, 
Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, 
Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" + "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Interval(YearMonth), Decimal128(1, 1), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, 
Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float16, Float64]), Exact([Float16, Float64, Int8]), 
Exact([Float16, Float64, Int16]), Exact([Float16, Float64, Int32]), Exact([Float16, Float64, Int64]), Exact([Float16, Float64, UInt8]), Exact([Float16, Float64, UInt16]), Exact([Float16, Float64, UInt32]), Exact([Float16, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 'approx_percentile_cont(Interval(YearMonth), Decimal128(1, 1), Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, 
Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, 
Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float16, Float64)\n\tapprox_percentile_cont(Float16, Float64, Int8)\n\tapprox_percentile_cont(Float16, Float64, Int16)\n\tapprox_percentile_cont(Float16, Float64, Int32)\n\tapprox_percentile_cont(Float16, Float64, Int64)\n\tapprox_percentile_cont(Float16, Float64, UInt8)\n\tapprox_percentile_cont(Float16, Float64, UInt16)\n\tapprox_percentile_cont(Float16, Float64, UInt32)\n\tapprox_percentile_cont(Float16, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" } }, { @@ -2363,7 +2363,7 @@ } }, "output": { - "failure": "error in 
DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Int32, List(Decimal128(1, 1)), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, 
Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 'approx_percentile_cont(Int32, List(Decimal128(1, 1)), Int32)'. 
You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, 
Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, 
UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" + "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Int32, List(Decimal128(1, 1)), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, 
Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float16, Float64]), Exact([Float16, Float64, Int8]), Exact([Float16, Float64, Int16]), Exact([Float16, Float64, Int32]), Exact([Float16, Float64, Int64]), Exact([Float16, Float64, UInt8]), Exact([Float16, Float64, UInt16]), Exact([Float16, Float64, UInt32]), Exact([Float16, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 
'approx_percentile_cont(Int32, List(Decimal128(1, 1)), Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, 
Int16)\n\tapprox_percentile_cont(UInt8, Float64, Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float16, Float64)\n\tapprox_percentile_cont(Float16, Float64, Int8)\n\tapprox_percentile_cont(Float16, Float64, Int16)\n\tapprox_percentile_cont(Float16, Float64, Int32)\n\tapprox_percentile_cont(Float16, Float64, Int64)\n\tapprox_percentile_cont(Float16, Float64, UInt8)\n\tapprox_percentile_cont(Float16, Float64, 
UInt16)\n\tapprox_percentile_cont(Float16, Float64, UInt32)\n\tapprox_percentile_cont(Float16, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" } }, { @@ -2389,7 +2389,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Duration(µs), List(Decimal128(1, 1)), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), 
Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), Exact([Float64, Float64, Int16]), Exact([Float64, Float64, 
Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 'approx_percentile_cont(Duration(µs), List(Decimal128(1, 1)), Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, 
Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, 
Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" + "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'approx_percentile_cont' function: coercion from Duration(µs), List(Decimal128(1, 1)), Int32 to the signature OneOf([Exact([Int8, Float64]), Exact([Int8, Float64, Int8]), Exact([Int8, Float64, Int16]), Exact([Int8, Float64, Int32]), Exact([Int8, Float64, Int64]), Exact([Int8, Float64, UInt8]), Exact([Int8, Float64, UInt16]), Exact([Int8, Float64, UInt32]), Exact([Int8, Float64, UInt64]), Exact([Int16, Float64]), Exact([Int16, Float64, Int8]), Exact([Int16, Float64, Int16]), Exact([Int16, Float64, Int32]), Exact([Int16, Float64, Int64]), Exact([Int16, Float64, UInt8]), Exact([Int16, Float64, UInt16]), Exact([Int16, Float64, UInt32]), Exact([Int16, Float64, UInt64]), Exact([Int32, Float64]), Exact([Int32, Float64, Int8]), Exact([Int32, Float64, Int16]), Exact([Int32, Float64, Int32]), Exact([Int32, Float64, Int64]), Exact([Int32, Float64, UInt8]), Exact([Int32, Float64, UInt16]), Exact([Int32, Float64, UInt32]), Exact([Int32, Float64, UInt64]), Exact([Int64, Float64]), Exact([Int64, Float64, Int8]), Exact([Int64, Float64, Int16]), Exact([Int64, Float64, 
Int32]), Exact([Int64, Float64, Int64]), Exact([Int64, Float64, UInt8]), Exact([Int64, Float64, UInt16]), Exact([Int64, Float64, UInt32]), Exact([Int64, Float64, UInt64]), Exact([UInt8, Float64]), Exact([UInt8, Float64, Int8]), Exact([UInt8, Float64, Int16]), Exact([UInt8, Float64, Int32]), Exact([UInt8, Float64, Int64]), Exact([UInt8, Float64, UInt8]), Exact([UInt8, Float64, UInt16]), Exact([UInt8, Float64, UInt32]), Exact([UInt8, Float64, UInt64]), Exact([UInt16, Float64]), Exact([UInt16, Float64, Int8]), Exact([UInt16, Float64, Int16]), Exact([UInt16, Float64, Int32]), Exact([UInt16, Float64, Int64]), Exact([UInt16, Float64, UInt8]), Exact([UInt16, Float64, UInt16]), Exact([UInt16, Float64, UInt32]), Exact([UInt16, Float64, UInt64]), Exact([UInt32, Float64]), Exact([UInt32, Float64, Int8]), Exact([UInt32, Float64, Int16]), Exact([UInt32, Float64, Int32]), Exact([UInt32, Float64, Int64]), Exact([UInt32, Float64, UInt8]), Exact([UInt32, Float64, UInt16]), Exact([UInt32, Float64, UInt32]), Exact([UInt32, Float64, UInt64]), Exact([UInt64, Float64]), Exact([UInt64, Float64, Int8]), Exact([UInt64, Float64, Int16]), Exact([UInt64, Float64, Int32]), Exact([UInt64, Float64, Int64]), Exact([UInt64, Float64, UInt8]), Exact([UInt64, Float64, UInt16]), Exact([UInt64, Float64, UInt32]), Exact([UInt64, Float64, UInt64]), Exact([Float16, Float64]), Exact([Float16, Float64, Int8]), Exact([Float16, Float64, Int16]), Exact([Float16, Float64, Int32]), Exact([Float16, Float64, Int64]), Exact([Float16, Float64, UInt8]), Exact([Float16, Float64, UInt16]), Exact([Float16, Float64, UInt32]), Exact([Float16, Float64, UInt64]), Exact([Float32, Float64]), Exact([Float32, Float64, Int8]), Exact([Float32, Float64, Int16]), Exact([Float32, Float64, Int32]), Exact([Float32, Float64, Int64]), Exact([Float32, Float64, UInt8]), Exact([Float32, Float64, UInt16]), Exact([Float32, Float64, UInt32]), Exact([Float32, Float64, UInt64]), Exact([Float64, Float64]), Exact([Float64, Float64, Int8]), 
Exact([Float64, Float64, Int16]), Exact([Float64, Float64, Int32]), Exact([Float64, Float64, Int64]), Exact([Float64, Float64, UInt8]), Exact([Float64, Float64, UInt16]), Exact([Float64, Float64, UInt32]), Exact([Float64, Float64, UInt64])]) failed No function matches the given name and argument types 'approx_percentile_cont(Duration(µs), List(Decimal128(1, 1)), Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tapprox_percentile_cont(Int8, Float64)\n\tapprox_percentile_cont(Int8, Float64, Int8)\n\tapprox_percentile_cont(Int8, Float64, Int16)\n\tapprox_percentile_cont(Int8, Float64, Int32)\n\tapprox_percentile_cont(Int8, Float64, Int64)\n\tapprox_percentile_cont(Int8, Float64, UInt8)\n\tapprox_percentile_cont(Int8, Float64, UInt16)\n\tapprox_percentile_cont(Int8, Float64, UInt32)\n\tapprox_percentile_cont(Int8, Float64, UInt64)\n\tapprox_percentile_cont(Int16, Float64)\n\tapprox_percentile_cont(Int16, Float64, Int8)\n\tapprox_percentile_cont(Int16, Float64, Int16)\n\tapprox_percentile_cont(Int16, Float64, Int32)\n\tapprox_percentile_cont(Int16, Float64, Int64)\n\tapprox_percentile_cont(Int16, Float64, UInt8)\n\tapprox_percentile_cont(Int16, Float64, UInt16)\n\tapprox_percentile_cont(Int16, Float64, UInt32)\n\tapprox_percentile_cont(Int16, Float64, UInt64)\n\tapprox_percentile_cont(Int32, Float64)\n\tapprox_percentile_cont(Int32, Float64, Int8)\n\tapprox_percentile_cont(Int32, Float64, Int16)\n\tapprox_percentile_cont(Int32, Float64, Int32)\n\tapprox_percentile_cont(Int32, Float64, Int64)\n\tapprox_percentile_cont(Int32, Float64, UInt8)\n\tapprox_percentile_cont(Int32, Float64, UInt16)\n\tapprox_percentile_cont(Int32, Float64, UInt32)\n\tapprox_percentile_cont(Int32, Float64, UInt64)\n\tapprox_percentile_cont(Int64, Float64)\n\tapprox_percentile_cont(Int64, Float64, Int8)\n\tapprox_percentile_cont(Int64, Float64, Int16)\n\tapprox_percentile_cont(Int64, Float64, Int32)\n\tapprox_percentile_cont(Int64, Float64, 
Int64)\n\tapprox_percentile_cont(Int64, Float64, UInt8)\n\tapprox_percentile_cont(Int64, Float64, UInt16)\n\tapprox_percentile_cont(Int64, Float64, UInt32)\n\tapprox_percentile_cont(Int64, Float64, UInt64)\n\tapprox_percentile_cont(UInt8, Float64)\n\tapprox_percentile_cont(UInt8, Float64, Int8)\n\tapprox_percentile_cont(UInt8, Float64, Int16)\n\tapprox_percentile_cont(UInt8, Float64, Int32)\n\tapprox_percentile_cont(UInt8, Float64, Int64)\n\tapprox_percentile_cont(UInt8, Float64, UInt8)\n\tapprox_percentile_cont(UInt8, Float64, UInt16)\n\tapprox_percentile_cont(UInt8, Float64, UInt32)\n\tapprox_percentile_cont(UInt8, Float64, UInt64)\n\tapprox_percentile_cont(UInt16, Float64)\n\tapprox_percentile_cont(UInt16, Float64, Int8)\n\tapprox_percentile_cont(UInt16, Float64, Int16)\n\tapprox_percentile_cont(UInt16, Float64, Int32)\n\tapprox_percentile_cont(UInt16, Float64, Int64)\n\tapprox_percentile_cont(UInt16, Float64, UInt8)\n\tapprox_percentile_cont(UInt16, Float64, UInt16)\n\tapprox_percentile_cont(UInt16, Float64, UInt32)\n\tapprox_percentile_cont(UInt16, Float64, UInt64)\n\tapprox_percentile_cont(UInt32, Float64)\n\tapprox_percentile_cont(UInt32, Float64, Int8)\n\tapprox_percentile_cont(UInt32, Float64, Int16)\n\tapprox_percentile_cont(UInt32, Float64, Int32)\n\tapprox_percentile_cont(UInt32, Float64, Int64)\n\tapprox_percentile_cont(UInt32, Float64, UInt8)\n\tapprox_percentile_cont(UInt32, Float64, UInt16)\n\tapprox_percentile_cont(UInt32, Float64, UInt32)\n\tapprox_percentile_cont(UInt32, Float64, UInt64)\n\tapprox_percentile_cont(UInt64, Float64)\n\tapprox_percentile_cont(UInt64, Float64, Int8)\n\tapprox_percentile_cont(UInt64, Float64, Int16)\n\tapprox_percentile_cont(UInt64, Float64, Int32)\n\tapprox_percentile_cont(UInt64, Float64, Int64)\n\tapprox_percentile_cont(UInt64, Float64, UInt8)\n\tapprox_percentile_cont(UInt64, Float64, UInt16)\n\tapprox_percentile_cont(UInt64, Float64, UInt32)\n\tapprox_percentile_cont(UInt64, Float64, 
UInt64)\n\tapprox_percentile_cont(Float16, Float64)\n\tapprox_percentile_cont(Float16, Float64, Int8)\n\tapprox_percentile_cont(Float16, Float64, Int16)\n\tapprox_percentile_cont(Float16, Float64, Int32)\n\tapprox_percentile_cont(Float16, Float64, Int64)\n\tapprox_percentile_cont(Float16, Float64, UInt8)\n\tapprox_percentile_cont(Float16, Float64, UInt16)\n\tapprox_percentile_cont(Float16, Float64, UInt32)\n\tapprox_percentile_cont(Float16, Float64, UInt64)\n\tapprox_percentile_cont(Float32, Float64)\n\tapprox_percentile_cont(Float32, Float64, Int8)\n\tapprox_percentile_cont(Float32, Float64, Int16)\n\tapprox_percentile_cont(Float32, Float64, Int32)\n\tapprox_percentile_cont(Float32, Float64, Int64)\n\tapprox_percentile_cont(Float32, Float64, UInt8)\n\tapprox_percentile_cont(Float32, Float64, UInt16)\n\tapprox_percentile_cont(Float32, Float64, UInt32)\n\tapprox_percentile_cont(Float32, Float64, UInt64)\n\tapprox_percentile_cont(Float64, Float64)\n\tapprox_percentile_cont(Float64, Float64, Int8)\n\tapprox_percentile_cont(Float64, Float64, Int16)\n\tapprox_percentile_cont(Float64, Float64, Int32)\n\tapprox_percentile_cont(Float64, Float64, Int64)\n\tapprox_percentile_cont(Float64, Float64, UInt8)\n\tapprox_percentile_cont(Float64, Float64, UInt16)\n\tapprox_percentile_cont(Float64, Float64, UInt32)\n\tapprox_percentile_cont(Float64, Float64, UInt64)" } }, { @@ -2433,7 +2433,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: Internal error: Expect TypeSignatureClass::Float but received NativeType::Interval(YearMonth), DataType: Interval(YearMonth).\nThis issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues No function matches the given name and argument types 'percentile_cont(Interval(YearMonth), Decimal128(2, 2))'. 
You might need to add explicit type casts.\n\tCandidate functions:\n\tpercentile_cont(expr: Coercion(TypeSignatureClass::Float, implicit_coercion=ImplicitCoercion([Numeric], default_type=Float64), percentile: Coercion(TypeSignatureClass::Native(LogicalType(Native(Float64), Float64)), implicit_coercion=ImplicitCoercion([Numeric], default_type=Float64))" + "failure": "error in DataFusion: Error during planning: Function 'percentile_cont' requires TypeSignatureClass::Float, but received Interval(YearMonth) (DataType: Interval(YearMonth)). No function matches the given name and argument types 'percentile_cont(Interval(YearMonth), Decimal128(2, 2))'. You might need to add explicit type casts.\n\tCandidate functions:\n\tpercentile_cont(expr: Coercion(TypeSignatureClass::Float, implicit_coercion=ImplicitCoercion([Numeric], default_type=Float64), percentile: Coercion(TypeSignatureClass::Native(LogicalType(Native(Float64), Float64)), implicit_coercion=ImplicitCoercion([Numeric], default_type=Float64))" } }, { @@ -2477,7 +2477,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'percentile_disc' function: coercion from Interval(YearMonth), Decimal128(2, 2) to the signature OneOf([Exact([Int8, Float64]), Exact([Int16, Float64]), Exact([Int32, Float64]), Exact([Int64, Float64]), Exact([UInt8, Float64]), Exact([UInt16, Float64]), Exact([UInt32, Float64]), Exact([UInt64, Float64]), Exact([Float32, Float64]), Exact([Float64, Float64])]) failed No function matches the given name and argument types 'percentile_disc(Interval(YearMonth), Decimal128(2, 2))'. 
You might need to add explicit type casts.\n\tCandidate functions:\n\tpercentile_disc(Int8, Float64)\n\tpercentile_disc(Int16, Float64)\n\tpercentile_disc(Int32, Float64)\n\tpercentile_disc(Int64, Float64)\n\tpercentile_disc(UInt8, Float64)\n\tpercentile_disc(UInt16, Float64)\n\tpercentile_disc(UInt32, Float64)\n\tpercentile_disc(UInt64, Float64)\n\tpercentile_disc(Float32, Float64)\n\tpercentile_disc(Float64, Float64)" + "failure": "error in DataFusion: Error during planning: Failed to coerce arguments to satisfy a call to 'percentile_disc' function: coercion from Interval(YearMonth), Decimal128(2, 2) to the signature OneOf([Exact([Int8, Float64]), Exact([Int16, Float64]), Exact([Int32, Float64]), Exact([Int64, Float64]), Exact([UInt8, Float64]), Exact([UInt16, Float64]), Exact([UInt32, Float64]), Exact([UInt64, Float64]), Exact([Float16, Float64]), Exact([Float32, Float64]), Exact([Float64, Float64])]) failed No function matches the given name and argument types 'percentile_disc(Interval(YearMonth), Decimal128(2, 2))'. 
You might need to add explicit type casts.\n\tCandidate functions:\n\tpercentile_disc(Int8, Float64)\n\tpercentile_disc(Int16, Float64)\n\tpercentile_disc(Int32, Float64)\n\tpercentile_disc(Int64, Float64)\n\tpercentile_disc(UInt8, Float64)\n\tpercentile_disc(UInt16, Float64)\n\tpercentile_disc(UInt32, Float64)\n\tpercentile_disc(UInt64, Float64)\n\tpercentile_disc(Float16, Float64)\n\tpercentile_disc(Float32, Float64)\n\tpercentile_disc(Float64, Float64)" } }, { diff --git a/crates/sail-spark-connect/tests/gold_data/function/datetime.json b/crates/sail-spark-connect/tests/gold_data/function/datetime.json index cf3240bc36..15e74b42f1 100644 --- a/crates/sail-spark-connect/tests/gold_data/function/datetime.json +++ b/crates/sail-spark-connect/tests/gold_data/function/datetime.json @@ -392,7 +392,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: The function 'current_time' expected zero argument but received 1 No function matches the given name and argument types 'current_time(Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tcurrent_time(NullAry())" + "failure": "error in DataFusion: Error during planning: zero values expected: [Literal(Int32(0), None)]" } }, { @@ -414,7 +414,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: The function 'current_time' expected zero argument but received 1 No function matches the given name and argument types 'current_time(Int32)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tcurrent_time(NullAry())" + "failure": "error in DataFusion: Error during planning: zero values expected: [BinaryExpr(BinaryExpr { left: Literal(Int32(1), None), op: Plus, right: Literal(Int32(1), None) })]" } }, { @@ -436,7 +436,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: The function 'current_time' expected zero argument but received 1 No function matches the given name and argument types 'current_time(Int32)'. 
You might need to add explicit type casts.\n\tCandidate functions:\n\tcurrent_time(NullAry())" + "failure": "error in DataFusion: Error during planning: zero values expected: [Literal(Int32(3), None)]" } }, { @@ -1888,7 +1888,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: Internal error: Expect TypeSignatureClass::Native(LogicalType(Native(Int32), Int32)) but received NativeType::Decimal(5, 3), DataType: Decimal128(5, 3).\nThis issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues No function matches the given name and argument types 'make_time(Int32, Int32, Decimal128(5, 3))'. You might need to add explicit type casts.\n\tCandidate functions:\n\tmake_time(Coercion(TypeSignatureClass::Native(LogicalType(Native(Int32), Int32)), implicit_coercion=ImplicitCoercion([Integer, Native(LogicalType(Native(String), String))], default_type=Int32), Coercion(TypeSignatureClass::Native(LogicalType(Native(Int32), Int32)), implicit_coercion=ImplicitCoercion([Integer, Native(LogicalType(Native(String), String))], default_type=Int32), Coercion(TypeSignatureClass::Native(LogicalType(Native(Int32), Int32)), implicit_coercion=ImplicitCoercion([Integer, Native(LogicalType(Native(String), String))], default_type=Int32))" + "success": "ok" } }, { @@ -2592,7 +2592,7 @@ } }, "output": { - "failure": "not supported: unknown function: time_diff" + "success": "ok" } }, { @@ -2614,7 +2614,7 @@ } }, "output": { - "failure": "not supported: unknown function: time_diff" + "success": "ok" } }, { @@ -2636,7 +2636,7 @@ } }, "output": { - "failure": "not supported: unknown function: time_diff" + "success": "ok" } }, { @@ -2658,7 +2658,7 @@ } }, "output": { - "failure": "not supported: unknown function: time_trunc" + "success": "ok" } }, { @@ -2680,7 +2680,7 @@ } }, "output": { - "failure": "not supported: unknown function: time_trunc" + "success": 
"ok" } }, { diff --git a/crates/sail-spark-connect/tests/gold_data/function/st.json b/crates/sail-spark-connect/tests/gold_data/function/st.json index cebed977c0..5b098d42ff 100644 --- a/crates/sail-spark-connect/tests/gold_data/function/st.json +++ b/crates/sail-spark-connect/tests/gold_data/function/st.json @@ -19,7 +19,7 @@ } }, "output": { - "failure": "not supported: unknown function: st_geogfromwkb" + "success": "ok" } }, { @@ -41,7 +41,7 @@ } }, "output": { - "failure": "not supported: unknown function: st_geomfromwkb" + "success": "ok" } }, { @@ -85,7 +85,7 @@ } }, "output": { - "failure": "not supported: unknown function: st_geogfromwkb" + "failure": "not supported: unknown function: st_srid" } }, { @@ -107,7 +107,7 @@ } }, "output": { - "failure": "not supported: unknown function: st_geomfromwkb" + "failure": "not supported: unknown function: st_srid" } }, { @@ -129,7 +129,7 @@ } }, "output": { - "failure": "not supported: unknown function: ST_GeogFromWKB" + "failure": "not supported: unknown function: st_setsrid" } }, { @@ -151,7 +151,7 @@ } }, "output": { - "failure": "not supported: unknown function: ST_GeomFromWKB" + "failure": "not supported: unknown function: st_setsrid" } } ] diff --git a/crates/sail-spark-connect/tests/gold_data/function/string.json b/crates/sail-spark-connect/tests/gold_data/function/string.json index 45a9a68f28..6b0b51440c 100644 --- a/crates/sail-spark-connect/tests/gold_data/function/string.json +++ b/crates/sail-spark-connect/tests/gold_data/function/string.json @@ -283,7 +283,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: Internal error: Function 'btrim' failed to match any signature, errors: Error during planning: Function 'btrim' expects 2 arguments but received 1,Internal error: Expect TypeSignatureClass::Native(LogicalType(Native(String), String)) but received NativeType::Binary, DataType: Binary.\nThis issue was likely caused by a bug in DataFusion's code. 
Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues.\nThis issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues No function matches the given name and argument types 'btrim(Binary)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tbtrim(Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))), Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))))\n\tbtrim(Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))))" + "failure": "error in DataFusion: Error during planning: Internal error: Function 'btrim' failed to match any signature, errors: Error during planning: Function 'btrim' expects 2 arguments but received 1,Error during planning: Function 'btrim' requires TypeSignatureClass::Native(LogicalType(Native(String), String)), but received Binary (DataType: Binary).\n\nHint: Binary types are not automatically coerced to String. Use CAST(column AS VARCHAR) to convert Binary data to String..\nThis issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues No function matches the given name and argument types 'btrim(Binary)'. 
You might need to add explicit type casts.\n\tCandidate functions:\n\tbtrim(Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))), Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))))\n\tbtrim(Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))))" } }, { @@ -305,7 +305,7 @@ } }, "output": { - "failure": "error in DataFusion: Error during planning: Internal error: Function 'btrim' failed to match any signature, errors: Internal error: Expect TypeSignatureClass::Native(LogicalType(Native(String), String)) but received NativeType::Binary, DataType: Binary.\nThis issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues,Error during planning: Function 'btrim' expects 1 arguments but received 2.\nThis issue was likely caused by a bug in DataFusion's code. Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues No function matches the given name and argument types 'btrim(Binary, Binary)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tbtrim(Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))), Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))))\n\tbtrim(Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))))" + "failure": "error in DataFusion: Error during planning: Internal error: Function 'btrim' failed to match any signature, errors: Error during planning: Function 'btrim' requires TypeSignatureClass::Native(LogicalType(Native(String), String)), but received Binary (DataType: Binary).\n\nHint: Binary types are not automatically coerced to String. Use CAST(column AS VARCHAR) to convert Binary data to String.,Error during planning: Function 'btrim' expects 1 arguments but received 2.\nThis issue was likely caused by a bug in DataFusion's code. 
Please help us to resolve this by filing a bug report in our issue tracker: https://github.com/apache/datafusion/issues No function matches the given name and argument types 'btrim(Binary, Binary)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tbtrim(Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))), Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))))\n\tbtrim(Coercion(TypeSignatureClass::Native(LogicalType(Native(String), String))))" } }, { diff --git a/crates/sail-spark-connect/tests/gold_data/plan/ddl_create_table.json b/crates/sail-spark-connect/tests/gold_data/plan/ddl_create_table.json index 6e3c712391..f2572a6429 100644 --- a/crates/sail-spark-connect/tests/gold_data/plan/ddl_create_table.json +++ b/crates/sail-spark-connect/tests/gold_data/plan/ddl_create_table.json @@ -157,7 +157,249 @@ { "input": "\nCREATE TABLE my_tab (a INT, b STRING, ts TIMESTAMP) USING parquet\nPARTITIONED BY (\n a,\n bucket(16, b),\n years(ts),\n months(ts),\n days(ts),\n hours(ts),\n foo(a, \"bar\", 34))\n ", "output": { - "failure": "invalid argument: found ( at 101:102 expected ':', data type, ',', or ')'" + "success": { + "command": { + "createTable": { + "table": [ + "my_tab" + ], + "columns": [ + { + "name": "a", + "dataType": "int32", + "nullable": true, + "comment": null, + "default": null, + "generatedAlwaysAs": null + }, + { + "name": "b", + "dataType": { + "configuredUtf8": { + "utf8Type": "configured" + } + }, + "nullable": true, + "comment": null, + "default": null, + "generatedAlwaysAs": null + }, + { + "name": "ts", + "dataType": { + "timestamp": { + "timeUnit": "microsecond", + "timestampType": "configured" + } + }, + "nullable": true, + "comment": null, + "default": null, + "generatedAlwaysAs": null + } + ], + "comment": null, + "constraints": [], + "location": null, + "fileFormat": { + "general": { + "format": "parquet" + } + }, + "rowFormat": null, + "partitionBy": [ + { + "unresolvedAttribute": 
{ + "name": [ + "a" + ], + "planId": null, + "isMetadataColumn": false + } + }, + { + "unresolvedFunction": { + "functionName": [ + "bucket" + ], + "arguments": [ + { + "literal": { + "int32": { + "value": 16 + } + } + }, + { + "unresolvedAttribute": { + "name": [ + "b" + ], + "planId": null, + "isMetadataColumn": false + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + }, + { + "unresolvedFunction": { + "functionName": [ + "years" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "ts" + ], + "planId": null, + "isMetadataColumn": false + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + }, + { + "unresolvedFunction": { + "functionName": [ + "months" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "ts" + ], + "planId": null, + "isMetadataColumn": false + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + }, + { + "unresolvedFunction": { + "functionName": [ + "days" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "ts" + ], + "planId": null, + "isMetadataColumn": false + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + }, + { + "unresolvedFunction": { + "functionName": [ + "hours" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "ts" + ], + "planId": null, + "isMetadataColumn": false + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + }, + { + 
"unresolvedFunction": { + "functionName": [ + "foo" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "a" + ], + "planId": null, + "isMetadataColumn": false + } + }, + { + "literal": { + "utf8": { + "value": "bar" + } + } + }, + { + "literal": { + "int32": { + "value": 34 + } + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + } + ], + "sortBy": [], + "bucketBy": null, + "clusterBy": [], + "ifNotExists": false, + "replace": false, + "options": [], + "properties": [] + }, + "planId": null + } + } } }, { @@ -530,7 +772,15 @@ }, "rowFormat": null, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -580,7 +830,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -634,7 +892,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -689,7 +955,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -733,7 +1007,15 @@ }, "rowFormat": null, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -787,7 +1069,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -841,7 +1131,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": 
null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -894,7 +1192,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -948,7 +1254,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -988,7 +1302,15 @@ "fileFormat": null, "rowFormat": null, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -1039,7 +1361,15 @@ }, "rowFormat": null, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -1099,8 +1429,24 @@ "fileFormat": null, "rowFormat": null, "partitionBy": [ - "p1", - "p2" + { + "unresolvedAttribute": { + "name": [ + "p1" + ], + "planId": null, + "isMetadataColumn": false + } + }, + { + "unresolvedAttribute": { + "name": [ + "p2" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -1119,7 +1465,93 @@ "input": "CREATE TABLE my_tab (id bigint, p1 string) PARTITIONED BY (p2 string, truncate(p1, 16))", "exception": "\nOperation not allowed: PARTITION BY: Cannot mix partition expressions and partition columns:\nExpressions: truncate(p1, 16)\nColumns: p2 string.\n== SQL (line 1, position 1) ==\nCREATE TABLE my_tab (id bigint, p1 string) PARTITIONED BY (p2 string, truncate(p1, 16))\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", "output": { - "failure": "invalid argument: found ( at 78:79 expected ':', data type, ',', or ')'" + "success": { + "command": { + "createTable": { + "table": [ + "my_tab" + ], + "columns": [ + { + "name": "id", + "dataType": 
"int64", + "nullable": true, + "comment": null, + "default": null, + "generatedAlwaysAs": null + }, + { + "name": "p1", + "dataType": { + "configuredUtf8": { + "utf8Type": "configured" + } + }, + "nullable": true, + "comment": null, + "default": null, + "generatedAlwaysAs": null + } + ], + "comment": null, + "constraints": [], + "location": null, + "fileFormat": null, + "rowFormat": null, + "partitionBy": [ + { + "unresolvedAttribute": { + "name": [ + "p2" + ], + "planId": null, + "isMetadataColumn": false + } + }, + { + "unresolvedFunction": { + "functionName": [ + "truncate" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "p1" + ], + "planId": null, + "isMetadataColumn": false + } + }, + { + "literal": { + "int32": { + "value": 16 + } + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + } + ], + "sortBy": [], + "bucketBy": null, + "clusterBy": [], + "ifNotExists": false, + "replace": false, + "options": [], + "properties": [] + }, + "planId": null + } + } } }, { @@ -1269,7 +1701,15 @@ "fileFormat": null, "rowFormat": null, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -1379,7 +1819,15 @@ }, "rowFormat": null, "partitionBy": [ - "a" + { + "unresolvedAttribute": { + "name": [ + "a" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -1761,7 +2209,15 @@ }, "rowFormat": null, "partitionBy": [ - "a" + { + "unresolvedAttribute": { + "name": [ + "a" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, diff --git a/crates/sail-spark-connect/tests/gold_data/plan/ddl_replace_table.json b/crates/sail-spark-connect/tests/gold_data/plan/ddl_replace_table.json index 6862d672f3..3bfb766603 100644 --- 
a/crates/sail-spark-connect/tests/gold_data/plan/ddl_replace_table.json +++ b/crates/sail-spark-connect/tests/gold_data/plan/ddl_replace_table.json @@ -3,7 +3,249 @@ { "input": "\nREPLACE TABLE my_tab (a INT, b STRING, ts TIMESTAMP) USING parquet\nPARTITIONED BY (\n a,\n bucket(16, b),\n years(ts),\n months(ts),\n days(ts),\n hours(ts),\n foo(a, \"bar\", 34))\n ", "output": { - "failure": "invalid argument: found ( at 102:103 expected ':', data type, ',', or ')'" + "success": { + "command": { + "createTable": { + "table": [ + "my_tab" + ], + "columns": [ + { + "name": "a", + "dataType": "int32", + "nullable": true, + "comment": null, + "default": null, + "generatedAlwaysAs": null + }, + { + "name": "b", + "dataType": { + "configuredUtf8": { + "utf8Type": "configured" + } + }, + "nullable": true, + "comment": null, + "default": null, + "generatedAlwaysAs": null + }, + { + "name": "ts", + "dataType": { + "timestamp": { + "timeUnit": "microsecond", + "timestampType": "configured" + } + }, + "nullable": true, + "comment": null, + "default": null, + "generatedAlwaysAs": null + } + ], + "comment": null, + "constraints": [], + "location": null, + "fileFormat": { + "general": { + "format": "parquet" + } + }, + "rowFormat": null, + "partitionBy": [ + { + "unresolvedAttribute": { + "name": [ + "a" + ], + "planId": null, + "isMetadataColumn": false + } + }, + { + "unresolvedFunction": { + "functionName": [ + "bucket" + ], + "arguments": [ + { + "literal": { + "int32": { + "value": 16 + } + } + }, + { + "unresolvedAttribute": { + "name": [ + "b" + ], + "planId": null, + "isMetadataColumn": false + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + }, + { + "unresolvedFunction": { + "functionName": [ + "years" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "ts" + ], + "planId": null, + "isMetadataColumn": false + } + } + ], + 
"namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + }, + { + "unresolvedFunction": { + "functionName": [ + "months" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "ts" + ], + "planId": null, + "isMetadataColumn": false + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + }, + { + "unresolvedFunction": { + "functionName": [ + "days" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "ts" + ], + "planId": null, + "isMetadataColumn": false + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + }, + { + "unresolvedFunction": { + "functionName": [ + "hours" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "ts" + ], + "planId": null, + "isMetadataColumn": false + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + }, + { + "unresolvedFunction": { + "functionName": [ + "foo" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "a" + ], + "planId": null, + "isMetadataColumn": false + } + }, + { + "literal": { + "utf8": { + "value": "bar" + } + } + }, + { + "literal": { + "int32": { + "value": 34 + } + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + } + ], + "sortBy": [], + "bucketBy": null, + "clusterBy": [], + "ifNotExists": false, + "replace": true, + "options": [], + "properties": [] + }, + "planId": null + } + } } }, { @@ -360,7 +602,15 @@ }, "rowFormat": null, "partitionBy": [ - "part" + 
{ + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -410,7 +660,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -464,7 +722,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -519,7 +785,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -563,7 +837,15 @@ }, "rowFormat": null, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -617,7 +899,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -671,7 +961,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -724,7 +1022,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -778,7 +1084,15 @@ } }, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -818,7 +1132,15 @@ "fileFormat": null, "rowFormat": null, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -869,7 +1191,15 
@@ }, "rowFormat": null, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -929,8 +1259,24 @@ "fileFormat": null, "rowFormat": null, "partitionBy": [ - "p1", - "p2" + { + "unresolvedAttribute": { + "name": [ + "p1" + ], + "planId": null, + "isMetadataColumn": false + } + }, + { + "unresolvedAttribute": { + "name": [ + "p2" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -949,7 +1295,93 @@ "input": "REPLACE TABLE my_tab (id bigint, p1 string) PARTITIONED BY (p2 string, truncate(p1, 16))", "exception": "\nOperation not allowed: PARTITION BY: Cannot mix partition expressions and partition columns:\nExpressions: truncate(p1, 16)\nColumns: p2 string.\n== SQL (line 1, position 1) ==\nREPLACE TABLE my_tab (id bigint, p1 string) PARTITIONED BY (p2 string, truncate(p1, 16))\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", "output": { - "failure": "invalid argument: found ( at 79:80 expected ':', data type, ',', or ')'" + "success": { + "command": { + "createTable": { + "table": [ + "my_tab" + ], + "columns": [ + { + "name": "id", + "dataType": "int64", + "nullable": true, + "comment": null, + "default": null, + "generatedAlwaysAs": null + }, + { + "name": "p1", + "dataType": { + "configuredUtf8": { + "utf8Type": "configured" + } + }, + "nullable": true, + "comment": null, + "default": null, + "generatedAlwaysAs": null + } + ], + "comment": null, + "constraints": [], + "location": null, + "fileFormat": null, + "rowFormat": null, + "partitionBy": [ + { + "unresolvedAttribute": { + "name": [ + "p2" + ], + "planId": null, + "isMetadataColumn": false + } + }, + { + "unresolvedFunction": { + "functionName": [ + "truncate" + ], + "arguments": [ + { + "unresolvedAttribute": { + "name": [ + "p1" + ], + "planId": null, + "isMetadataColumn": false + } + }, + { + 
"literal": { + "int32": { + "value": 16 + } + } + } + ], + "namedArguments": [], + "isDistinct": false, + "isUserDefinedFunction": false, + "isInternal": null, + "ignoreNulls": null, + "filter": null, + "orderBy": null + } + } + ], + "sortBy": [], + "bucketBy": null, + "clusterBy": [], + "ifNotExists": false, + "replace": true, + "options": [], + "properties": [] + }, + "planId": null + } + } } }, { @@ -1099,7 +1531,15 @@ "fileFormat": null, "rowFormat": null, "partitionBy": [ - "part" + { + "unresolvedAttribute": { + "name": [ + "part" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -1209,7 +1649,15 @@ }, "rowFormat": null, "partitionBy": [ - "a" + { + "unresolvedAttribute": { + "name": [ + "a" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, @@ -1440,7 +1888,15 @@ }, "rowFormat": null, "partitionBy": [ - "a" + { + "unresolvedAttribute": { + "name": [ + "a" + ], + "planId": null, + "isMetadataColumn": false + } + } ], "sortBy": [], "bucketBy": null, diff --git a/crates/sail-spark-connect/tests/gold_data/plan/error_misc.json b/crates/sail-spark-connect/tests/gold_data/plan/error_misc.json index a6ad7e6b15..00b2d676d0 100644 --- a/crates/sail-spark-connect/tests/gold_data/plan/error_misc.json +++ b/crates/sail-spark-connect/tests/gold_data/plan/error_misc.json @@ -18,7 +18,7 @@ "input": "CREATE DATABASE IF NOT EXISTS my-database", "exception": "\n[INVALID_IDENTIFIER] The unquoted identifier my-database is invalid and must be back quoted as: `my-database`.\nUnquoted identifiers can only contain ASCII letters ('a' - 'z', 'A' - 'Z'), digits ('0' - '9'), and underbar ('_').\nUnquoted identifiers must also not start with a digit.\nDifferent data sources and meta stores may impose additional restrictions on valid identifiers. 
SQLSTATE: 42602 (line 1, pos 32)\n\n== SQL ==\nCREATE DATABASE IF NOT EXISTS my-database\n--------------------------------^^^\n", "output": { - "failure": "invalid argument: found NOT at 19:22 expected '.', 'IF', 'COMMENT', 'LOCATION', 'WITH', ';', or end of input" + "failure": "invalid argument: found - at 32:33 expected '.', 'COMMENT', 'LOCATION', 'WITH', ';', or end of input" } }, { diff --git a/crates/sail-sql-analyzer/src/data_type.rs b/crates/sail-sql-analyzer/src/data_type.rs index c41fd17fbe..3be87ab915 100644 --- a/crates/sail-sql-analyzer/src/data_type.rs +++ b/crates/sail-sql-analyzer/src/data_type.rs @@ -219,11 +219,10 @@ pub fn from_ast_data_type(sql_type: DataType) -> SqlResult { "interval end field: {interval:?}" ))); } - // FIXME: Currently `start_field` and `end_field` are lost in translation. - // This does not impact computation accuracy. - // This may affect the display string in the `data_type_to_simple_string` function. - Ok(spec::DataType::Duration { - time_unit: spec::TimeUnit::Microsecond, + Ok(spec::DataType::Interval { + interval_unit: spec::IntervalUnit::DayTime, + start_field: Some(start), + end_field: end, }) } IntervalType::Default(_) => Ok(spec::DataType::Interval { diff --git a/crates/sail-sql-analyzer/src/expression.rs b/crates/sail-sql-analyzer/src/expression.rs index a395840ccf..2d8ed69a78 100644 --- a/crates/sail-sql-analyzer/src/expression.rs +++ b/crates/sail-sql-analyzer/src/expression.rs @@ -923,7 +923,9 @@ fn from_ast_atom_expression(atom: AtomExpr) -> SqlResult { .into_iter() .map(from_ast_order_by) .collect::>>()?; - let frame = window_frame.map(from_ast_window_frame).transpose()?; + let frame = window_frame + .map(|f| from_ast_window_frame(*f)) + .transpose()?; spec::Window::Unnamed { cluster_by, partition_by, @@ -958,6 +960,9 @@ fn from_ast_atom_expression(atom: AtomExpr) -> SqlResult { plan_id: None, is_metadata_column: false, }), + AtomExpr::IdentifierClause(_, _, expr, _) => Ok(spec::Expr::IdentifierClause { + expr: 
Box::new(from_ast_expression(*expr)?), + }), } } diff --git a/crates/sail-sql-analyzer/src/literal/interval.rs b/crates/sail-sql-analyzer/src/literal/interval.rs index 0b4b06d273..99e96bdd03 100644 --- a/crates/sail-sql-analyzer/src/literal/interval.rs +++ b/crates/sail-sql-analyzer/src/literal/interval.rs @@ -1,8 +1,7 @@ use std::iter::once; use std::str::FromStr; -use chrono; -use chrono::TimeDelta; +use chrono::{self, TimeDelta}; use lazy_static::lazy_static; use regex::Regex; use sail_common::spec; diff --git a/crates/sail-sql-analyzer/src/query.rs b/crates/sail-sql-analyzer/src/query.rs index 4b7d3ee6ad..e0921572c9 100644 --- a/crates/sail-sql-analyzer/src/query.rs +++ b/crates/sail-sql-analyzer/src/query.rs @@ -177,7 +177,7 @@ pub(crate) fn from_ast_query(query: Query) -> SqlResult { let limit = match limit { None => None, Some(LimitValue::All(_)) => None, - Some(LimitValue::Value(value)) => Some(value), + Some(LimitValue::Value(value)) => Some(*value), }; let plan = match (offset, limit) { @@ -212,7 +212,7 @@ pub(crate) fn from_ast_query(query: Query) -> SqlResult { fn from_ast_query_term(term: QueryTerm) -> SqlResult { match term { - QueryTerm::Select(select) => from_ast_query_select(select), + QueryTerm::Select(select) => from_ast_query_select(*select), QueryTerm::Table(_, name) => from_ast_query_table(name), QueryTerm::Values(values) => from_ast_values(values), QueryTerm::Nested(_, query, _) => from_ast_query(query), @@ -320,7 +320,7 @@ fn from_ast_query_select(select: QuerySelect) -> SqlResult { fn from_ast_query_body(body: QueryBody) -> SqlResult { match body { - QueryBody::Term(term) => from_ast_query_term(term), + QueryBody::Term(term) => from_ast_query_term(*term), QueryBody::SetOperation { left, operator, @@ -358,12 +358,12 @@ fn from_ast_query_body(body: QueryBody) -> SqlResult { fn from_ast_query_table(name: ObjectName) -> SqlResult { Ok(spec::QueryPlan::new(spec::QueryNode::Read { is_streaming: false, - read_type: 
spec::ReadType::NamedTable(spec::ReadNamedTable { + read_type: spec::ReadType::NamedTable(Box::new(spec::ReadNamedTable { name: from_ast_object_name(name)?, temporal: None, sample: None, options: Default::default(), - }), + })), })) } @@ -436,16 +436,16 @@ fn from_ast_table_factor(table: TableFactor) -> SqlResult { modifiers, alias, } => { - let temporal = temporal.map(from_ast_temporal).transpose()?; - let sample = sample.map(from_ast_table_sample).transpose()?; + let temporal = temporal.map(|t| from_ast_temporal(*t)).transpose()?; + let sample = sample.map(|s| from_ast_table_sample(*s)).transpose()?; let plan = spec::QueryPlan::new(spec::QueryNode::Read { is_streaming: false, - read_type: spec::ReadType::NamedTable(spec::ReadNamedTable { + read_type: spec::ReadType::NamedTable(Box::new(spec::ReadNamedTable { name: from_ast_object_name(name)?, temporal, sample, options: Default::default(), - }), + })), }); let plan = query_plan_with_table_modifiers(plan, modifiers)?; query_plan_with_table_alias(plan, alias) @@ -460,7 +460,7 @@ fn from_ast_table_factor(table: TableFactor) -> SqlResult { } => { let plan = from_ast_query(query)?; let plan = if let Some(sample) = sample { - let sample = from_ast_table_sample(sample)?; + let sample = from_ast_table_sample(*sample)?; spec::QueryPlan::new(spec::QueryNode::TableSample { input: Box::new(plan), sample, @@ -495,12 +495,12 @@ fn from_ast_table_factor(table: TableFactor) -> SqlResult { .unwrap_or_default(); let plan = spec::QueryPlan::new(spec::QueryNode::Read { is_streaming: false, - read_type: spec::ReadType::Udtf(spec::ReadUdtf { + read_type: spec::ReadType::Udtf(Box::new(spec::ReadUdtf { name: from_ast_object_name(name)?, arguments, named_arguments, options: Default::default(), - }), + })), }); query_plan_with_table_alias(plan, alias) } @@ -508,6 +508,25 @@ fn from_ast_table_factor(table: TableFactor) -> SqlResult { let plan = from_ast_values(values)?; query_plan_with_table_alias(plan, alias) } + TableFactor::Identifier { 
+ identifier: _, + left: _, + expr, + right: _, + modifiers, + alias, + } => { + let plan = spec::QueryPlan::new(spec::QueryNode::Read { + is_streaming: false, + read_type: spec::ReadType::DynamicTable(Box::new(spec::ReadDynamicTable { + name: from_ast_expression(expr)?, + sample: None, + options: Default::default(), + })), + }); + let plan = query_plan_with_table_modifiers(plan, modifiers)?; + query_plan_with_table_alias(plan, alias) + } } } diff --git a/crates/sail-sql-analyzer/src/statement.rs b/crates/sail-sql-analyzer/src/statement.rs index 2922c667c1..257c9d758c 100644 --- a/crates/sail-sql-analyzer/src/statement.rs +++ b/crates/sail-sql-analyzer/src/statement.rs @@ -10,14 +10,13 @@ use sail_sql_parser::ast::query::{IdentList, WhereClause}; use sail_sql_parser::ast::statement::{ AlterTableOperation, AlterViewOperation, AnalyzeTableModifier, AsQueryClause, Assignment, AssignmentList, ColumnAlteration, ColumnAlterationList, ColumnAlterationOption, - ColumnDefinition, ColumnDefinitionList, ColumnDefinitionOption, ColumnPosition, - ColumnTypeDefinition, CommentValue, CreateDatabaseClause, CreateTableClause, CreateViewClause, - DeleteTableAlias, DescribeItem, ExplainFormat, FileFormat, InsertDirectoryDestination, - MergeMatchClause, MergeMatchedAction, MergeNotMatchedBySourceAction, - MergeNotMatchedByTargetAction, MergeSource, PartitionClause, PartitionColumn, - PartitionColumnList, PartitionValue, PartitionValueList, PropertyKey, PropertyKeyValue, - PropertyList, PropertyValue, RowFormat, RowFormatDelimitedClause, SetClause, SortColumn, - SortColumnList, Statement, UpdateTableAlias, ViewColumn, + ColumnDefinition, ColumnDefinitionList, ColumnDefinitionOption, ColumnPosition, CommentValue, + CreateDatabaseClause, CreateTableClause, CreateViewClause, DeleteTableAlias, DescribeItem, + ExplainFormat, FileFormat, InsertDirectoryDestination, MergeMatchClause, MergeMatchedAction, + MergeNotMatchedBySourceAction, MergeNotMatchedByTargetAction, MergeSource, 
PartitionByItem, + PartitionByList, PartitionClause, PartitionValue, PartitionValueList, PropertyKey, + PropertyKeyValue, PropertyList, PropertyValue, RowFormat, RowFormatDelimitedClause, SetClause, + SortColumn, SortColumnList, Statement, UpdateTableAlias, ViewColumn, }; use sail_sql_parser::tree::TreeText; @@ -477,7 +476,10 @@ pub fn from_ast_statement(statement: Statement) -> SqlResult { options, } => { let options = options - .map(|(_, x)| from_ast_property_list(x)) + .map(|o| { + let (_, x) = *o; + from_ast_property_list(x) + }) .transpose()? .unwrap_or_default(); ( @@ -496,10 +498,16 @@ pub fn from_ast_statement(statement: Statement) -> SqlResult { } => { let path = from_ast_string(path)?; let file_format = stored_as - .map(|(_, _, x)| from_ast_file_format(x)) + .map(|s| { + let (_, _, x) = *s; + from_ast_file_format(x) + }) .transpose()?; let row_format = row_format - .map(|(_, _, x)| from_ast_row_format(x)) + .map(|r| { + let (_, _, x) = *r; + from_ast_row_format(x) + }) .transpose()?; (Some(path), file_format, row_format, vec![]) } @@ -583,10 +591,12 @@ pub fn from_ast_statement(statement: Statement) -> SqlResult { into: _, target, alias: target_alias, - using: (_, source), - on: (_, on_expr), + using, + on, r#match, } => { + let (_, source) = *using; + let (_, on_expr) = *on; if target_alias .as_ref() .is_some_and(|alias| alias.columns.is_some()) @@ -1143,10 +1153,14 @@ fn from_ast_table_definition( .into_iter() .flatten() .map(|x| match x { - PartitionColumn::Typed(ColumnTypeDefinition { name, .. 
}) => name.value.into(), - PartitionColumn::Name(x) => x.value.into(), + PartitionByItem::ColumnDefinition(column) => Ok(spec::Expr::UnresolvedAttribute { + name: spec::ObjectName::bare(column.name.value), + plan_id: None, + is_metadata_column: false, + }), + PartitionByItem::Expression(expr) => from_ast_expression(expr), }) - .collect(); + .collect::>>()?; let (sort_by, bucket_by) = if let Some(bucket_by) = bucket_by { let CreateTableBucketBy { columns, @@ -1451,7 +1465,7 @@ struct CreateTableBucketBy { #[derive(Default)] struct CreateTableClauses { - partition_by: Option>, + partition_by: Option>, bucket_by: Option, cluster_by: Option>, row_format: Option, @@ -1472,7 +1486,7 @@ impl TryFrom> for CreateTableClauses { CreateTableClause::PartitionedBy( _, _, - PartitionColumnList { + PartitionByList { left: _, columns, right: _, @@ -1831,8 +1845,12 @@ fn from_ast_alter_table_operation( } AlterTableOperation::AddPartitions { .. } => {} AlterTableOperation::DropPartition { .. } => {} - AlterTableOperation::SetTableProperties { .. } => {} - AlterTableOperation::UnsetTableProperties { .. } => {} + AlterTableOperation::SetTableProperties { .. } => { + // TODO: reuse Delta metadata property canonicalization and apply via metadata-only commit. + } + AlterTableOperation::UnsetTableProperties { .. } => { + // TODO: reuse Delta metadata property canonicalization and apply via metadata-only commit. + } AlterTableOperation::SetFileFormat { .. } => {} AlterTableOperation::SetLocation { .. } => {} AlterTableOperation::RecoverPartitions { .. 
} => {} diff --git a/crates/sail-sql-parser/src/ast/expression.rs b/crates/sail-sql-parser/src/ast/expression.rs index ec65718dc4..696110ae31 100644 --- a/crates/sail-sql-parser/src/ast/expression.rs +++ b/crates/sail-sql-parser/src/ast/expression.rs @@ -12,8 +12,8 @@ use crate::ast::identifier::{Ident, ObjectName, Variable}; use crate::ast::keywords::{ All, And, Any, As, Asc, Between, Both, By, Case, Cast, Cube, Current, CurrentDate, CurrentTimestamp, CurrentUser, Date, Day, Days, Desc, Distinct, Div, Else, End, Escape, Exists, - Extract, False, Filter, First, Following, For, From, Group, Grouping, Hour, Hours, Ignore, - Ilike, In, Interval, Is, Last, Leading, Like, Microsecond, Microseconds, Millisecond, + Extract, False, Filter, First, Following, For, From, Group, Grouping, Hour, Hours, Identifier, + Ignore, Ilike, In, Interval, Is, Last, Leading, Like, Microsecond, Microseconds, Millisecond, Milliseconds, Minute, Minutes, Month, Months, Not, Null, Nulls, Or, Order, Over, Overlay, Placing, Position, Preceding, Range, Regexp, Respect, Rlike, Rollup, Row, Rows, Second, Seconds, Sets, Similar, Struct, Substr, Substring, Table, Then, Time, Timestamp, TimestampLtz, @@ -217,6 +217,12 @@ pub enum AtomExpr { Option<(LeftParenthesis, RightParenthesis)>, ), CurrentDate(CurrentDate, Option<(LeftParenthesis, RightParenthesis)>), + IdentifierClause( + Identifier, + LeftParenthesis, + #[parser(function = |(e, _, _), _| boxed(e))] Box, + RightParenthesis, + ), Function(#[parser(function = |(e, _, _), o| boxed(compose(e, o)))] Box), Wildcard(operator::Asterisk), StringLiteral(StringLiteral, Vec), @@ -432,16 +438,14 @@ pub struct OverClause { #[derive(Debug, Clone, TreeParser, TreeSyntax, TreeText)] #[parser(dependency = "Expr")] -#[expect(clippy::large_enum_variant)] pub enum WindowSpec { Named(Ident), Unnamed { left: LeftParenthesis, #[parser(function = |e, o| compose(e, o))] modifiers: Vec, - // FIXME: Rust 1.87 triggers `clippy::large_enum_variant` warning - 
#[parser(function = |e, o| compose(e, o))] - window_frame: Option, + #[parser(function = |e, o| boxed(compose(e, o)).or_not())] + window_frame: Option>, right: RightParenthesis, }, } diff --git a/crates/sail-sql-parser/src/ast/query.rs b/crates/sail-sql-parser/src/ast/query.rs index 0faf1bc7ba..eccb033005 100644 --- a/crates/sail-sql-parser/src/ast/query.rs +++ b/crates/sail-sql-parser/src/ast/query.rs @@ -13,10 +13,11 @@ use crate::ast::expression::{ use crate::ast::identifier::{column_ident, object_name, table_ident, Ident, ObjectName}; use crate::ast::keywords::{ All, Anti, As, Bucket, By, Cluster, Cross, Cube, Distinct, Distribute, Except, Exclude, For, - From, Full, Group, Having, In, Include, Inner, Intersect, Join, Lateral, Left, Limit, Minus, - Name, Natural, Nulls, Of, Offset, On, Order, Out, Outer, Partition, Percent, Pivot, Recursive, - Repeatable, Right, Rollup, Rows, Select, Semi, Sort, SystemTime, SystemVersion, Table, - Tablesample, Timestamp, Union, Unpivot, Using, Values, Version, View, Where, Window, With, + From, Full, Group, Having, Identifier, In, Include, Inner, Intersect, Join, Lateral, Left, + Limit, Minus, Name, Natural, Nulls, Of, Offset, On, Order, Out, Outer, Partition, Percent, + Pivot, Recursive, Repeatable, Right, Rollup, Rows, Select, Semi, Sort, SystemTime, + SystemVersion, Table, Tablesample, Timestamp, Union, Unpivot, Using, Values, Version, View, + Where, Window, With, }; use crate::ast::literal::IntegerLiteral; use crate::ast::operator::{Comma, LeftParenthesis, RightParenthesis}; @@ -79,10 +80,8 @@ pub struct IdentList { } #[derive(Debug, Clone, TreeSyntax, TreeText)] -#[expect(clippy::large_enum_variant)] pub enum QueryBody { - // FIXME: Rust 1.87 triggers `clippy::large_enum_variant` warning - Term(QueryTerm), + Term(Box), SetOperation { left: Box, operator: SetOperator, @@ -106,7 +105,8 @@ where options: &'a ParserOptions, ) -> impl Parser<'a, I, Self, E> + Clone { let quantifier = SetQuantifier::parser((), options).or_not(); 
- let term = QueryTerm::parser((query, expr, table_with_joins), options).map(QueryBody::Term); + let term = QueryTerm::parser((query, expr, table_with_joins), options) + .map(|t| QueryBody::Term(Box::new(t))); term.pratt(( infix( left(2), @@ -158,10 +158,8 @@ pub enum SetQuantifier { #[derive(Debug, Clone, TreeParser, TreeSyntax, TreeText)] #[parser(dependency = "(Query, Expr, TableWithJoins)")] -#[expect(clippy::large_enum_variant)] pub enum QueryTerm { - // FIXME: Rust 1.87 triggers `clippy::large_enum_variant` warning - Select(#[parser(function = |(q, e, t), o| compose((q, e, t), o))] QuerySelect), + Select(#[parser(function = |(q, e, t), o| boxed(compose((q, e, t), o)))] Box), Table(Table, ObjectName), Values(#[parser(function = |(_, e, _), o| compose(e, o))] ValuesClause), Nested( @@ -257,7 +255,6 @@ pub struct TableWithJoins { #[derive(Debug, Clone, TreeParser, TreeSyntax, TreeText)] #[parser(dependency = "(Query, Expr, TableWithJoins)")] -#[expect(clippy::large_enum_variant)] pub enum TableFactor { Values { #[parser(function = |(_, e, _), o| compose(e, o))] @@ -269,8 +266,8 @@ pub enum TableFactor { #[parser(function = |(q, _, _), _| q)] query: Query, right: RightParenthesis, - #[parser(function = |(_, e, _), o| compose(e, o))] - sample: Option, + #[parser(function = |(_, e, _), o| boxed(compose(e, o)).or_not())] + sample: Option>, #[parser(function = |(_, e, _), o| compose(e, o))] modifiers: Vec, alias: Option, @@ -284,6 +281,16 @@ pub enum TableFactor { modifiers: Vec, alias: Option, }, + Identifier { + identifier: Identifier, + left: LeftParenthesis, + #[parser(function = |(_, e, _), _| e)] + expr: Expr, + right: RightParenthesis, + #[parser(function = |(_, e, _), o| compose(e, o))] + modifiers: Vec, + alias: Option, + }, TableFunction { #[parser(function = |(_, e, _), o| compose(e, o))] function: TableFunction, @@ -291,11 +298,10 @@ pub enum TableFactor { }, Name { name: ObjectName, - #[parser(function = |(_, e, _), o| compose(e, o))] - temporal: Option, 
- // FIXME: Rust 1.87 triggers `clippy::large_enum_variant` warning - #[parser(function = |(_, e, _), o| compose(e, o))] - sample: Option, + #[parser(function = |(_, e, _), o| boxed(compose(e, o)).or_not())] + temporal: Option>, + #[parser(function = |(_, e, _), o| boxed(compose(e, o)).or_not())] + sample: Option>, #[parser(function = |(_, e, _), o| compose(e, o))] modifiers: Vec, alias: Option, @@ -589,11 +595,9 @@ pub struct LimitClause { #[derive(Debug, Clone, TreeParser, TreeSyntax, TreeText)] #[parser(dependency = "Expr")] -#[expect(clippy::large_enum_variant)] pub enum LimitValue { All(All), - // FIXME: Rust 1.87 triggers `clippy::large_enum_variant` warning - Value(#[parser(function = |e, _| e)] Expr), + Value(#[parser(function = |e, _| boxed(e))] Box), } #[derive(Debug, Clone, TreeParser, TreeSyntax, TreeText)] diff --git a/crates/sail-sql-parser/src/ast/statement.rs b/crates/sail-sql-parser/src/ast/statement.rs index 7c9835971a..f135cc5ff7 100644 --- a/crates/sail-sql-parser/src/ast/statement.rs +++ b/crates/sail-sql-parser/src/ast/statement.rs @@ -50,8 +50,8 @@ pub enum Statement { CreateDatabase { create: Create, database: Either, - name: ObjectName, if_not_exists: Option<(If, Not, Exists)>, + name: ObjectName, clauses: Vec, }, AlterDatabase { @@ -90,7 +90,7 @@ pub enum Statement { columns: Option, like: Option<(Like, ObjectName)>, using: Option<(Using, Ident)>, - #[parser(function = |(_, _, _, d), o| compose(d, o))] + #[parser(function = |(_, q, e, d), o| compose((e, q, d), o))] clauses: Vec, #[parser(function = |(_, q, _, _), o| compose(q, o))] r#as: Option, @@ -102,7 +102,7 @@ pub enum Statement { #[parser(function = |(_, _, e, d), o| compose((e, d), o))] columns: Option, using: Option<(Using, Ident)>, - #[parser(function = |(_, _, _, d), o| compose(d, o))] + #[parser(function = |(_, q, e, d), o| compose((e, q, d), o))] clauses: Vec, #[parser(function = |(_, q, _, _), o| compose(q, o))] r#as: Option, @@ -241,11 +241,10 @@ pub enum Statement { into: 
Into, target: ObjectName, alias: Option, - // FIXME: Rust 1.87 triggers `clippy::large_enum_variant` warning - #[parser(function = |(_, q, _, _), o| unit(o).then(compose(q, o)))] - using: (Using, MergeSource), - #[parser(function = |(_, _, e, _), o| unit(o).then(e))] - on: (On, Expr), + #[parser(function = |(_, q, _, _), o| boxed(unit(o).then(compose(q, o))))] + using: Box<(Using, MergeSource)>, + #[parser(function = |(_, _, e, _), o| boxed(unit(o).then(e)))] + on: Box<(On, Expr)>, #[parser(function = |(_, _, e, _), o| compose(e, o))] r#match: Vec, }, @@ -452,20 +451,20 @@ pub struct ColumnTypeDefinition { } #[derive(Debug, Clone, TreeParser, TreeSyntax, TreeText)] -#[parser(dependency = "DataType")] -#[expect(clippy::large_enum_variant)] -pub enum PartitionColumn { - // FIXME: Rust 1.87 triggers `clippy::large_enum_variant` warning - Typed(#[parser(function = |d, o| compose(d, o))] ColumnTypeDefinition), - Name(Ident), +#[parser(dependency = "(Expr, Query, DataType)")] +pub enum PartitionByItem { + /// Hive-style typed partition column definition: `col_name ` + ColumnDefinition(#[parser(function = |(_, _, d), o| compose(d, o))] ColumnTypeDefinition), + /// DataSource partition expression: `years(ts)`, `bucket(16, b)`, `ts`, ... 
+ Expression(#[parser(function = |(e, q, d), o| compose((e, q, d), o))] Expr), } #[derive(Debug, Clone, TreeParser, TreeSyntax, TreeText)] -#[parser(dependency = "DataType")] -pub struct PartitionColumnList { +#[parser(dependency = "(Expr, Query, DataType)")] +pub struct PartitionByList { pub left: LeftParenthesis, - #[parser(function = |d, o| sequence(compose(d, o), unit(o)))] - pub columns: Sequence, + #[parser(function = |(e, q, d), o| sequence(compose((e, q, d), o), unit(o)))] + pub columns: Sequence, pub right: RightParenthesis, } @@ -502,13 +501,13 @@ pub enum CreateDatabaseClause { } #[derive(Debug, Clone, TreeParser, TreeSyntax, TreeText)] -#[parser(dependency = "DataType")] +#[parser(dependency = "(Expr, Query, DataType)")] pub enum CreateTableClause { /// The `PARTITIONED BY` clause for table. PartitionedBy( Partitioned, By, - #[parser(function = |d, o| compose(d, o))] PartitionColumnList, + #[parser(function = |(e, q, d), o| compose((e, q, d), o))] PartitionByList, ), /// The `CLUSTERED BY ... SORTED BY ... INTO ... BUCKETS` clause for table. /// In Flink, `DISTRIBUTED BY ... INTO ... BUCKETS` seems to have a similar semantic. 
@@ -773,18 +772,20 @@ pub enum ColumnDropList { }, } -#[expect(clippy::large_enum_variant)] #[derive(Debug, Clone, TreeParser, TreeSyntax, TreeText)] pub enum InsertDirectoryDestination { Spark { path: Option, using: (Using, Ident), - options: Option<(Options, PropertyList)>, + #[parser(function = |(), o| boxed(unit(o)).or_not())] + options: Option>, }, Hive { path: StringLiteral, - row_format: Option<(Row, Format, RowFormat)>, - stored_as: Option<(Stored, As, FileFormat)>, + #[parser(function = |(), o| boxed(unit(o)).or_not())] + row_format: Option>, + #[parser(function = |(), o| boxed(unit(o)).or_not())] + stored_as: Option>, }, } diff --git a/crates/sail-sql-parser/tests/gold_data/syntax.json b/crates/sail-sql-parser/tests/gold_data/syntax.json index 5ca251e85c..0e946de4d8 100644 --- a/crates/sail-sql-parser/tests/gold_data/syntax.json +++ b/crates/sail-sql-parser/tests/gold_data/syntax.json @@ -806,6 +806,22 @@ } ] }, + { + "sequence": [ + { + "nonTerminal": "Keyword(Identifier)" + }, + { + "nonTerminal": "Operator(LeftParenthesis)" + }, + { + "nonTerminal": "Box(Expression)" + }, + { + "nonTerminal": "Operator(RightParenthesis)" + } + ] + }, { "sequence": [ { @@ -1180,24 +1196,84 @@ "nonTerminal": "QueryBody" } }, + { + "name": "Box(QuerySelect)", + "node": { + "nonTerminal": "QuerySelect" + } + }, + { + "name": "Box(QueryTerm)", + "node": { + "nonTerminal": "QueryTerm" + } + }, { "name": "Box(Statement)", "node": { "nonTerminal": "Statement" } }, + { + "name": "Box(TableSampleClause)", + "node": { + "nonTerminal": "TableSampleClause" + } + }, { "name": "Box(TableWithJoins)", "node": { "nonTerminal": "TableWithJoins" } }, + { + "name": "Box(TemporalClause)", + "node": { + "nonTerminal": "TemporalClause" + } + }, { "name": "Box(TrimExpr)", "node": { "nonTerminal": "TrimExpr" } }, + { + "name": "Box(Tuple(Keyword(On), Expression))", + "node": { + "nonTerminal": "Tuple(Keyword(On), Expression)" + } + }, + { + "name": "Box(Tuple(Keyword(Options), PropertyList))", 
+ "node": { + "nonTerminal": "Tuple(Keyword(Options), PropertyList)" + } + }, + { + "name": "Box(Tuple(Keyword(Row), Keyword(Format), RowFormat))", + "node": { + "nonTerminal": "Tuple(Keyword(Row), Keyword(Format), RowFormat)" + } + }, + { + "name": "Box(Tuple(Keyword(Stored), Keyword(As), FileFormat))", + "node": { + "nonTerminal": "Tuple(Keyword(Stored), Keyword(As), FileFormat)" + } + }, + { + "name": "Box(Tuple(Keyword(Using), MergeSource))", + "node": { + "nonTerminal": "Tuple(Keyword(Using), MergeSource)" + } + }, + { + "name": "Box(WindowFrame)", + "node": { + "nonTerminal": "WindowFrame" + } + }, { "name": "CaseElse", "node": { @@ -1568,7 +1644,7 @@ "nonTerminal": "Keyword(By)" }, { - "nonTerminal": "PartitionColumnList" + "nonTerminal": "PartitionByList" } ] }, @@ -3320,7 +3396,7 @@ "nonTerminal": "Tuple(Keyword(Using), Identifier)" }, { - "nonTerminal": "Option(Tuple(Keyword(Options), PropertyList))" + "nonTerminal": "Option(Box(Tuple(Keyword(Options), PropertyList)))" } ] }, @@ -3330,10 +3406,10 @@ "nonTerminal": "StringLiteral" }, { - "nonTerminal": "Option(Tuple(Keyword(Row), Keyword(Format), RowFormat))" + "nonTerminal": "Option(Box(Tuple(Keyword(Row), Keyword(Format), RowFormat)))" }, { - "nonTerminal": "Option(Tuple(Keyword(Stored), Keyword(As), FileFormat))" + "nonTerminal": "Option(Box(Tuple(Keyword(Stored), Keyword(As), FileFormat)))" } ] } @@ -4689,6 +4765,14 @@ } } }, + { + "name": "Keyword(Identifier)", + "node": { + "terminal": { + "keyword": "IDENTIFIER" + } + } + }, { "name": "Keyword(If)", "node": { @@ -6142,7 +6226,7 @@ { "sequence": [ { - "nonTerminal": "Expression" + "nonTerminal": "Box(Expression)" } ] } @@ -6810,6 +6894,54 @@ } } }, + { + "name": "Option(Box(TableSampleClause))", + "node": { + "optional": { + "nonTerminal": "Box(TableSampleClause)" + } + } + }, + { + "name": "Option(Box(TemporalClause))", + "node": { + "optional": { + "nonTerminal": "Box(TemporalClause)" + } + } + }, + { + "name": "Option(Box(Tuple(Keyword(Options), 
PropertyList)))", + "node": { + "optional": { + "nonTerminal": "Box(Tuple(Keyword(Options), PropertyList))" + } + } + }, + { + "name": "Option(Box(Tuple(Keyword(Row), Keyword(Format), RowFormat)))", + "node": { + "optional": { + "nonTerminal": "Box(Tuple(Keyword(Row), Keyword(Format), RowFormat))" + } + } + }, + { + "name": "Option(Box(Tuple(Keyword(Stored), Keyword(As), FileFormat)))", + "node": { + "optional": { + "nonTerminal": "Box(Tuple(Keyword(Stored), Keyword(As), FileFormat))" + } + } + }, + { + "name": "Option(Box(WindowFrame))", + "node": { + "optional": { + "nonTerminal": "Box(WindowFrame)" + } + } + }, { "name": "Option(ColumnDefinitionList)", "node": { @@ -7226,14 +7358,6 @@ } } }, - { - "name": "Option(TableSampleClause)", - "node": { - "optional": { - "nonTerminal": "TableSampleClause" - } - } - }, { "name": "Option(TableSampleRepeatable)", "node": { @@ -7242,14 +7366,6 @@ } } }, - { - "name": "Option(TemporalClause)", - "node": { - "optional": { - "nonTerminal": "TemporalClause" - } - } - }, { "name": "Option(TimezoneType)", "node": { @@ -7378,14 +7494,6 @@ } } }, - { - "name": "Option(Tuple(Keyword(Row), Keyword(Format), RowFormat))", - "node": { - "optional": { - "nonTerminal": "Tuple(Keyword(Row), Keyword(Format), RowFormat)" - } - } - }, { "name": "Option(Tuple(Keyword(Sorted), Keyword(By), SortColumnList))", "node": { @@ -7394,14 +7502,6 @@ } } }, - { - "name": "Option(Tuple(Keyword(Stored), Keyword(As), FileFormat))", - "node": { - "optional": { - "nonTerminal": "Tuple(Keyword(Stored), Keyword(As), FileFormat)" - } - } - }, { "name": "Option(Tuple(Keyword(To), IntervalDayTimeUnit))", "node": { @@ -7562,14 +7662,6 @@ } } }, - { - "name": "Option(WindowFrame)", - "node": { - "optional": { - "nonTerminal": "WindowFrame" - } - } - }, { "name": "Option(WithClause)", "node": { @@ -7690,20 +7782,7 @@ } }, { - "name": "PartitionClause", - "node": { - "sequence": [ - { - "nonTerminal": "Keyword(Partition)" - }, - { - "nonTerminal": "PartitionValueList" 
- } - ] - } - }, - { - "name": "PartitionColumn", + "name": "PartitionByItem", "node": { "choice": [ { @@ -7716,7 +7795,7 @@ { "sequence": [ { - "nonTerminal": "Identifier" + "nonTerminal": "Expression" } ] } @@ -7724,14 +7803,14 @@ } }, { - "name": "PartitionColumnList", + "name": "PartitionByList", "node": { "sequence": [ { "nonTerminal": "Operator(LeftParenthesis)" }, { - "nonTerminal": "Sequence(PartitionColumn, Operator(Comma))" + "nonTerminal": "Sequence(PartitionByItem, Operator(Comma))" }, { "nonTerminal": "Operator(RightParenthesis)" @@ -7739,6 +7818,19 @@ ] } }, + { + "name": "PartitionClause", + "node": { + "sequence": [ + { + "nonTerminal": "Keyword(Partition)" + }, + { + "nonTerminal": "PartitionValueList" + } + ] + } + }, { "name": "PartitionValue", "node": { @@ -7960,7 +8052,7 @@ { "sequence": [ { - "nonTerminal": "QueryTerm" + "nonTerminal": "Box(QueryTerm)" } ] }, @@ -8071,7 +8163,7 @@ { "sequence": [ { - "nonTerminal": "QuerySelect" + "nonTerminal": "Box(QuerySelect)" } ] }, @@ -8559,11 +8651,11 @@ } }, { - "name": "Sequence(PartitionColumn, Operator(Comma))", + "name": "Sequence(PartitionByItem, Operator(Comma))", "node": { "sequence": [ { - "nonTerminal": "PartitionColumn" + "nonTerminal": "PartitionByItem" }, { "oneOrMore": { @@ -8572,7 +8664,7 @@ "nonTerminal": "Operator(Comma)" }, { - "nonTerminal": "PartitionColumn" + "nonTerminal": "PartitionByItem" } ] } @@ -8984,10 +9076,10 @@ "nonTerminal": "Either(Keyword(Database), Keyword(Schema))" }, { - "nonTerminal": "ObjectName" + "nonTerminal": "Option(Tuple(Keyword(If), Keyword(Not), Keyword(Exists)))" }, { - "nonTerminal": "Option(Tuple(Keyword(If), Keyword(Not), Keyword(Exists)))" + "nonTerminal": "ObjectName" }, { "nonTerminal": "Vector(CreateDatabaseClause)" @@ -9452,10 +9544,10 @@ "nonTerminal": "Option(AliasClause)" }, { - "nonTerminal": "Tuple(Keyword(Using), MergeSource)" + "nonTerminal": "Box(Tuple(Keyword(Using), MergeSource))" }, { - "nonTerminal": "Tuple(Keyword(On), Expression)" + 
"nonTerminal": "Box(Tuple(Keyword(On), Expression))" }, { "nonTerminal": "Vector(MergeMatchClause)" @@ -9799,7 +9891,7 @@ "nonTerminal": "Operator(RightParenthesis)" }, { - "nonTerminal": "Option(TableSampleClause)" + "nonTerminal": "Option(Box(TableSampleClause))" }, { "nonTerminal": "Vector(TableModifier)" @@ -9828,6 +9920,28 @@ } ] }, + { + "sequence": [ + { + "nonTerminal": "Keyword(Identifier)" + }, + { + "nonTerminal": "Operator(LeftParenthesis)" + }, + { + "nonTerminal": "Expression" + }, + { + "nonTerminal": "Operator(RightParenthesis)" + }, + { + "nonTerminal": "Vector(TableModifier)" + }, + { + "nonTerminal": "Option(AliasClause)" + } + ] + }, { "sequence": [ { @@ -9844,10 +9958,10 @@ "nonTerminal": "ObjectName" }, { - "nonTerminal": "Option(TemporalClause)" + "nonTerminal": "Option(Box(TemporalClause))" }, { - "nonTerminal": "Option(TableSampleClause)" + "nonTerminal": "Option(Box(TableSampleClause))" }, { "nonTerminal": "Vector(TableModifier)" @@ -11598,7 +11712,7 @@ "nonTerminal": "Vector(WindowModifier)" }, { - "nonTerminal": "Option(WindowFrame)" + "nonTerminal": "Option(Box(WindowFrame))" }, { "nonTerminal": "Operator(RightParenthesis)" diff --git a/crates/sail-telemetry/src/execution/metrics/join.rs b/crates/sail-telemetry/src/execution/metrics/join.rs index 102cff1da6..181e1643a8 100644 --- a/crates/sail-telemetry/src/execution/metrics/join.rs +++ b/crates/sail-telemetry/src/execution/metrics/join.rs @@ -88,6 +88,10 @@ impl MetricEmitter for BuildProbeJoinMetricEmitter { .emit(); MetricHandled::Yes } + MetricValue::Count { name, .. } if name == "array_map_created_count" => { + // Internal implementation metric of the hash join; not surfaced externally. 
+ MetricHandled::Yes + } MetricValue::Count { name, count } if name == "input_rows" => { registry .execution_join_probe_side_row_count @@ -397,6 +401,7 @@ mod tests { None, PartitionMode::CollectLeft, NullEquality::NullEqualsNothing, + false, )?); MetricEmitterTester::new() diff --git a/crates/sail-telemetry/src/execution/metrics/testing.rs b/crates/sail-telemetry/src/execution/metrics/testing.rs index f6793ec264..39e18d27f5 100644 --- a/crates/sail-telemetry/src/execution/metrics/testing.rs +++ b/crates/sail-telemetry/src/execution/metrics/testing.rs @@ -1,5 +1,6 @@ use std::borrow::Cow; use std::sync::Arc; +use std::time::Duration; use datafusion::common::{plan_err, DataFusionError, Result}; use datafusion::execution::TaskContext; @@ -9,7 +10,7 @@ use opentelemetry::metrics::MeterProvider; use opentelemetry_sdk::metrics::{InMemoryMetricExporter, SdkMeterProvider}; use crate::execution::physical_plan::TracingExec; -use crate::metrics::MetricRegistry; +use crate::metrics::{MetricManager, MetricRegistry}; use crate::TracingExecOptions; fn format_raw_metrics(plan: &dyn ExecutionPlan) -> String { @@ -80,7 +81,10 @@ impl MetricEmitterTester { let Some(plan) = self.plan else { return plan_err!("missing execution plan"); }; - let options = TracingExecOptions::default().with_metric_registry(self.registry); + let options = TracingExecOptions::default().with_metrics(MetricManager { + registry: self.registry, + collection_interval: Duration::ZERO, + }); let plan = Arc::new(TracingExec::new(plan, options)); let context = Arc::new(TaskContext::default()); let _ = plan.execute(0, context)?.try_collect::>().await?; diff --git a/crates/sail-telemetry/src/execution/physical_plan.rs b/crates/sail-telemetry/src/execution/physical_plan.rs index 6a48608bcb..098c2710a2 100644 --- a/crates/sail-telemetry/src/execution/physical_plan.rs +++ b/crates/sail-telemetry/src/execution/physical_plan.rs @@ -3,6 +3,7 @@ use std::fmt; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, 
Poll}; +use std::time::{Duration, Instant}; use datafusion::arrow::array::RecordBatch; use datafusion::arrow::datatypes::SchemaRef; @@ -32,11 +33,11 @@ use sail_common_datafusion::utils::items::ItemTaker; use crate::common::{KeyValue, SpanAttribute}; use crate::execution::metrics::MetricEmitter; -use crate::metrics::{MetricAttribute, MetricRegistry}; +use crate::metrics::{MetricAttribute, MetricManager, MetricRegistry}; #[derive(Debug, Clone, Default)] pub struct TracingExecOptions { - pub metric_registry: Option>, + pub metrics: Option, pub job_id: Option, pub stage: Option, pub attempt: Option, @@ -44,8 +45,8 @@ pub struct TracingExecOptions { } impl TracingExecOptions { - pub fn with_metric_registry(mut self, registry: Arc) -> Self { - self.metric_registry = Some(registry); + pub fn with_metrics(mut self, manager: MetricManager) -> Self { + self.metrics = Some(manager); self } } @@ -116,7 +117,7 @@ impl ExecutionPlan for TracingExec { self.inner.schema() } - fn properties(&self) -> &PlanProperties { + fn properties(&self) -> &Arc { self.inner.properties() } @@ -177,13 +178,15 @@ impl ExecutionPlan for TracingExec { self.inner.execute(partition, context)? 
}; let schema = stream.schema(); - if let Some(ref registry) = self.options.metric_registry { + if let Some(ref manager) = self.options.metrics { let stream = MetricEmitterStream { inner: stream, plan: self.inner.clone(), emitter: self.build_metric_emitter(), attributes: self.build_metric_attributes(), - registry: registry.clone(), + registry: manager.registry.clone(), + interval: manager.collection_interval, + last_emit: None, }; Ok(Box::pin(RecordBatchStreamAdapter::new( schema, @@ -201,11 +204,6 @@ impl ExecutionPlan for TracingExec { self.inner.metrics() } - #[expect(deprecated)] - fn statistics(&self) -> Result { - self.inner.statistics() - } - fn partition_statistics(&self, partition: Option) -> Result { self.inner.partition_statistics(partition) } @@ -233,6 +231,10 @@ impl ExecutionPlan for TracingExec { Ok(None) } + fn with_preserve_order(&self, _preserve_order: bool) -> Option> { + None + } + fn gather_filters_for_pushdown( &self, _phase: FilterPushdownPhase, @@ -307,6 +309,8 @@ pin_project! { emitter: Box, attributes: Vec, registry: Arc, + interval: Duration, + last_emit: Option, } } @@ -317,12 +321,20 @@ impl Stream for MetricEmitterStream { let this = self.project(); let poll = this.inner.poll_next(cx); if poll.is_ready() { - if let Some(metrics) = this.plan.metrics() { - for metric in metrics.iter() { - let _ = this - .emitter - .try_emit(metric, this.attributes, this.registry); + let is_done = matches!(poll, Poll::Ready(None)); + // Note: metrics are not emitted regularly if a batch takes long to be produced, + // but this is acceptable for the purpose of execution metrics. 
+ let should_emit = + is_done || this.last_emit.is_none_or(|t| t.elapsed() >= *this.interval); + if should_emit { + if let Some(metrics) = this.plan.metrics() { + for metric in metrics.iter() { + let _ = this + .emitter + .try_emit(metric, this.attributes, this.registry); + } } + *this.last_emit = Some(Instant::now()); } } poll diff --git a/crates/sail-telemetry/src/lib.rs b/crates/sail-telemetry/src/lib.rs index 53bf4ac743..1cef92dfd2 100644 --- a/crates/sail-telemetry/src/lib.rs +++ b/crates/sail-telemetry/src/lib.rs @@ -9,3 +9,4 @@ pub mod recorder; pub mod telemetry; pub use execution::physical_plan::{trace_execution_plan, TracingExecOptions}; +pub use metrics::MetricManager; diff --git a/crates/sail-telemetry/src/metrics/mod.rs b/crates/sail-telemetry/src/metrics/mod.rs index d32634ebeb..26ed1e7936 100644 --- a/crates/sail-telemetry/src/metrics/mod.rs +++ b/crates/sail-telemetry/src/metrics/mod.rs @@ -6,10 +6,20 @@ mod gen { } use std::fmt; +use std::sync::Arc; +use std::time::Duration; pub use gen::{MetricAttribute, MetricRegistry}; pub use instruments::*; +/// Encapsulates a [`MetricRegistry`] together with the metrics collection interval. +/// When metrics are enabled, both the registry and the interval are always present together. 
+#[derive(Clone, Debug)] +pub struct MetricManager { + pub registry: Arc, + pub collection_interval: Duration, +} + impl fmt::Debug for MetricRegistry { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("MetricRegistry").finish() diff --git a/crates/sail-telemetry/src/telemetry.rs b/crates/sail-telemetry/src/telemetry.rs index 6986dd8231..1cc8b406b9 100644 --- a/crates/sail-telemetry/src/telemetry.rs +++ b/crates/sail-telemetry/src/telemetry.rs @@ -21,7 +21,7 @@ use crate::error::{TelemetryError, TelemetryResult}; use crate::execution::join_set::DefaultJoinSetTracer; use crate::loggers::composite::CompositeLogger; use crate::loggers::span::SpanEventLogger; -use crate::metrics::MetricRegistry; +use crate::metrics::{MetricManager, MetricRegistry}; enum TelemetryStatus { Uninitialized, @@ -34,7 +34,7 @@ enum TelemetryStatus { struct TelemetryState { meter_provider: Option, meter: Option, - metric_registry: Option>, + metrics: Option, logger_provider: Option, } @@ -131,7 +131,10 @@ fn init_metrics( global::set_meter_provider(provider.clone()); let meter = global::meter_with_scope(get_instrumentation_scope()); state.meter_provider = Some(provider); - state.metric_registry = Some(Arc::new(MetricRegistry::new(&meter))); + state.metrics = Some(MetricManager { + registry: Arc::new(MetricRegistry::new(&meter)), + collection_interval: Duration::from_secs(config.metrics_collection_interval_secs), + }); state.meter = Some(meter); } Ok(()) @@ -218,12 +221,12 @@ pub fn shutdown_telemetry() { } } -pub fn global_metric_registry() -> Option> { +pub fn global_metrics() -> Option { TELEMETRY_STATUS .lock() .ok() .and_then(|status| match &*status { - TelemetryStatus::Initialized(state) => state.metric_registry.clone(), + TelemetryStatus::Initialized(state) => state.metrics.clone(), _ => None, }) } diff --git a/docker/dev/Dockerfile b/docker/dev/Dockerfile index 6ec1364e4d..a5ea60237b 100644 --- a/docker/dev/Dockerfile +++ b/docker/dev/Dockerfile @@ -1,4 +1,4 @@ 
-ARG RUST_VERSION=1.88.0 +ARG RUST_VERSION=1.91.0 ARG RUST_PROFILE=dev ARG RUSTFLAGS="" ARG PYSPARK_VERSION=4.1.1 diff --git a/docker/release/Dockerfile b/docker/release/Dockerfile index c8f546f7cd..f98f49d31b 100644 --- a/docker/release/Dockerfile +++ b/docker/release/Dockerfile @@ -1,5 +1,5 @@ ARG RELEASE_TAG -ARG RUST_VERSION=1.88.0 +ARG RUST_VERSION=1.91.0 ARG RUST_PROFILE=release ARG RUSTFLAGS="" ARG PYSPARK_VERSION=4.1.1 diff --git a/docs/guide/dataframe/data-types/compatibility.md b/docs/guide/dataframe/data-types/compatibility.md index 880df9b3fc..fc1e57bc71 100644 --- a/docs/guide/dataframe/data-types/compatibility.md +++ b/docs/guide/dataframe/data-types/compatibility.md @@ -162,7 +162,7 @@ The table below shows how Spark data types are mapped to Python types and Arrow TimeType - TimeType(precision: int = 6) + TimeType(precision: int = 6) datetime.time Time32(Second)
Time32(Millisecond)
Time64(Microsecond) @@ -262,4 +262,4 @@ The table below shows how Spark data types are mapped to Python types and Arrow 3. **StringType**, **CharType(_n_)**, and **VarcharType(_n_)** in Spark are mapped to either the Utf8 or LargeUtf8 type in Arrow, depending on the `spark.sql.execution.arrow.useLargeVarTypes` configuration option. 4. **BinaryType** in Spark is mapped to either the Binary or LargeBinary type in Arrow, depending on the `spark.sql.execution.arrow.useLargeVarTypes` configuration option. 5. **CalendarIntervalType** in Spark has microsecond precision while the Interval(MonthDayNano) Arrow type has nanosecond precision. So the supported data range for calendar intervals is different between JVM Spark and Arrow. -6. **TimeType** represents time of day values without a time zone. The `precision` parameter specifies the number of decimal digits following the decimal point in the seconds field. Spark 4.0 supports precision values `0`, `3`, and `6` (second, millisecond, microsecond). The default precision is `6` (microsecond). Precision `0` and `3` map to Time32 in Arrow, while precision `6` maps to Time64(Microsecond) in Arrow. Note: TIME precision 9 (nanosecond) is not supported by Spark 4.0. +6. **TimeType** represents time of day values without a time zone. The `precision` parameter specifies the number of decimal digits following the decimal point in the seconds field. Spark 4.0 supports precision values `0`, `3`, and `6` (second, millisecond, and microsecond). The default precision is `6` (microsecond). Precision `0` and `3` map to Time32 in Arrow, while precision `6` maps to Time64(Microsecond) in Arrow. Precision `9` (nanosecond) is not supported by Spark 4.0. 
diff --git a/docs/guide/formats/index.md b/docs/guide/formats/index.md deleted file mode 100644 index 574114cdc4..0000000000 --- a/docs/guide/formats/index.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: Data Formats -rank: 5 ---- - -# Data Formats - -Sail supports various data formats for reading and writing. - -You can use the `SparkSession.read`, `DataFrame.write`, and `DataFrame.writeTo()` API to load and save data in different -formats. -You can also use the `CREATE TABLE` SQL statement to create a table that refers to data stored in a specific format. - -Here is a summary of the supported (:white_check_mark:) and unsupported (:x:) data formats for reading and writing data. -There are also features that are planned in our roadmap (:construction:). - -| Format | Read Support | Write Support | -| ---------------------- | ---------------------------- | ---------------------------- | -| [Delta Lake](./delta) | :white_check_mark: (partial) | :white_check_mark: (partial) | -| [Iceberg](./iceberg) | :white_check_mark: (partial) | :white_check_mark: (partial) | -| Parquet | :white_check_mark: | :white_check_mark: | -| Binary (any file type) | :white_check_mark: | :x: | -| CSV | :white_check_mark: | :white_check_mark: | -| JSON | :white_check_mark: | :white_check_mark: | -| Text | :white_check_mark: | :white_check_mark: | -| Avro | :white_check_mark: | :white_check_mark: | -| Protocol Buffers | :construction: | :construction: | -| Hudi | :construction: | :construction: | -| ORC | :x: | :x: | diff --git a/docs/guide/integrations/_code/pyspark-skill.md b/docs/guide/integrations/_code/pyspark-skill.md new file mode 100644 index 0000000000..563cfc9bb2 --- /dev/null +++ b/docs/guide/integrations/_code/pyspark-skill.md @@ -0,0 +1,43 @@ +--- +name: run-pyspark-script +description: Runs a PySpark script. +--- + +You can run PySpark scripts via the `sail spark run` command. 
+ +Sail is an open-source unified and distributed multimodal computation framework +that can be used as a drop-in replacement for Apache Spark. +By using Sail to run PySpark scripts, you can perform data processing tasks +with the familiar PySpark DataFrame API or Spark SQL while benefiting from +the high performance and low memory overhead of Sail. + +The script can refer to the Spark session via the `spark` variable, which +connects to a local Sail server via the Spark Connect protocol. +The Sail server starts instantly when you run the `sail spark run` command, +and it will be automatically stopped after the script finishes. + +You can pipe simple PySpark code to the `sail spark run` command directly: + +```bash +echo 'spark.sql("SELECT 1 + 1").show()' | sail spark run 2>/dev/null +``` + +Alternatively, you can use a heredoc for more complex PySpark scripts: + +```bash +cat </dev/null +import pyspark.sql.functions as F + +df = spark.createDataFrame([(1, 2), (2, 3)], ["a", "b"]) +df = df.withColumn("sum", F.col("a") + F.col("b")) +df.show() +EOF +``` + +You can also write the PySpark script to a file and run it by specifying the +file path with the `-f` option: + +```bash +echo 'spark.range(10).filter("id % 2 == 0").show()' > /tmp/script.py +sail spark run -f /tmp/script.py 2>/dev/null +``` diff --git a/docs/guide/integrations/agent-skills.md b/docs/guide/integrations/agent-skills.md new file mode 100644 index 0000000000..49fda70b72 --- /dev/null +++ b/docs/guide/integrations/agent-skills.md @@ -0,0 +1,19 @@ +--- +title: Agent Skills +rank: 2 +--- + +# Agent Skills + +The `sail spark run` command can be used as a CLI tool that enables your LLM agents to execute PySpark scripts via Sail's highly efficient compute engine. +By exposing this command as an agent skill, the agent can perform data processing tasks using the familiar PySpark API while enjoying lightning-fast performance. + +Here is an example skill definition. 
+Refer to your LLM provider's documentation to see how to load the skill for your agents. + +::: code-group + + +<<< ./_code/pyspark-skill.md [SKILL.md] + +::: diff --git a/docs/guide/integrations/jdbc.md b/docs/guide/integrations/jdbc.md deleted file mode 100644 index b12d5252eb..0000000000 --- a/docs/guide/integrations/jdbc.md +++ /dev/null @@ -1,139 +0,0 @@ ---- -title: JDBC -rank: 3 ---- - -# JDBC Datasource - -Sail provides a database connector exposed under the `"jdbc"` format name for API parity with -vanilla PySpark — no actual JDBC driver or JVM is involved. - -## Installation - -```bash -pip install pysail[jdbc] -``` - -## Quick Start - -Register the datasource once per Spark session, then read using the standard PySpark API. - -```python -from pysail.datasources.jdbc import JdbcDataSource - -spark.dataSource.register(JdbcDataSource) - -# Using format("jdbc") — full option control -df = ( - spark.read.format("jdbc") - .option("url", "jdbc:postgresql://localhost:5432/mydb") - .option("dbtable", "public.users") - .option("user", "alice") - .option("password", "secret") - .load() -) - -# Using spark.read.jdbc() shorthand -df = spark.read.jdbc( - "jdbc:postgresql://localhost:5432/mydb", - "public.users", - properties={"user": "alice", "password": "secret"}, -) -df.show() -``` - -## Supported Options - -Options are consistent with the [PySpark JDBC documentation](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html). - -| Option | Required | Default | Description | -| ------------------- | --------- | ------- | --------------------------------------------------------------------------------------------------- | -| `url` | **Yes** | | JDBC URL: `jdbc:://:/` | -| `dbtable` | **Yes\*** | | Table name, optionally schema-qualified (`"schema.table"`). Mutually exclusive with `query`. | -| `query` | **Yes\*** | | Arbitrary SQL SELECT. Mutually exclusive with `dbtable`. | -| `user` | No | | Database username. Can also be passed in `properties` dict. 
| -| `password` | No | | Database password. Can also be passed in `properties` dict. | -| `partitionColumn` | No | | Numeric column for range-stride partitioning. Requires `lowerBound`, `upperBound`, `numPartitions`. | -| `lowerBound` | No | | Lower bound of partition stride (inclusive). | -| `upperBound` | No | | Upper bound of partition stride (inclusive on last partition). | -| `numPartitions` | No | `1` | Number of parallel read partitions. | -| `fetchsize` | No | `0` | Advisory rows-per-round-trip hint. | -| `pushDownPredicate` | No | `true` | Push `WHERE` filters to the database. Set to `false` to disable. | -| `customSchema` | No | | Spark DDL string to override inferred column types (e.g. `"id DECIMAL(38,0), name STRING"`). | - -\* Exactly one of `dbtable` or `query` is required. - -## Reading a Custom SQL Query - -Use `query` instead of `dbtable` to run arbitrary SQL: - -```python -df = ( - spark.read.format("jdbc") - .option("url", "jdbc:postgresql://localhost:5432/mydb") - .option("query", "SELECT id, name FROM users WHERE active = TRUE") - .option("user", "alice") - .option("password", "secret") - .load() -) -``` - -> **Note:** `query` and `partitionColumn` are mutually exclusive. 
To partition -> a custom query, wrap it in `dbtable` as a subquery: -> -> ```python -> .option("dbtable", "(SELECT * FROM events WHERE type='click') AS t") -> .option("partitionColumn", "user_id") -> ``` - -## Parallel Reads (Range Partitioning) - -Provide `partitionColumn`, `lowerBound`, `upperBound`, and `numPartitions` together to -split the read into parallel range strides — consistent with PySpark JDBC semantics: - -```python -df = ( - spark.read.format("jdbc") - .option("url", "jdbc:postgresql://localhost:5432/mydb") - .option("dbtable", "events") - .option("partitionColumn", "id") - .option("lowerBound", "1") - .option("upperBound", "10000000") - .option("numPartitions", "8") - .option("user", "alice") - .option("password", "secret") - .load() -) -``` - -Or equivalently: - -```python -df = spark.read.jdbc( - "jdbc:postgresql://localhost:5432/mydb", - "events", - column="id", - lowerBound=1, - upperBound=10_000_000, - numPartitions=8, - properties={"user": "alice", "password": "secret"}, -) -``` - -## Schema Override - -Use `customSchema` to override column types after reading: - -```python -df = ( - spark.read.format("jdbc") - .option("url", "jdbc:postgresql://localhost:5432/mydb") - .option("dbtable", "orders") - .option("customSchema", "amount DECIMAL(18,2), status STRING") - .option("user", "alice") - .option("password", "secret") - .load() -) -``` - -Columns not listed in `customSchema` retain their inferred types. diff --git a/docs/guide/formats/delta.md b/docs/guide/sources/delta/examples.md similarity index 89% rename from docs/guide/formats/delta.md rename to docs/guide/sources/delta/examples.md index 546a61566f..48eb62acdf 100644 --- a/docs/guide/formats/delta.md +++ b/docs/guide/sources/delta/examples.md @@ -1,18 +1,13 @@ --- -title: Delta Lake +title: Examples rank: 1 --- -# Delta Lake +# Examples -You can use the `delta` format in Sail to work with [Delta Lake](https://delta.io/). 
-You can use the Spark DataFrame API or Spark SQL to read and write Delta tables. + -## Examples - - - -### Basic Usage +## Basic Usage ::: code-group @@ -44,7 +39,7 @@ SELECT * FROM users; ::: -### Data Partitioning +## Data Partitioning You can work with partitioned Delta tables using the Spark DataFrame API. Partitioned Delta tables organize data into directories based on the values of one or more columns. @@ -78,7 +73,7 @@ SELECT * FROM metrics WHERE year > 2024; ::: -### Schema Evolution +## Schema Evolution Delta Lake handles schema evolution gracefully. By default, if you try to write data with a different schema than the one of the existing Delta table, an error will occur. @@ -96,7 +91,7 @@ But this works only if you set the write mode to `overwrite`. df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").save(path) ``` -### Time Travel +## Time Travel You can use the time travel feature to query historical versions of a Delta table. @@ -107,7 +102,7 @@ df = spark.read.format("delta").option("timestampAsOf", "2025-01-02T03:04:05.678 Time travel is not available for Spark SQL in Sail yet, but we plan to support it soon. -### Column Mapping +## Column Mapping You can write Delta tables with column mapping enabled. The supported column mapping modes are `name` and `id`. You must write to a new Delta table to enable column mapping. @@ -118,7 +113,7 @@ df.write.format("delta").option("columnMappingMode", "id").save(path) Existing Delta tables with column mapping can be read as usual. -### More Features +## More Features We will continue adding more examples for advanced Delta Lake features as they become available in Sail. In the meantime, feel free to reach out to us on [Slack](https://lakesail.com/slack) or [GitHub Discussions](https://github.com/lakehq/sail/discussions) if you have questions! 
diff --git a/docs/guide/sources/delta/features.md b/docs/guide/sources/delta/features.md new file mode 100644 index 0000000000..cabf21622c --- /dev/null +++ b/docs/guide/sources/delta/features.md @@ -0,0 +1,56 @@ +--- +title: Supported Features +rank: 2 +--- + +# Supported Features + +## Core Table Operations + +| Feature | Supported | +| ------------------------------------------- | ------------------ | +| Read | :white_check_mark: | +| Write (append) | :white_check_mark: | +| Write (overwrite) | :white_check_mark: | +| Data skipping (partition pruning) | :white_check_mark: | +| Data skipping (pruning via file statistics) | :white_check_mark: | +| Schema validation | :white_check_mark: | +| Schema evolution | :white_check_mark: | +| Time travel (by version) | :white_check_mark: | +| Time travel (by timestamp) | :white_check_mark: | + +Both non-partitioned and partitioned tables are supported for reading and writing. + +## DML Operations + +| Feature | Supported | +| ------------------------ | ------------------ | +| `DELETE` (copy-on-write) | :white_check_mark: | +| `MERGE` (copy-on-write) | :white_check_mark: | +| `DELETE` (merge-on-read) | :construction: | +| `MERGE` (merge-on-read) | :construction: | +| `UPDATE` | :construction: | + +The "merge-on-read" mode refers to updating the table with deletion vectors. This reduces the amount of data that needs to be rewritten during DML operations, but incurs additional read overhead when querying the table. 
+ +## Table Maintenance Operations + +| Feature | Supported | +| ---------- | -------------- | +| `VACUUM` | :construction: | +| `OPTIMIZE` | :construction: | +| `RESTORE` | :construction: | + +## Protocol Internals + +| Feature | Supported | +| -------------------------------- | ------------------ | +| Checkpointing | :white_check_mark: | +| Log clean-up | :white_check_mark: | +| Column mapping | :white_check_mark: | +| Deletion vectors | :construction: | +| Constraints | :construction: | +| Identity columns | :construction: | +| Generated columns | :construction: | +| Transaction (conflict detection) | :construction: | +| Change data feed | :construction: | diff --git a/docs/guide/sources/delta/index.data.ts b/docs/guide/sources/delta/index.data.ts new file mode 100644 index 0000000000..191f6432dd --- /dev/null +++ b/docs/guide/sources/delta/index.data.ts @@ -0,0 +1,5 @@ +import { createContentLoader } from "vitepress"; + +export default createContentLoader([ + "/guide/sources/delta/!(index|_*/**|**/_*/**).md", +]); diff --git a/docs/guide/sources/delta/index.md b/docs/guide/sources/delta/index.md new file mode 100644 index 0000000000..16f3624428 --- /dev/null +++ b/docs/guide/sources/delta/index.md @@ -0,0 +1,18 @@ +--- +title: Delta Lake +rank: 1 +--- + +# Delta Lake + +You can use the `delta` format in Sail to work with [Delta Lake](https://delta.io/). +You can use the Spark DataFrame API or Spark SQL to read and write Delta tables. + +## Topics + + + + diff --git a/docs/guide/formats/iceberg.md b/docs/guide/sources/iceberg/examples.md similarity index 86% rename from docs/guide/formats/iceberg.md rename to docs/guide/sources/iceberg/examples.md index 2d9ae3a418..f68b0d5b27 100644 --- a/docs/guide/formats/iceberg.md +++ b/docs/guide/sources/iceberg/examples.md @@ -1,18 +1,13 @@ --- -title: Iceberg -rank: 2 +title: Examples +rank: 1 --- -# Iceberg +# Examples -You can use the `iceberg` format in Sail to work with [Apache Iceberg](https://iceberg.apache.org/). 
-You can use the Spark DataFrame API or Spark SQL to read and write Iceberg tables. + -## Examples - - - -### Basic Usage +## Basic Usage ::: code-group @@ -44,7 +39,7 @@ SELECT * FROM users; ::: -### Data Partitioning +## Data Partitioning You can work with partitioned Iceberg tables using the Spark DataFrame API. Partitioned Iceberg tables organize data into directories based on the values of one or more columns. @@ -78,7 +73,7 @@ SELECT * FROM metrics WHERE year > 2024; ::: -### Time Travel +## Time Travel You can use the time travel feature to query tags, branches, or historical versions of an Iceberg table. @@ -90,7 +85,7 @@ df = spark.read.format("iceberg").option("branch", "main").load(path) Time travel is not available for Spark SQL in Sail yet, but we plan to support it soon. -### More Features +## More Features We will continue adding more examples for advanced Iceberg features as they become available in Sail. In the meantime, feel free to reach out to us on [Slack](https://lakesail.com/slack) or [GitHub Discussions](https://github.com/lakehq/sail/discussions) if you have questions! diff --git a/docs/guide/sources/iceberg/features.md b/docs/guide/sources/iceberg/features.md new file mode 100644 index 0000000000..41c8231524 --- /dev/null +++ b/docs/guide/sources/iceberg/features.md @@ -0,0 +1,62 @@ +--- +title: Supported Features +rank: 2 +--- + +# Supported Features + +## Overview + +Here is a high-level overview of the features supported by Sail for Iceberg tables. + +| Feature | Supported | +| ----------------- | ------------------ | +| Read | :white_check_mark: | +| Write (append) | :white_check_mark: | +| Write (overwrite) | :white_check_mark: | +| `DELETE` | :white_check_mark: | +| `MERGE` | :construction: | +| `UPDATE` | :construction: | + +Both non-partitioned and partitioned tables are supported for reading and writing. + +The write operations currently follow "copy-on-write" semantics. 
+We plan to support delete files and deletion vectors, which would enable "merge-on-read" write operations in the future. + +## Version-specific Features + +We classify the supported features according to the [Iceberg specification](https://iceberg.apache.org/spec/). + +### Version 1: Analytic Data Tables + +| Feature | Supported | +| --------------------- | ------------------ | +| Metadata | :white_check_mark: | +| Manifest list | :white_check_mark: | +| File format (Parquet) | :white_check_mark: | +| File format (Avro) | :white_check_mark: | +| File format (ORC) | :construction: | +| Schema evolution | :white_check_mark: | +| Partition evolution | :construction: | +| Time travel | :white_check_mark: | +| Column statistics | :white_check_mark: | + +Reading existing branches and tags is supported (time travel). +We plan to support creating branches and tags in DDL operations in the future. + +### Version 2: Row-Level Deletes + +| Feature | Supported | +| ------------------- | ------------------ | +| Delete files | :construction: | +| Sequence numbers | :white_check_mark: | +| Manifest extensions | :construction: | + +### Version 3: Extended Types and Capabilities + +| Feature | Supported | +| --------------------- | -------------- | +| Deletion vectors | :construction: | +| Row lineage | :construction: | +| Column default values | :construction: | +| Encryption keys | :construction: | diff --git a/docs/guide/sources/iceberg/index.data.ts b/docs/guide/sources/iceberg/index.data.ts new file mode 100644 index 0000000000..b0b718411c --- /dev/null +++ b/docs/guide/sources/iceberg/index.data.ts @@ -0,0 +1,5 @@ +import { createContentLoader } from "vitepress"; + +export default createContentLoader([ + "/guide/sources/iceberg/!(index|_*/**|**/_*/**).md", +]); diff --git a/docs/guide/sources/iceberg/index.md b/docs/guide/sources/iceberg/index.md new file mode 100644 index 0000000000..5ec82a610e --- /dev/null +++ b/docs/guide/sources/iceberg/index.md @@ -0,0 +1,18 @@ +--- 
+title: Iceberg +rank: 2 +--- + +# Iceberg + +You can use the `iceberg` format in Sail to work with [Apache Iceberg](https://iceberg.apache.org/). +You can use the Spark DataFrame API or Spark SQL to read and write Iceberg tables. + +## Topics + + + + diff --git a/docs/guide/sources/index.md b/docs/guide/sources/index.md new file mode 100644 index 0000000000..fa2d53d150 --- /dev/null +++ b/docs/guide/sources/index.md @@ -0,0 +1,30 @@ +--- +title: Data Sources +rank: 5 +--- + +# Data Sources + +Sail supports various data sources for reading and writing. + +You can use the `SparkSession.read`, `DataFrame.write`, and `DataFrame.writeTo()` API to load and save data in different +formats. +You can also use the `CREATE TABLE` SQL statement to create a table that refers to a specific data source. + +Here is a summary of the supported (:white_check_mark:) and unsupported (:x:) data sources for reading and writing data. +There are also features that are planned in our roadmap (:construction:). + +| Format | Read Support | Write Support | +| ---------------------- | ------------------ | ------------------ | +| [Delta Lake](./delta/) | :white_check_mark: | :white_check_mark: | +| [Iceberg](./iceberg/) | :white_check_mark: | :white_check_mark: | +| Files (Parquet) | :white_check_mark: | :white_check_mark: | +| Files (CSV) | :white_check_mark: | :white_check_mark: | +| Files (JSON) | :white_check_mark: | :white_check_mark: | +| Files (Binary) | :white_check_mark: | :x: | +| Files (Text) | :white_check_mark: | :white_check_mark: | +| Files (Avro) | :white_check_mark: | :white_check_mark: | +| [Python](./python/) | :white_check_mark: | :white_check_mark: | +| [JDBC](./jdbc/) | :white_check_mark: | :construction: | +| Hudi | :construction: | :construction: | +| Files (ORC) | :construction: | :construction: | diff --git a/docs/guide/sources/jdbc/index.md b/docs/guide/sources/jdbc/index.md new file mode 100644 index 0000000000..365d484cf6 --- /dev/null +++ 
b/docs/guide/sources/jdbc/index.md @@ -0,0 +1,161 @@ +--- +title: JDBC +rank: 4 +--- + +# JDBC Data Source + +Sail provides a database connector exposed under the `jdbc` format name for API parity with vanilla PySpark. +The implementation is based on the Python `connectorx` library. +No actual JDBC driver or JVM is involved. + + + +## Installation + +You need to install the `pysail` package with the `jdbc` extra to use the JDBC data source. + +```bash +pip install pysail[jdbc] +``` + +## Quick Start + +Register the data source once per Spark session. + +```python +from pysail.spark.datasource.jdbc import JdbcDataSource + +spark.dataSource.register(JdbcDataSource) +``` + +Then read from a database using the standard PySpark API. + +```python +df = ( + spark.read.format("jdbc") + .option("url", "jdbc:postgresql://localhost:5432/mydb") + .option("dbtable", "public.users") + .option("user", "alice") + .option("password", "secret") + .load() +) +``` + +Alternatively, you can use the `spark.read.jdbc()` shorthand method. + +```python +df = spark.read.jdbc( + "jdbc:postgresql://localhost:5432/mydb", + "public.users", + properties={"user": "alice", "password": "secret"}, +) +``` + +## Options + +The data source options are consistent with +the [PySpark JDBC documentation](https://spark.apache.org/docs/latest/sql-data-sources-jdbc.html). + +| Name | Required | Default | Description | +| ------------------- | -------- | ------- | ---------------------------------------------------------------------------------------------------------------- | +| `url` | Yes | | The JDBC URL in the form of `jdbc:<subprotocol>://<host>:<port>/<database>`. | +| `dbtable` | Yes | | The table name, optionally qualified (`<schema>.<table>`). This is mutually exclusive with `query`. | +| `query` | Yes | | An arbitrary SQL `SELECT` statement. This is mutually exclusive with `dbtable`. | +| `user` | No | | The database username. | +| `password` | No | | The database password. 
| +| `partitionColumn` | No | | The numeric column for range-stride partitioning. This requires `lowerBound`, `upperBound`, and `numPartitions`. | +| `lowerBound` | No | | The lower bound of partition stride (inclusive). | +| `upperBound` | No | | The upper bound of partition stride (inclusive on last partition). | +| `numPartitions` | No | `1` | The number of parallel read partitions. | +| `fetchsize` | No | `0` | An advisory rows-per-round-trip hint. | +| `pushDownPredicate` | No | `true` | Whether to push `WHERE` filters to the database. | +| `customSchema` | No | | A Spark DDL string to override inferred column types. | + +::: info +Exactly one of the `dbtable` or `query` options is required. +::: + +## Examples + +### Custom SQL Queries + +Use `query` instead of `dbtable` to run arbitrary SQL queries: + +```python +df = ( + spark.read.format("jdbc") + .option("url", "jdbc:postgresql://localhost:5432/mydb") + .option("query", "SELECT id, name FROM users WHERE active = TRUE") + .option("user", "alice") + .option("password", "secret") + .load() +) +``` + +The `query` and `partitionColumn` options are mutually exclusive. 
To partition a custom query, wrap it in `dbtable` as a +subquery: + +```python{4-5} +df = ( + spark.read.format("jdbc") + .option("url", "jdbc:postgresql://localhost:5432/mydb") + .option("dbtable", "(SELECT * FROM events WHERE type='click') AS t") + .option("partitionColumn", "user_id") + .option("user", "alice") + .option("password", "secret") + .load() +) +``` + +### Parallel Reads with Range Partitioning + +Provide `partitionColumn`, `lowerBound`, `upperBound`, and `numPartitions` together to split the read into parallel +range strides: + +```python +df = ( + spark.read.format("jdbc") + .option("url", "jdbc:postgresql://localhost:5432/mydb") + .option("dbtable", "events") + .option("partitionColumn", "id") + .option("lowerBound", "1") + .option("upperBound", "10000000") + .option("numPartitions", "8") + .option("user", "alice") + .option("password", "secret") + .load() +) +``` + +Or equivalently, you can use the `spark.read.jdbc()` method with the same options: + +```python +df = spark.read.jdbc( + "jdbc:postgresql://localhost:5432/mydb", + "events", + column="id", + lowerBound=1, + upperBound=10_000_000, + numPartitions=8, + properties={"user": "alice", "password": "secret"}, +) +``` + +### Schema Override + +Use `customSchema` to override column types after reading: + +```python +df = ( + spark.read.format("jdbc") + .option("url", "jdbc:postgresql://localhost:5432/mydb") + .option("dbtable", "orders") + .option("customSchema", "amount DECIMAL(18,2), status STRING") + .option("user", "alice") + .option("password", "secret") + .load() +) +``` + +Columns not listed in `customSchema` retain their inferred types. 
diff --git a/docs/guide/sources/python/index.md b/docs/guide/sources/python/index.md new file mode 100644 index 0000000000..c142d3a754 --- /dev/null +++ b/docs/guide/sources/python/index.md @@ -0,0 +1,29 @@ +--- +title: Python +rank: 3 +--- + +# Python Data Sources + +The Python data source allows you to extend the `SparkSession.read` and `DataFrame.write` APIs to support custom formats and external system integrations. +It optionally supports Arrow for zero-copy data exchange between the Python process and the Sail execution engine. This gives you flexibility in data source implementations without incurring performance penalties. + +You can define a Python class that inherits from the `pyspark.sql.datasource.DataSource` abstract class, and register it with the Spark session to create a custom data source that can be used in the standard PySpark API. The `DataSource` class provides methods for defining the name and schema of the data source, as well as methods for creating readers and writers. + +Currently, Sail supports Python data sources for batch reading and writing. + +## Examples + + + +### Batch Reader + +<<< @/../python/pysail/tests/spark/test_python_datasource_read.txt{python-console} + +### Batch Arrow Reader + +<<< @/../python/pysail/tests/spark/test_python_datasource_read_arrow.txt{python-console} + +### More Examples + +Please refer to the [Spark documentation](https://spark.apache.org/docs/latest/api/python/tutorial/sql/python_data_source.html) for more Python data source examples, including how to define a batch writer. We will also add more examples to this guide in the future. Stay tuned! diff --git a/docs/guide/sql/data-types/compatibility.md b/docs/guide/sql/data-types/compatibility.md index 5a9bb4be60..8c19bd255a 100644 --- a/docs/guide/sql/data-types/compatibility.md +++ b/docs/guide/sql/data-types/compatibility.md @@ -309,7 +309,7 @@ but they are still supported in Sail. You can work with these types in Python UD 4. 
For the SQL timestamp types, the optional `<p>` parameter specifies the precision of the timestamp. A number of `0`, `3`, `6`, or `9` represents second, millisecond, microsecond, or nanosecond precision respectively. The default value is `6` (microsecond precision). Note that only the microsecond precision timestamp is compatible with Spark. 5. For the SQL decimal types, the optional `<p>` and `<s>` parameters specify the precision and scale of the decimal number respectively. The default precision is `10` and the default scale is `0`. The decimal type maps to either Decimal128 or Decimal256 type in Arrow depending on the specified precision. 6. The SQL `INTERVAL` type is mapped to the Interval(MonthDayNano) Arrow type which has nanosecond precision. **CalendarIntervalType** in Spark has microsecond precision so the supported data range is different. -7. For the SQL `TIME` type, the optional `<p>` parameter specifies the precision of the time value. Spark 4.0 supports precision `0`, `3`, and `6` (second, millisecond, microsecond). The default value is `6` (microsecond precision). The TIME type is available in Spark 4.0 and later. +7. For the SQL `TIME` type, the optional `<p>` parameter specifies the precision of the time value. Spark 4.0 supports precision `0`, `3`, and `6` (second, millisecond, and microsecond). The default value is `6` (microsecond). The `TIME` type is available in Spark 4.0 and later.