diff --git a/duckdb b/duckdb index cef0f388b4d..555d9b7d149 160000 --- a/duckdb +++ b/duckdb @@ -1 +1 @@ -Subproject commit cef0f388b4d762ad90fda7e3a70305cea4e215b1 +Subproject commit 555d9b7d149f6b572cc94da5d37884806848bb24 diff --git a/extension_config.cmake b/extension_config.cmake index 12a109cb165..e04a260efe4 100644 --- a/extension_config.cmake +++ b/extension_config.cmake @@ -2,7 +2,7 @@ # Extension from this repo duckdb_extension_load(ducklake - SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR} ) duckdb_extension_load(icu) diff --git a/src/common/ducklake_types.cpp b/src/common/ducklake_types.cpp index fabbfb7d21b..903af749695 100644 --- a/src/common/ducklake_types.cpp +++ b/src/common/ducklake_types.cpp @@ -42,19 +42,27 @@ static constexpr const ducklake_type_array DUCKLAKE_TYPES {{{"boolean", LogicalT {"blob", LogicalTypeId::BLOB}, {"uuid", LogicalTypeId::UUID}}}; -LogicalType ParseBaseType(const string &str) { +static LogicalType ParseBaseType(const string &str) { for (auto &ducklake_type : DUCKLAKE_TYPES) { if (StringUtil::CIEquals(str, ducklake_type.name)) { return ducklake_type.id; } } + if (StringUtil::CIEquals(str, "json")) { return LogicalType::JSON(); } + + if (StringUtil::CIEquals(str, "geometry")) { + LogicalType geo_type(LogicalTypeId::BLOB); + geo_type.SetAlias("GEOMETRY"); + return geo_type; + } + throw InvalidInputException("Failed to parse DuckLake type - unsupported type '%s'", str); } -string ToStringBaseType(const LogicalType &type) { +static string ToStringBaseType(const LogicalType &type) { for (auto &ducklake_type : DUCKLAKE_TYPES) { if (type.id() == ducklake_type.id) { return ducklake_type.name; @@ -83,6 +91,9 @@ string DuckLakeTypes::ToString(const LogicalType &type) { if (type.IsJSONType()) { return "json"; } + if (type.GetAlias() == "GEOMETRY" && type.id() == LogicalTypeId::BLOB) { + return "geometry"; + } throw InvalidInputException("Unsupported user-defined type"); } switch (type.id()) { diff --git a/src/common/ducklake_util.cpp b/src/common/ducklake_util.cpp index 0ef19ad6e8b..7afaab5b24c 100644 --- a/src/common/ducklake_util.cpp +++ b/src/common/ducklake_util.cpp @@ -89,11 +89,16 @@ string DuckLakeUtil::StatsToString(const string &text) { return DuckLakeUtil::SQLLiteralToString(text); } -string DuckLakeUtil::ValueToSQL(const Value &val) { +string DuckLakeUtil::ValueToSQL(ClientContext &context, const Value &val) { // FIXME: this should be upstreamed if (val.IsNull()) { return val.ToSQLString(); } + if (val.type().HasAlias()) { + // extension type: cast to string + auto str_val = val.CastAs(context, LogicalType::VARCHAR); + return DuckLakeUtil::ValueToSQL(context, str_val); + } switch (val.type().id()) { case LogicalTypeId::VARCHAR: { auto &str_val = StringValue::Get(val); diff --git a/src/functions/ducklake_add_data_files.cpp b/src/functions/ducklake_add_data_files.cpp index e6f3ff1110d..3a21c801dfc 100644 --- a/src/functions/ducklake_add_data_files.cpp +++ b/src/functions/ducklake_add_data_files.cpp @@ -69,7 +69,8 @@ struct DuckLakeAddDataFilesState : public GlobalTableFunctionState { bool finished = false; }; -unique_ptr DuckLakeAddDataFilesInit(ClientContext &context, TableFunctionInitInput &input) { +static unique_ptr DuckLakeAddDataFilesInit(ClientContext &context, + TableFunctionInitInput &input) { return make_uniq(); } @@ -241,7 +242,7 @@ Value DuckLakeFileProcessor::GetStatsValue(string name, Value val) { void DuckLakeFileProcessor::ReadParquetStats(const string &glob) { auto result = 
transaction.Query(StringUtil::Format(R"( -SELECT file_name, column_id, num_values, coalesce(stats_min, stats_min_value), coalesce(stats_max, stats_max_value), stats_null_count, total_compressed_size +SELECT file_name, column_id, num_values, coalesce(stats_min, stats_min_value), coalesce(stats_max, stats_max_value), stats_null_count, total_compressed_size, geo_bbox, geo_types FROM parquet_metadata(%s) )", SQLString(glob))); @@ -280,6 +281,27 @@ FROM parquet_metadata(%s) if (!row.IsNull(6)) { stats.column_stats.push_back(GetStatsValue("column_size_bytes", row.GetValue(6))); } + if (!row.IsNull(7)) { + // Split the bbox struct into individual entries + auto bbox_value = row.iterator.chunk->GetValue(7, row.row); + auto &bbox_type = bbox_value.type(); + + auto &bbox_child_types = StructType::GetChildTypes(bbox_type); + auto &bbox_child_values = StructValue::GetChildren(bbox_value); + + for (idx_t child_idx = 0; child_idx < bbox_child_types.size(); child_idx++) { + auto &name = bbox_child_types[child_idx].first; + auto &value = bbox_child_values[child_idx]; + if (!value.IsNull()) { + stats.column_stats.push_back(GetStatsValue("bbox_" + name, value)); + } + } + } + if (!row.IsNull(8)) { + auto list_value = row.iterator.chunk->GetValue(8, row.row); + stats.column_stats.push_back(GetStatsValue("geo_types", std::move(list_value))); + } + column.column_stats.push_back(std::move(stats)); } } @@ -373,6 +395,10 @@ LogicalType DuckLakeParquetTypeChecker::DeriveLogicalType(const ParquetColumn &s return LogicalType::TIMESTAMP_NS; } else if (StringUtil::StartsWith(s_ele.logical_type, "TimestampType(isAdjustedToUTC=1")) { return LogicalType::TIMESTAMP_TZ; + } else if (StringUtil::StartsWith(s_ele.logical_type, "Geometry")) { + LogicalType geo_type(LogicalTypeId::BLOB); + geo_type.SetAlias("GEOMETRY"); + return geo_type; } } if (!s_ele.converted_type.empty()) { @@ -440,7 +466,7 @@ LogicalType DuckLakeParquetTypeChecker::DeriveLogicalType(const ParquetColumn &s throw InvalidInputException("Unrecognized type %s for parquet file", s_ele.type); } -string FormatExpectedError(const vector &expected) { +static string FormatExpectedError(const vector &expected) { string error; for (auto &type : expected) { if (!error.empty()) { @@ -695,7 +721,7 @@ unique_ptr DuckLakeFileProcessor::MapColumn(ParquetFileMet return map_entry; } -bool SupportsHivePartitioning(const LogicalType &type) { +static bool SupportsHivePartitioning(const LogicalType &type) { if (type.IsNested()) { return false; } @@ -830,7 +856,7 @@ vector DuckLakeFileProcessor::AddFiles(const vector &g return written_files; } -void DuckLakeAddDataFilesExecute(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { +static void DuckLakeAddDataFilesExecute(ClientContext &context, TableFunctionInput &data_p, DataChunk &output) { auto &state = data_p.global_state->Cast(); auto &bind_data = data_p.bind_data->Cast(); auto &transaction = DuckLakeTransaction::Get(context, bind_data.catalog); diff --git a/src/functions/ducklake_compaction_functions.cpp b/src/functions/ducklake_compaction_functions.cpp index 4c9edf4b6c0..21e5a84f92e 100644 --- a/src/functions/ducklake_compaction_functions.cpp +++ b/src/functions/ducklake_compaction_functions.cpp @@ -407,6 +407,7 @@ DuckLakeCompactor::GenerateCompactionCommand(vector auto ducklake_scan = make_uniq(table_idx, std::move(scan_function), std::move(bind_data), copy_options.expected_types, copy_options.names, std::move(virtual_columns)); + auto &column_ids = ducklake_scan->GetMutableColumnIds(); for (idx_t i = 
0; i < columns.PhysicalColumnCount(); i++) { column_ids.emplace_back(i); @@ -416,6 +417,16 @@ DuckLakeCompactor::GenerateCompactionCommand(vector } column_ids.emplace_back(DuckLakeMultiFileReader::COLUMN_IDENTIFIER_SNAPSHOT_ID); + // Resolve types so we can check if we need casts + ducklake_scan->ResolveOperatorTypes(); + + // Insert a cast projection if necessary + auto root = unique_ptr_cast(std::move(ducklake_scan)); + + if (DuckLakeInsert::RequireCasts(root->types)) { + root = DuckLakeInsert::InsertCasts(binder, root); + } + // generate the LogicalCopyToFile auto copy = make_uniq(std::move(copy_options.copy_function), std::move(copy_options.bind_data), std::move(copy_options.info)); @@ -440,8 +451,7 @@ DuckLakeCompactor::GenerateCompactionCommand(vector copy->preserve_order = PreserveOrderType::PRESERVE_ORDER; copy->file_size_bytes = optional_idx(); copy->rotate = false; - - copy->children.push_back(std::move(ducklake_scan)); + copy->children.push_back(std::move(root)); optional_idx target_row_id_start; if (files_are_adjacent) { @@ -459,8 +469,8 @@ DuckLakeCompactor::GenerateCompactionCommand(vector //===--------------------------------------------------------------------===// // Function //===--------------------------------------------------------------------===// -unique_ptr GenerateCompactionOperator(TableFunctionBindInput &input, idx_t bind_index, - vector> &compactions) { +static unique_ptr GenerateCompactionOperator(TableFunctionBindInput &input, idx_t bind_index, + vector> &compactions) { if (compactions.empty()) { // nothing to compact - generate an empty result vector bindings; @@ -478,9 +488,10 @@ unique_ptr GenerateCompactionOperator(TableFunctionBindInput &i return union_op; } -void GenerateCompaction(ClientContext &context, DuckLakeTransaction &transaction, DuckLakeCatalog &ducklake_catalog, - TableFunctionBindInput &input, DuckLakeTableEntry &cur_table, CompactionType type, - double delete_threshold, vector> &compactions) { +static void GenerateCompaction(ClientContext &context, DuckLakeTransaction &transaction, + DuckLakeCatalog &ducklake_catalog, TableFunctionBindInput &input, + DuckLakeTableEntry &cur_table, CompactionType type, double delete_threshold, + vector> &compactions) { switch (type) { case CompactionType::MERGE_ADJACENT_TABLES: { DuckLakeCompactor compactor(context, ducklake_catalog, transaction, *input.binder, cur_table.GetTableId()); @@ -559,8 +570,8 @@ unique_ptr BindCompaction(ClientContext &context, TableFunction return GenerateCompactionOperator(input, bind_index, compactions); } -unique_ptr MergeAdjacentFilesBind(ClientContext &context, TableFunctionBindInput &input, - idx_t bind_index, vector &return_names) { +static unique_ptr MergeAdjacentFilesBind(ClientContext &context, TableFunctionBindInput &input, + idx_t bind_index, vector &return_names) { return_names.push_back("Success"); return BindCompaction(context, input, bind_index, CompactionType::MERGE_ADJACENT_TABLES); } @@ -579,8 +590,8 @@ TableFunctionSet DuckLakeMergeAdjacentFilesFunction::GetFunctions() { return set; } -unique_ptr RewriteFilesBind(ClientContext &context, TableFunctionBindInput &input, idx_t bind_index, - vector &return_names) { +static unique_ptr RewriteFilesBind(ClientContext &context, TableFunctionBindInput &input, + idx_t bind_index, vector &return_names) { return_names.push_back("Success"); return BindCompaction(context, input, bind_index, CompactionType::REWRITE_DELETES); diff --git a/src/functions/ducklake_flush_inlined_data.cpp 
b/src/functions/ducklake_flush_inlined_data.cpp index 5affefc73c4..5d5fc1f5f7a 100644 --- a/src/functions/ducklake_flush_inlined_data.cpp +++ b/src/functions/ducklake_flush_inlined_data.cpp @@ -184,8 +184,7 @@ unique_ptr DuckLakeDataFlusher::GenerateFlushCommand() { column_ids.emplace_back(COLUMN_IDENTIFIER_ROW_ID); column_ids.emplace_back(DuckLakeMultiFileReader::COLUMN_IDENTIFIER_SNAPSHOT_ID); - unique_ptr root; - root = std::move(ducklake_scan); + auto root = unique_ptr_cast(std::move(ducklake_scan)); if (!copy_options.projection_list.empty()) { // push a projection @@ -194,6 +193,12 @@ unique_ptr DuckLakeDataFlusher::GenerateFlushCommand() { root = std::move(proj); } + // Add another projection with casts if necessary + root->ResolveOperatorTypes(); + if (DuckLakeInsert::RequireCasts(root->types)) { + root = DuckLakeInsert::InsertCasts(binder, root); + } + // generate the LogicalCopyToFile auto copy = make_uniq(std::move(copy_options.copy_function), std::move(copy_options.bind_data), std::move(copy_options.info)); @@ -227,8 +232,8 @@ unique_ptr DuckLakeDataFlusher::GenerateFlushCommand() { //===--------------------------------------------------------------------===// // Function //===--------------------------------------------------------------------===// -unique_ptr FlushInlinedDataBind(ClientContext &context, TableFunctionBindInput &input, - idx_t bind_index, vector &return_names) { +static unique_ptr FlushInlinedDataBind(ClientContext &context, TableFunctionBindInput &input, + idx_t bind_index, vector &return_names) { // gather a list of files to compact auto &catalog = BaseMetadataFunction::GetCatalog(context, input.inputs[0]); auto &ducklake_catalog = catalog.Cast(); diff --git a/src/include/common/ducklake_util.hpp b/src/include/common/ducklake_util.hpp index f163de5b4dc..9a03ac500c9 100644 --- a/src/include/common/ducklake_util.hpp +++ b/src/include/common/ducklake_util.hpp @@ -28,7 +28,7 @@ class DuckLakeUtil { static string SQLIdentifierToString(const string &text); static string SQLLiteralToString(const string &text); static string StatsToString(const string &text); - static string ValueToSQL(const Value &val); + static string ValueToSQL(ClientContext &context, const Value &val); static ParsedCatalogEntry ParseCatalogEntry(const string &input); static string JoinPath(FileSystem &fs, const string &a, const string &b); diff --git a/src/include/storage/ducklake_insert.hpp b/src/include/storage/ducklake_insert.hpp index e256e38841b..3b17fc74830 100644 --- a/src/include/storage/ducklake_insert.hpp +++ b/src/include/storage/ducklake_insert.hpp @@ -69,6 +69,11 @@ class DuckLakeInsert : public PhysicalOperator { return true; } + static bool RequireCasts(const vector &types); + static void InsertCasts(const vector &types, ClientContext &context, PhysicalPlanGenerator &planner, + optional_ptr &plan); + static unique_ptr InsertCasts(Binder &binder, unique_ptr &plan); + static DuckLakeColumnStats ParseColumnStats(const LogicalType &type, const vector &stats); static DuckLakeCopyOptions GetCopyOptions(ClientContext &context, DuckLakeCopyInput ©_input); static PhysicalOperator &PlanCopyForInsert(ClientContext &context, PhysicalPlanGenerator &planner, diff --git a/src/include/storage/ducklake_metadata_info.hpp b/src/include/storage/ducklake_metadata_info.hpp index 4aa3d180b74..69d3a6c0580 100644 --- a/src/include/storage/ducklake_metadata_info.hpp +++ b/src/include/storage/ducklake_metadata_info.hpp @@ -99,6 +99,7 @@ struct DuckLakeColumnStatsInfo { string min_val; string max_val; string 
contains_nan; + string extra_stats; }; struct DuckLakeFilePartitionInfo { @@ -195,6 +196,9 @@ struct DuckLakeGlobalColumnStatsInfo { string max_val; bool has_max = false; + + string extra_stats; + bool has_extra_stats = false; }; struct DuckLakeGlobalStatsInfo { diff --git a/src/include/storage/ducklake_stats.hpp b/src/include/storage/ducklake_stats.hpp index a2827c9198f..e839abe33c1 100644 --- a/src/include/storage/ducklake_stats.hpp +++ b/src/include/storage/ducklake_stats.hpp @@ -16,10 +16,56 @@ namespace duckdb { class BaseStatistics; +struct DuckLakeColumnExtraStats { + virtual ~DuckLakeColumnExtraStats() = default; + + virtual void Merge(const DuckLakeColumnExtraStats &new_stats) = 0; + virtual unique_ptr Copy() const = 0; + + // Convert the stats into a string representation for storage (e.g. JSON) + virtual string Serialize() const = 0; + // Parse the stats from a string + virtual void Deserialize(const string &stats) = 0; + + template + TARGET &Cast() { + DynamicCastCheck(this); + return reinterpret_cast(*this); + } + template + const TARGET &Cast() const { + DynamicCastCheck(this); + return reinterpret_cast(*this); + } +}; + +struct DuckLakeColumnGeoStats final : public DuckLakeColumnExtraStats { + + DuckLakeColumnGeoStats(); + void Merge(const DuckLakeColumnExtraStats &new_stats) override; + unique_ptr Copy() const override; + + string Serialize() const override; + void Deserialize(const string &stats) override; + +public: + double xmin, xmax, ymin, ymax, zmin, zmax, mmin, mmax; + set geo_types; +}; + struct DuckLakeColumnStats { explicit DuckLakeColumnStats(LogicalType type_p) : type(std::move(type_p)) { + if (type.id() == LogicalTypeId::BLOB && type.HasAlias() && type.GetAlias() == "GEOMETRY") { + extra_stats = make_uniq(); + } } + // Copy constructor + DuckLakeColumnStats(const DuckLakeColumnStats &other); + DuckLakeColumnStats &operator=(const DuckLakeColumnStats &other); + DuckLakeColumnStats(DuckLakeColumnStats &&other) noexcept = default; + DuckLakeColumnStats &operator=(DuckLakeColumnStats &&other) noexcept = default; + LogicalType type; string min; string max; @@ -32,9 +78,12 @@ struct DuckLakeColumnStats { bool any_valid = true; bool has_contains_nan = false; + unique_ptr extra_stats; + public: unique_ptr ToStats() const; void MergeStats(const DuckLakeColumnStats &new_stats); + DuckLakeColumnStats Copy() const; private: unique_ptr CreateNumericStats() const; diff --git a/src/storage/ducklake_catalog.cpp b/src/storage/ducklake_catalog.cpp index 3f7993a1354..407fe922890 100644 --- a/src/storage/ducklake_catalog.cpp +++ b/src/storage/ducklake_catalog.cpp @@ -63,7 +63,7 @@ void DuckLakeCatalog::FinalizeLoad(optional_ptr context) { initialized = true; } -bool CanGeneratePathFromName(const string &name) { +static bool CanGeneratePathFromName(const string &name) { for (auto c : name) { if (StringUtil::CharacterIsAlphaNumeric(c)) { continue; @@ -178,7 +178,7 @@ DuckLakeCatalogSet &DuckLakeCatalog::GetSchemaForSnapshot(DuckLakeTransaction &t return result; } -unique_ptr TransformColumnType(DuckLakeColumnInfo &col) { +static unique_ptr TransformColumnType(DuckLakeColumnInfo &col) { DuckLakeColumnData col_data; col_data.id = col.id; if (col.children.empty()) { @@ -367,7 +367,7 @@ DuckLakeStats &DuckLakeCatalog::GetStatsForSnapshot(DuckLakeTransaction &transac return result; } -unique_ptr ConvertNameMap(DuckLakeColumnMappingInfo column_mapping) { +static unique_ptr ConvertNameMap(DuckLakeColumnMappingInfo column_mapping) { if (column_mapping.map_type != "map_by_name") { throw 
InvalidInputException("Unsupported column mapping type \"%s\"", column_mapping.map_type); } @@ -485,6 +485,11 @@ unique_ptr DuckLakeCatalog::LoadStatsForSnapshot(DuckLakeTransact if (column_stats.has_max) { column_stats.max = col_stats.max_val; } + if (col_stats.has_extra_stats && column_stats.extra_stats) { + // The extra_stats should already be allocated in the constructor + // if the logical type requires extra stats. + column_stats.extra_stats->Deserialize(col_stats.extra_stats); + } table_stats->column_stats.insert(make_pair(col_stats.column_id, std::move(column_stats))); } lake_stats->table_stats.insert(make_pair(stats.table_id, std::move(table_stats))); diff --git a/src/storage/ducklake_field_data.cpp b/src/storage/ducklake_field_data.cpp index 6c666c6b8a6..6155e63bcf0 100644 --- a/src/storage/ducklake_field_data.cpp +++ b/src/storage/ducklake_field_data.cpp @@ -40,7 +40,7 @@ DuckLakeFieldId::DuckLakeFieldId(DuckLakeColumnData column_data_p, string name_p } } -Value ExtractDefaultValue(optional_ptr default_expr, const LogicalType &type) { +static Value ExtractDefaultValue(optional_ptr default_expr, const LogicalType &type) { if (!default_expr) { return Value(type); } diff --git a/src/storage/ducklake_inline_data.cpp b/src/storage/ducklake_inline_data.cpp index 417bb4a6791..63a5058f050 100644 --- a/src/storage/ducklake_inline_data.cpp +++ b/src/storage/ducklake_inline_data.cpp @@ -1,4 +1,6 @@ #include "storage/ducklake_inline_data.hpp" + +#include "duckdb/common/type_visitor.hpp" #include "storage/ducklake_insert.hpp" #include "storage/ducklake_table_entry.hpp" #include "storage/ducklake_transaction.hpp" @@ -10,6 +12,14 @@ DuckLakeInlineData::DuckLakeInlineData(PhysicalPlan &physical_plan, PhysicalOper : PhysicalOperator(physical_plan, PhysicalOperatorType::EXTENSION, child.types, child.estimated_cardinality), inline_row_limit(inline_row_limit) { children.push_back(child); + + for (const auto &type : types) { + if (TypeVisitor::Contains(type, [&](const LogicalType &typ) { + return type.id() == LogicalTypeId::BLOB && type.HasAlias() && type.GetAlias() == "GEOMETRY"; + })) { + throw NotImplementedException("DuckLake does not yet support data-inlining of GEOMETRY columns"); + }; + } } enum class InlinePhase { INLINING_ROWS, EMITTING_PREVIOUSLY_INLINED_ROWS, PASS_THROUGH_ROWS }; diff --git a/src/storage/ducklake_insert.cpp b/src/storage/ducklake_insert.cpp index 37185e84876..64db37d0ea3 100644 --- a/src/storage/ducklake_insert.cpp +++ b/src/storage/ducklake_insert.cpp @@ -20,6 +20,7 @@ #include "duckdb/common/multi_file/multi_file_reader.hpp" #include "duckdb/main/extension_helper.hpp" #include "duckdb/function/function_binder.hpp" +#include "duckdb/planner/operator/logical_projection.hpp" namespace duckdb { @@ -68,24 +69,53 @@ DuckLakeColumnStats DuckLakeInsert::ParseColumnStats(const LogicalType &type, co for (idx_t stats_idx = 0; stats_idx < col_stats.size(); stats_idx++) { auto &stats_children = StructValue::GetChildren(col_stats[stats_idx]); auto &stats_name = StringValue::Get(stats_children[0]); - auto &stats_value = StringValue::Get(stats_children[1]); if (stats_name == "min") { D_ASSERT(!column_stats.has_min); - column_stats.min = stats_value; + column_stats.min = StringValue::Get(stats_children[1]); column_stats.has_min = true; } else if (stats_name == "max") { D_ASSERT(!column_stats.has_max); - column_stats.max = stats_value; + column_stats.max = StringValue::Get(stats_children[1]); column_stats.has_max = true; } else if (stats_name == "null_count") { 
D_ASSERT(!column_stats.has_null_count); column_stats.has_null_count = true; - column_stats.null_count = StringUtil::ToUnsigned(stats_value); + column_stats.null_count = StringUtil::ToUnsigned(StringValue::Get(stats_children[1])); } else if (stats_name == "column_size_bytes") { - column_stats.column_size_bytes = StringUtil::ToUnsigned(stats_value); + column_stats.column_size_bytes = StringUtil::ToUnsigned(StringValue::Get(stats_children[1])); } else if (stats_name == "has_nan") { column_stats.has_contains_nan = true; - column_stats.contains_nan = stats_value == "true"; + column_stats.contains_nan = StringValue::Get(stats_children[1]) == "true"; + } else if (stats_name == "bbox_xmax") { + auto &geo_stats = column_stats.extra_stats->Cast(); + geo_stats.xmax = stats_children[1].DefaultCastAs(LogicalType::DOUBLE).GetValue(); + } else if (stats_name == "bbox_xmin") { + auto &geo_stats = column_stats.extra_stats->Cast(); + geo_stats.xmin = stats_children[1].DefaultCastAs(LogicalType::DOUBLE).GetValue(); + } else if (stats_name == "bbox_ymax") { + auto &geo_stats = column_stats.extra_stats->Cast(); + geo_stats.ymax = stats_children[1].DefaultCastAs(LogicalType::DOUBLE).GetValue(); + } else if (stats_name == "bbox_ymin") { + auto &geo_stats = column_stats.extra_stats->Cast(); + geo_stats.ymin = stats_children[1].DefaultCastAs(LogicalType::DOUBLE).GetValue(); + } else if (stats_name == "bbox_zmax") { + auto &geo_stats = column_stats.extra_stats->Cast(); + geo_stats.zmax = stats_children[1].DefaultCastAs(LogicalType::DOUBLE).GetValue(); + } else if (stats_name == "bbox_zmin") { + auto &geo_stats = column_stats.extra_stats->Cast(); + geo_stats.zmin = stats_children[1].DefaultCastAs(LogicalType::DOUBLE).GetValue(); + } else if (stats_name == "bbox_mmax") { + auto &geo_stats = column_stats.extra_stats->Cast(); + geo_stats.mmax = stats_children[1].DefaultCastAs(LogicalType::DOUBLE).GetValue(); + } else if (stats_name == "bbox_mmin") { + auto &geo_stats = column_stats.extra_stats->Cast(); + geo_stats.mmin = stats_children[1].DefaultCastAs(LogicalType::DOUBLE).GetValue(); + } else if (stats_name == "geo_types") { + auto &geo_stats = column_stats.extra_stats->Cast(); + auto list_value = stats_children[1].DefaultCastAs(LogicalType::LIST(LogicalType::VARCHAR)); + for (const auto &child : ListValue::GetChildren(list_value)) { + geo_stats.geo_types.insert(StringValue::Get(child)); + } } else { throw NotImplementedException("Unsupported stats type \"%s\" in DuckLakeInsert::Sink()", stats_name); } @@ -237,7 +267,7 @@ CopyFunctionCatalogEntry &DuckLakeFunctions::GetCopyFunction(ClientContext &cont return *entry; } -Value GetFieldIdValue(const DuckLakeFieldId &field_id) { +static Value GetFieldIdValue(const DuckLakeFieldId &field_id) { auto field_id_value = Value::BIGINT(NumericCast(field_id.GetFieldIndex().index)); if (!field_id.HasChildren()) { // primitive type - return the field-id directly @@ -252,17 +282,17 @@ Value GetFieldIdValue(const DuckLakeFieldId &field_id) { return Value::STRUCT(std::move(values)); } -bool WriteRowId(InsertVirtualColumns virtual_columns) { +static bool WriteRowId(InsertVirtualColumns virtual_columns) { return virtual_columns == InsertVirtualColumns::WRITE_ROW_ID || virtual_columns == InsertVirtualColumns::WRITE_ROW_ID_AND_SNAPSHOT_ID; } -bool WriteSnapshotId(InsertVirtualColumns virtual_columns) { +static bool WriteSnapshotId(InsertVirtualColumns virtual_columns) { return virtual_columns == InsertVirtualColumns::WRITE_SNAPSHOT_ID || virtual_columns == 
InsertVirtualColumns::WRITE_ROW_ID_AND_SNAPSHOT_ID; } -Value WrittenFieldIds(DuckLakeFieldData &field_data, InsertVirtualColumns virtual_columns) { +static Value WrittenFieldIds(DuckLakeFieldData &field_data, InsertVirtualColumns virtual_columns) { child_list_t values; for (idx_t c_idx = 0; c_idx < field_data.GetColumnCount(); c_idx++) { auto &field_id = field_data.GetByRootIndex(PhysicalIndex(c_idx)); @@ -303,7 +333,7 @@ DuckLakeCopyInput::DuckLakeCopyInput(ClientContext &context, DuckLakeSchemaEntry encryption_key = catalog.GenerateEncryptionKey(context); } -void StripTrailingSeparator(FileSystem &fs, string &path) { +static void StripTrailingSeparator(FileSystem &fs, string &path) { auto sep = fs.PathSeparator(path); if (!StringUtil::EndsWith(path, sep)) { return; @@ -311,7 +341,8 @@ void StripTrailingSeparator(FileSystem &fs, string &path) { path = path.substr(0, path.size() - sep.size()); } -const DuckLakeFieldId &GetTopLevelColumn(DuckLakeCopyInput ©_input, FieldIndex field_id, optional_idx &index) { +static const DuckLakeFieldId &GetTopLevelColumn(DuckLakeCopyInput ©_input, FieldIndex field_id, + optional_idx &index) { if (!copy_input.field_data) { throw InvalidInputException("Partitioning requires field ids"); } @@ -329,8 +360,8 @@ const DuckLakeFieldId &GetTopLevelColumn(DuckLakeCopyInput ©_input, FieldInd throw InvalidInputException("Partitioning is only supported on top-level columns"); } -unique_ptr CreateColumnReference(DuckLakeCopyInput ©_input, const LogicalType &type, - idx_t column_index) { +static unique_ptr CreateColumnReference(DuckLakeCopyInput ©_input, const LogicalType &type, + idx_t column_index) { if (copy_input.get_table_index.IsValid()) { // logical plan generation: generate a bound column ref ColumnBinding column_binding(copy_input.get_table_index.GetIndex(), column_index); @@ -340,14 +371,14 @@ unique_ptr CreateColumnReference(DuckLakeCopyInput ©_input, cons return make_uniq(type, column_index); } -unique_ptr GetColumnReference(DuckLakeCopyInput ©_input, FieldIndex field_id) { +static unique_ptr GetColumnReference(DuckLakeCopyInput ©_input, FieldIndex field_id) { optional_idx index; auto &column_field_id = GetTopLevelColumn(copy_input, field_id, index); return CreateColumnReference(copy_input, column_field_id.Type(), index.GetIndex()); } -unique_ptr GetFunction(ClientContext &context, DuckLakeCopyInput ©_input, const string &function_name, - FieldIndex field_id) { +static unique_ptr GetFunction(ClientContext &context, DuckLakeCopyInput ©_input, + const string &function_name, FieldIndex field_id) { vector> children; children.push_back(GetColumnReference(copy_input, field_id)); @@ -360,8 +391,8 @@ unique_ptr GetFunction(ClientContext &context, DuckLakeCopyInput &co return function; } -unique_ptr GetPartitionExpression(ClientContext &context, DuckLakeCopyInput ©_input, - const DuckLakePartitionField &field) { +static unique_ptr GetPartitionExpression(ClientContext &context, DuckLakeCopyInput ©_input, + const DuckLakePartitionField &field) { switch (field.transform.type) { case DuckLakeTransformType::IDENTITY: return GetColumnReference(copy_input, field.field_id); @@ -378,8 +409,8 @@ unique_ptr GetPartitionExpression(ClientContext &context, DuckLakeCo } } -string GetPartitionExpressionName(DuckLakeCopyInput ©_input, const DuckLakePartitionField &field, - case_insensitive_set_t &names) { +static string GetPartitionExpressionName(DuckLakeCopyInput ©_input, const DuckLakePartitionField &field, + case_insensitive_set_t &names) { auto field_id = 
copy_input.field_data->GetByFieldIndex(field.field_id); string prefix; switch (field.transform.type) { @@ -407,8 +438,8 @@ string GetPartitionExpressionName(DuckLakeCopyInput ©_input, const DuckLakeP return prefix + "_" + field_id->Name(); } -void GeneratePartitionExpressions(ClientContext &context, DuckLakeCopyInput ©_input, - DuckLakeCopyOptions ©_options) { +static void GeneratePartitionExpressions(ClientContext &context, DuckLakeCopyInput ©_input, + DuckLakeCopyOptions ©_options) { bool all_identity = true; for (auto &field : copy_input.partition_data->fields) { if (field.transform.type != DuckLakeTransformType::IDENTITY) { @@ -545,7 +576,19 @@ DuckLakeCopyOptions DuckLakeInsert::GetCopyOptions(ClientContext &context, DuckL types_to_write.push_back(LogicalType::BIGINT); } - auto function_data = copy_fun.function.copy_to_bind(context, bind_input, names_to_write, types_to_write); + vector casted_types; + for (const auto &type : types_to_write) { + if (type.id() == LogicalTypeId::BLOB && type.HasAlias() && type.GetAlias() == "GEOMETRY") { + // we write GEOMETRY as WKB_BLOB + LogicalType wkb_type(LogicalTypeId::BLOB); + wkb_type.SetAlias("WKB_BLOB"); + casted_types.push_back(wkb_type); + } else { + casted_types.push_back(type); + } + } + + auto function_data = copy_fun.function.copy_to_bind(context, bind_input, names_to_write, casted_types); DuckLakeCopyOptions result(std::move(info), copy_fun.function); result.bind_data = std::move(function_data); @@ -581,19 +624,73 @@ DuckLakeCopyOptions DuckLakeInsert::GetCopyOptions(ClientContext &context, DuckL return result; } -void GenerateProjection(ClientContext &context, PhysicalPlanGenerator &planner, DuckLakeCopyOptions ©_options, - optional_ptr &plan) { +static void GenerateProjection(ClientContext &context, PhysicalPlanGenerator &planner, + vector> &expressions, optional_ptr &plan) { // push the projection vector types; - for (auto &expr : copy_options.projection_list) { + for (auto &expr : expressions) { types.push_back(expr->return_type); } - auto &proj = planner.Make(std::move(types), std::move(copy_options.projection_list), - plan->estimated_cardinality); + auto &proj = + planner.Make(std::move(types), std::move(expressions), plan->estimated_cardinality); proj.children.push_back(*plan); plan = proj; } +bool DuckLakeInsert::RequireCasts(const vector &types) { + for (auto &expected_type : types) { + if (expected_type.id() == LogicalTypeId::BLOB && expected_type.HasAlias() && + expected_type.GetAlias() == "GEOMETRY") { + return true; + } + } + return false; +} + +void DuckLakeInsert::InsertCasts(const vector &types, ClientContext &context, + PhysicalPlanGenerator &planner, optional_ptr &plan) { + vector> expressions; + idx_t col_idx = 0; + for (auto &expected_type : types) { + auto expr = make_uniq(expected_type, col_idx++); + if (expected_type.id() == LogicalTypeId::BLOB && expected_type.HasAlias() && + expected_type.GetAlias() == "GEOMETRY") { + LogicalType wkb_type(LogicalTypeId::BLOB); + wkb_type.SetAlias("WKB_BLOB"); + expressions.push_back(BoundCastExpression::AddCastToType(context, std::move(expr), wkb_type)); + } else { + expressions.push_back(std::move(expr)); + } + } + GenerateProjection(context, planner, expressions, plan); +} + +unique_ptr DuckLakeInsert::InsertCasts(Binder &binder, unique_ptr &plan) { + vector> cast_expressions; + + auto &types = plan->types; + auto bindings = plan->GetColumnBindings(); + + for (idx_t col_idx = 0; col_idx < types.size(); col_idx++) { + auto &type = types[col_idx]; + auto &binding = 
bindings[col_idx]; + auto ref_expr = make_uniq(type, binding); + if (type.id() == LogicalTypeId::BLOB && type.HasAlias() && type.GetAlias() == "GEOMETRY") { + LogicalType wkb_type(LogicalTypeId::BLOB); + wkb_type.SetAlias("WKB_BLOB"); + cast_expressions.push_back( + BoundCastExpression::AddCastToType(binder.context, std::move(ref_expr), wkb_type)); + } else { + cast_expressions.push_back(std::move(ref_expr)); + } + } + + auto result = make_uniq(binder.GenerateTableIndex(), std::move(cast_expressions)); + result->children.push_back(std::move(plan)); + + return result; +} + PhysicalOperator &DuckLakeInsert::PlanCopyForInsert(ClientContext &context, PhysicalPlanGenerator &planner, DuckLakeCopyInput ©_input, optional_ptr plan) { @@ -602,8 +699,16 @@ PhysicalOperator &DuckLakeInsert::PlanCopyForInsert(ClientContext &context, Phys if (!copy_options.projection_list.empty()) { // generate a projection - GenerateProjection(context, planner, copy_options, plan); + GenerateProjection(context, planner, copy_options.projection_list, plan); } + + if (RequireCasts(copy_options.expected_types)) { + // Insert a cast projection + InsertCasts(copy_options.expected_types, context, planner, plan); + // Update the expected types to match the casted types + copy_options.expected_types = plan->types; + } + auto copy_return_types = GetCopyFunctionReturnLogicalTypes(CopyFunctionReturnType::WRITTEN_FILE_STATISTICS); auto &physical_copy = planner .Make(copy_return_types, std::move(copy_options.copy_function), diff --git a/src/storage/ducklake_metadata_manager.cpp b/src/storage/ducklake_metadata_manager.cpp index 38565fa6136..6e4785d98a4 100644 --- a/src/storage/ducklake_metadata_manager.cpp +++ b/src/storage/ducklake_metadata_manager.cpp @@ -58,11 +58,11 @@ CREATE TABLE {METADATA_CATALOG}.ducklake_view(view_id BIGINT, view_uuid UUID, be CREATE TABLE {METADATA_CATALOG}.ducklake_tag(object_id BIGINT, begin_snapshot BIGINT, end_snapshot BIGINT, key VARCHAR, value VARCHAR); CREATE TABLE {METADATA_CATALOG}.ducklake_column_tag(table_id BIGINT, column_id BIGINT, begin_snapshot BIGINT, end_snapshot BIGINT, key VARCHAR, value VARCHAR); CREATE TABLE {METADATA_CATALOG}.ducklake_data_file(data_file_id BIGINT PRIMARY KEY, table_id BIGINT, begin_snapshot BIGINT, end_snapshot BIGINT, file_order BIGINT, path VARCHAR, path_is_relative BOOLEAN, file_format VARCHAR, record_count BIGINT, file_size_bytes BIGINT, footer_size BIGINT, row_id_start BIGINT, partition_id BIGINT, encryption_key VARCHAR, partial_file_info VARCHAR, mapping_id BIGINT); -CREATE TABLE {METADATA_CATALOG}.ducklake_file_column_stats(data_file_id BIGINT, table_id BIGINT, column_id BIGINT, column_size_bytes BIGINT, value_count BIGINT, null_count BIGINT, min_value VARCHAR, max_value VARCHAR, contains_nan BOOLEAN); +CREATE TABLE {METADATA_CATALOG}.ducklake_file_column_stats(data_file_id BIGINT, table_id BIGINT, column_id BIGINT, column_size_bytes BIGINT, value_count BIGINT, null_count BIGINT, min_value VARCHAR, max_value VARCHAR, contains_nan BOOLEAN, extra_stats VARCHAR); CREATE TABLE {METADATA_CATALOG}.ducklake_delete_file(delete_file_id BIGINT PRIMARY KEY, table_id BIGINT, begin_snapshot BIGINT, end_snapshot BIGINT, data_file_id BIGINT, path VARCHAR, path_is_relative BOOLEAN, format VARCHAR, delete_count BIGINT, file_size_bytes BIGINT, footer_size BIGINT, encryption_key VARCHAR); CREATE TABLE {METADATA_CATALOG}.ducklake_column(column_id BIGINT, begin_snapshot BIGINT, end_snapshot BIGINT, table_id BIGINT, column_order BIGINT, column_name VARCHAR, column_type VARCHAR, 
initial_default VARCHAR, default_value VARCHAR, nulls_allowed BOOLEAN, parent_column BIGINT); CREATE TABLE {METADATA_CATALOG}.ducklake_table_stats(table_id BIGINT, record_count BIGINT, next_row_id BIGINT, file_size_bytes BIGINT); -CREATE TABLE {METADATA_CATALOG}.ducklake_table_column_stats(table_id BIGINT, column_id BIGINT, contains_null BOOLEAN, contains_nan BOOLEAN, min_value VARCHAR, max_value VARCHAR); +CREATE TABLE {METADATA_CATALOG}.ducklake_table_column_stats(table_id BIGINT, column_id BIGINT, contains_null BOOLEAN, contains_nan BOOLEAN, min_value VARCHAR, max_value VARCHAR, extra_stats VARCHAR); CREATE TABLE {METADATA_CATALOG}.ducklake_partition_info(partition_id BIGINT, table_id BIGINT, begin_snapshot BIGINT, end_snapshot BIGINT); CREATE TABLE {METADATA_CATALOG}.ducklake_partition_column(partition_id BIGINT, table_id BIGINT, partition_key_index BIGINT, column_id BIGINT, transform VARCHAR); CREATE TABLE {METADATA_CATALOG}.ducklake_file_partition_value(data_file_id BIGINT, table_id BIGINT, partition_key_index BIGINT, partition_value VARCHAR); @@ -117,7 +117,8 @@ UPDATE {METADATA_CATALOG}.ducklake_metadata SET value = '0.3-dev1' WHERE key = ' CREATE TABLE {METADATA_CATALOG}.ducklake_schema_versions(begin_snapshot BIGINT, schema_version BIGINT); INSERT INTO {METADATA_CATALOG}.ducklake_schema_versions SELECT MIN(snapshot_id), schema_version FROM {METADATA_CATALOG}.ducklake_snapshot GROUP BY schema_version ORDER BY schema_version; ALTER TABLE {METADATA_CATALOG}.ducklake_file_column_statistics RENAME TO ducklake_file_column_stats; - +ALTER TABLE {METADATA_CATALOG}.ducklake_file_column_stats ADD COLUMN extra_stats VARCHAR DEFAULT NULL; +ALTER TABLE {METADATA_CATALOG}.ducklake_table_column_stats ADD COLUMN extra_stats VARCHAR DEFAULT NULL; )"; auto result = transaction.Query(migrate_query); if (result->HasError()) { @@ -172,7 +173,7 @@ SELECT key, value FROM {METADATA_CATALOG}.ducklake_metadata return metadata; } -bool AddChildColumn(vector &columns, FieldIndex parent_id, DuckLakeColumnInfo &column_info) { +static bool AddChildColumn(vector &columns, FieldIndex parent_id, DuckLakeColumnInfo &column_info) { for (auto &col : columns) { if (col.id == parent_id) { col.children.push_back(std::move(column_info)); @@ -185,7 +186,7 @@ bool AddChildColumn(vector &columns, FieldIndex parent_id, D return false; } -vector LoadTags(const Value &tag_map) { +static vector LoadTags(const Value &tag_map) { vector result; for (auto &tag : ListValue::GetChildren(tag_map)) { auto &struct_children = StructValue::GetChildren(tag); @@ -200,7 +201,7 @@ vector LoadTags(const Value &tag_map) { return result; } -vector LoadInlinedDataTables(const Value &list) { +static vector LoadInlinedDataTables(const Value &list) { vector result; for (auto &val : ListValue::GetChildren(list)) { auto &struct_children = StructValue::GetChildren(val); @@ -419,7 +420,7 @@ ORDER BY part.table_id, partition_id, partition_key_index vector DuckLakeMetadataManager::GetGlobalTableStats(DuckLakeSnapshot snapshot) { // query the most recent stats auto result = transaction.Query(snapshot, R"( -SELECT table_id, column_id, record_count, next_row_id, file_size_bytes, contains_null, contains_nan, min_value, max_value +SELECT table_id, column_id, record_count, next_row_id, file_size_bytes, contains_null, contains_nan, min_value, max_value, extra_stats FROM {METADATA_CATALOG}.ducklake_table_stats LEFT JOIN {METADATA_CATALOG}.ducklake_table_column_stats USING (table_id) WHERE record_count IS NOT NULL AND file_size_bytes IS NOT NULL @@ -473,6 +474,13 
@@ ORDER BY table_id; column_stats.max_val = row.GetValue(COLUMN_STATS_START + 3); } + if (row.IsNull(COLUMN_STATS_START + 4)) { + column_stats.has_extra_stats = false; + } else { + column_stats.has_extra_stats = true; + column_stats.extra_stats = row.GetValue(COLUMN_STATS_START + 4); + } + stats_entry.column_stats.push_back(std::move(column_stats)); } return global_stats; @@ -519,7 +527,7 @@ DuckLakeFileData DuckLakeMetadataManager::ReadDataFile(DuckLakeTableEntry &table return data; } -string PartialFileInfoToString(const vector &partial_file_info) { +static string PartialFileInfoToString(const vector &partial_file_info) { string result; for (auto &info : partial_file_info) { if (!result.empty()) { @@ -534,20 +542,6 @@ string PartialFileInfoToString(const vector &partial_fi enum class PartialFileInfoType { PARTIAL_MAX, SPLITS }; -vector ParsePartialFileInfo(const string &str, PartialFileInfoType type, - DuckLakeSnapshot snapshot); - -idx_t GetMaxRowCount(DuckLakeSnapshot snapshot, const string &partial_file_info_str) { - auto partial_file_info = ParsePartialFileInfo(partial_file_info_str, PartialFileInfoType::SPLITS, snapshot); - idx_t max_row_count = 0; - for (auto &info : partial_file_info) { - if (info.snapshot_id <= snapshot.snapshot_id) { - max_row_count = MaxValue(max_row_count, info.max_row_count); - } - } - return max_row_count; -} - vector ParsePartialFileInfo(const string &str, PartialFileInfoType type, DuckLakeSnapshot snapshot) { vector result; @@ -580,8 +574,19 @@ vector ParsePartialFileInfo(const string &str, PartialF } } -void ParsePartialFileInfo(DuckLakeSnapshot snapshot, const string &partial_file_info_str, - DuckLakeFileListEntry &file_entry) { +static idx_t GetMaxRowCount(DuckLakeSnapshot snapshot, const string &partial_file_info_str) { + auto partial_file_info = ParsePartialFileInfo(partial_file_info_str, PartialFileInfoType::SPLITS, snapshot); + idx_t max_row_count = 0; + for (auto &info : partial_file_info) { + if (info.snapshot_id <= snapshot.snapshot_id) { + max_row_count = MaxValue(max_row_count, info.max_row_count); + } + } + return max_row_count; +} + +static void ParsePartialFileInfo(DuckLakeSnapshot snapshot, const string &partial_file_info_str, + DuckLakeFileListEntry &file_entry) { if (StringUtil::StartsWith(partial_file_info_str, "partial_max:")) { auto max_partial_file_snapshot = StringUtil::ToUnsigned(partial_file_info_str.substr(12)); if (max_partial_file_snapshot <= snapshot.snapshot_id) { @@ -992,7 +997,8 @@ void DuckLakeMetadataManager::WriteNewSchemas(DuckLakeSnapshot commit_snapshot, } } -void ColumnToSQLRecursive(const DuckLakeColumnInfo &column, TableIndex table_id, optional_idx parent, string &result) { +static void ColumnToSQLRecursive(const DuckLakeColumnInfo &column, TableIndex table_id, optional_idx parent, + string &result) { if (!result.empty()) { result += ","; } @@ -1085,7 +1091,7 @@ void DuckLakeMetadataManager::WriteNewTables(DuckLakeSnapshot commit_snapshot, WriteNewInlinedTables(commit_snapshot, new_tables); } -string GetInlinedTableName(const DuckLakeTableInfo &table, const DuckLakeSnapshot &snapshot) { +static string GetInlinedTableName(const DuckLakeTableInfo &table, const DuckLakeSnapshot &snapshot) { return StringUtil::Format("ducklake_inlined_data_%d_%d", table.id.index, snapshot.schema_version); } @@ -1209,6 +1215,10 @@ void DuckLakeMetadataManager::WriteNewInlinedData(DuckLakeSnapshot &commit_snaps if (new_data.empty()) { return; } + + auto context_ptr = transaction.context.lock(); + auto &context = *context_ptr; + for (auto 
&entry : new_data) { // get the latest table to insert into // FIXME: we could keep this cached some other way to avoid the round-trip/dependency @@ -1265,7 +1275,7 @@ WHERE table_id = %d AND schema_version=( values += ", {SNAPSHOT_ID}, NULL"; for (idx_t c = 0; c < chunk.ColumnCount(); c++) { values += ", "; - values += DuckLakeUtil::ValueToSQL(chunk.GetValue(c, r)); + values += DuckLakeUtil::ValueToSQL(context, chunk.GetValue(c, r)); } values += ")"; row_id++; @@ -1562,10 +1572,10 @@ void DuckLakeMetadataManager::WriteNewDataFiles(DuckLakeSnapshot commit_snapshot column_stats_insert_query += ","; } auto column_id = column_stats.column_id.index; - column_stats_insert_query += - StringUtil::Format("(%d, %d, %d, %s, %s, %s, %s, %s, %s)", data_file_index, table_id, column_id, - column_stats.column_size_bytes, column_stats.value_count, column_stats.null_count, - column_stats.min_val, column_stats.max_val, column_stats.contains_nan); + column_stats_insert_query += StringUtil::Format( + "(%d, %d, %d, %s, %s, %s, %s, %s, %s, %s)", data_file_index, table_id, column_id, + column_stats.column_size_bytes, column_stats.value_count, column_stats.null_count, column_stats.min_val, + column_stats.max_val, column_stats.contains_nan, column_stats.extra_stats); } if (file.partition_id.IsValid() == file.partition_values.empty()) { throw InternalException("File should either not be partitioned, or have partition values"); @@ -1730,7 +1740,7 @@ void DuckLakeMetadataManager::InsertSnapshot(const DuckLakeSnapshot commit_snaps } } -string SQLStringOrNull(const string &str) { +static string SQLStringOrNull(const string &str) { if (str.empty()) { return "NULL"; } @@ -1770,7 +1780,7 @@ SnapshotChangeInfo DuckLakeMetadataManager::GetChangesMadeAfterSnapshot(DuckLake return change_info; } -unique_ptr TryGetSnapshotInternal(QueryResult &result) { +static unique_ptr TryGetSnapshotInternal(QueryResult &result) { unique_ptr snapshot; for (auto &row : result) { if (snapshot) { @@ -1837,8 +1847,9 @@ WHERE snapshot_id = ( return snapshot; } -unordered_map GetNewPartitions(const vector &old_partitions, - const vector &new_partitions) { +static unordered_map +GetNewPartitions(const vector &old_partitions, + const vector &new_partitions) { unordered_map new_partition_map; @@ -2050,9 +2061,11 @@ void DuckLakeMetadataManager::UpdateGlobalTableStats(const DuckLakeGlobalStatsIn } string min_val = col_stats.has_min ? DuckLakeUtil::StatsToString(col_stats.min_val) : "NULL"; string max_val = col_stats.has_max ? DuckLakeUtil::StatsToString(col_stats.max_val) : "NULL"; + string extra_stats_val = col_stats.has_extra_stats ? 
col_stats.extra_stats : "NULL"; + column_stats_values += - StringUtil::Format("(%d, %d, %s, %s, %s, %s)", stats.table_id.index, col_stats.column_id.index, - contains_null, contains_nan, min_val, max_val); + StringUtil::Format("(%d, %d, %s, %s, %s, %s, %s)", stats.table_id.index, col_stats.column_id.index, + contains_null, contains_nan, min_val, max_val, extra_stats_val); } if (!stats.initialized) { @@ -2080,11 +2093,11 @@ void DuckLakeMetadataManager::UpdateGlobalTableStats(const DuckLakeGlobalStatsIn result->GetErrorObject().Throw("Failed to update stats information in DuckLake: "); } result = transaction.Query(StringUtil::Format(R"( -WITH new_values(tid, cid, new_contains_null, new_contains_nan, new_min, new_max) AS ( +WITH new_values(tid, cid, new_contains_null, new_contains_nan, new_min, new_max, new_extra_stats) AS ( VALUES %s ) UPDATE {METADATA_CATALOG}.ducklake_table_column_stats -SET contains_null=new_contains_null, contains_nan=new_contains_nan, min_value=new_min, max_value=new_max +SET contains_null=new_contains_null, contains_nan=new_contains_nan, min_value=new_min, max_value=new_max, extra_stats=new_extra_stats FROM new_values WHERE table_id=tid AND column_id=cid )", @@ -2095,7 +2108,7 @@ WHERE table_id=tid AND column_id=cid } template -timestamp_tz_t GetTimestampTZFromRow(ClientContext &context, const T &row, idx_t col_idx) { +static timestamp_tz_t GetTimestampTZFromRow(ClientContext &context, const T &row, idx_t col_idx) { auto val = row.iterator.chunk->GetValue(col_idx, row.row); return val.CastAs(context, LogicalType::TIMESTAMP_TZ).template GetValue(); } diff --git a/src/storage/ducklake_stats.cpp b/src/storage/ducklake_stats.cpp index 5dccdda3808..4803494f85e 100644 --- a/src/storage/ducklake_stats.cpp +++ b/src/storage/ducklake_stats.cpp @@ -1,9 +1,58 @@ #include "storage/ducklake_stats.hpp" +#include "duckdb/common/types/string.hpp" #include "duckdb/common/types/value.hpp" #include "duckdb/storage/statistics/base_statistics.hpp" +#include "duckdb/common/helper.hpp" +#include "duckdb/common/limits.hpp" + +#include "yyjson.hpp" namespace duckdb { +using namespace duckdb_yyjson; // NOLINT + +DuckLakeColumnStats::DuckLakeColumnStats(const DuckLakeColumnStats &other) { + type = other.type; + min = other.min; + max = other.max; + null_count = other.null_count; + column_size_bytes = other.column_size_bytes; + contains_nan = other.contains_nan; + has_null_count = other.has_null_count; + has_min = other.has_min; + has_max = other.has_max; + any_valid = other.any_valid; + has_contains_nan = other.has_contains_nan; + + if (other.extra_stats) { + extra_stats = other.extra_stats->Copy(); + } +} + +DuckLakeColumnStats &DuckLakeColumnStats::operator=(const DuckLakeColumnStats &other) { + if (this == &other) { + return *this; + } + type = other.type; + min = other.min; + max = other.max; + null_count = other.null_count; + column_size_bytes = other.column_size_bytes; + contains_nan = other.contains_nan; + has_null_count = other.has_null_count; + has_min = other.has_min; + has_max = other.has_max; + any_valid = other.any_valid; + has_contains_nan = other.has_contains_nan; + + if (other.extra_stats) { + extra_stats = other.extra_stats->Copy(); + } else { + extra_stats.reset(); + } + return *this; +} + void DuckLakeColumnStats::MergeStats(const DuckLakeColumnStats &new_stats) { if (type != new_stats.type) { // handle type promotion - adopt the new type @@ -71,6 +120,14 @@ void DuckLakeColumnStats::MergeStats(const DuckLakeColumnStats &new_stats) { max = new_stats.max; } } + + if 
(new_stats.extra_stats) { + if (extra_stats) { + extra_stats->Merge(*new_stats.extra_stats); + } else { + extra_stats = new_stats.extra_stats->Copy(); + } + } } void DuckLakeTableStats::MergeStats(FieldIndex col_id, const DuckLakeColumnStats &file_stats) { @@ -153,4 +210,123 @@ unique_ptr DuckLakeColumnStats::ToStats() const { } } +DuckLakeColumnGeoStats::DuckLakeColumnGeoStats() : DuckLakeColumnExtraStats() { + xmin = NumericLimits::Maximum(); + xmax = NumericLimits::Minimum(); + ymin = NumericLimits::Maximum(); + ymax = NumericLimits::Minimum(); + zmin = NumericLimits::Maximum(); + zmax = NumericLimits::Minimum(); + mmin = NumericLimits::Maximum(); + mmax = NumericLimits::Minimum(); +} + +unique_ptr DuckLakeColumnGeoStats::Copy() const { + return make_uniq(*this); +} + +void DuckLakeColumnGeoStats::Merge(const DuckLakeColumnExtraStats &new_stats) { + auto &geo_stats = new_stats.Cast(); + + xmin = MinValue(xmin, geo_stats.xmin); + xmax = MaxValue(xmax, geo_stats.xmax); + ymin = MinValue(ymin, geo_stats.ymin); + ymax = MaxValue(ymax, geo_stats.ymax); + zmin = MinValue(zmin, geo_stats.zmin); + zmax = MaxValue(zmax, geo_stats.zmax); + mmin = MinValue(mmin, geo_stats.mmin); + mmax = MaxValue(mmax, geo_stats.mmax); + + geo_types.insert(geo_stats.geo_types.begin(), geo_stats.geo_types.end()); +} + +string DuckLakeColumnGeoStats::Serialize() const { + + // Format as JSON + auto xmin_val = xmin == NumericLimits::Maximum() ? "null" : std::to_string(xmin); + auto xmax_val = xmax == NumericLimits::Minimum() ? "null" : std::to_string(xmax); + auto ymin_val = ymin == NumericLimits::Maximum() ? "null" : std::to_string(ymin); + auto ymax_val = ymax == NumericLimits::Minimum() ? "null" : std::to_string(ymax); + auto zmin_val = zmin == NumericLimits::Maximum() ? "null" : std::to_string(zmin); + auto zmax_val = zmax == NumericLimits::Minimum() ? "null" : std::to_string(zmax); + auto mmin_val = mmin == NumericLimits::Maximum() ? "null" : std::to_string(mmin); + auto mmax_val = mmax == NumericLimits::Minimum() ? 
"null" : std::to_string(mmax); + + auto bbox = StringUtil::Format( + R"({"xmin": %s, "xmax": %s, "ymin": %s, "ymax": %s, "zmin": %s, "zmax": %s, "mmin": %s, "mmax": %s})", xmin_val, + xmax_val, ymin_val, ymax_val, zmin_val, zmax_val, mmin_val, mmax_val); + + string types = "["; + for (auto &type : geo_types) { + if (types.size() > 1) { + types += ", "; + } + types += StringUtil::Format("\"%s\"", type); + } + types += "]"; + + return StringUtil::Format(R"('{"bbox": %s, "types": %s}')", bbox, types); +} + +void DuckLakeColumnGeoStats::Deserialize(const string &stats) { + auto doc = yyjson_read(stats.c_str(), stats.size(), 0); + if (!doc) { + throw InvalidInputException("Failed to parse geo stats JSON"); + } + auto root = yyjson_doc_get_root(doc); + if (!yyjson_is_obj(root)) { + yyjson_doc_free(doc); + throw InvalidInputException("Invalid geo stats JSON"); + } + + auto bbox_json = yyjson_obj_get(root, "bbox"); + if (yyjson_is_obj(bbox_json)) { + auto xmin_json = yyjson_obj_get(bbox_json, "xmin"); + if (yyjson_is_num(xmin_json)) { + xmin = yyjson_get_real(xmin_json); + } + auto xmax_json = yyjson_obj_get(bbox_json, "xmax"); + if (yyjson_is_num(xmax_json)) { + xmax = yyjson_get_real(xmax_json); + } + auto ymin_json = yyjson_obj_get(bbox_json, "ymin"); + if (yyjson_is_num(ymin_json)) { + ymin = yyjson_get_real(ymin_json); + } + auto ymax_json = yyjson_obj_get(bbox_json, "ymax"); + if (yyjson_is_num(ymax_json)) { + ymax = yyjson_get_real(ymax_json); + } + auto zmin_json = yyjson_obj_get(bbox_json, "zmin"); + if (yyjson_is_num(zmin_json)) { + zmin = yyjson_get_real(zmin_json); + } + auto zmax_json = yyjson_obj_get(bbox_json, "zmax"); + if (yyjson_is_num(zmax_json)) { + zmax = yyjson_get_real(zmax_json); + } + auto mmin_json = yyjson_obj_get(bbox_json, "mmin"); + if (yyjson_is_num(mmin_json)) { + mmin = yyjson_get_real(mmin_json); + } + auto mmax_json = yyjson_obj_get(bbox_json, "mmax"); + if (yyjson_is_num(mmax_json)) { + mmax = yyjson_get_real(mmax_json); + } + } + + auto types_json = yyjson_obj_get(root, "types"); + if (yyjson_is_arr(types_json)) { + yyjson_arr_iter iter; + yyjson_arr_iter_init(types_json, &iter); + yyjson_val *type_json; + while ((type_json = yyjson_arr_iter_next(&iter))) { + if (yyjson_is_str(type_json)) { + geo_types.insert(yyjson_get_str(type_json)); + } + } + } + yyjson_doc_free(doc); +} + } // namespace duckdb diff --git a/src/storage/ducklake_transaction.cpp b/src/storage/ducklake_transaction.cpp index 485604590ef..c666bd53ee0 100644 --- a/src/storage/ducklake_transaction.cpp +++ b/src/storage/ducklake_transaction.cpp @@ -901,6 +901,12 @@ void DuckLakeTransaction::UpdateGlobalTableStats(TableIndex table_id, const Duck if (column_stats.has_max) { col_stats.max_val = column_stats.max; } + if (column_stats.extra_stats) { + col_stats.has_extra_stats = true; + col_stats.extra_stats = column_stats.extra_stats->Serialize(); + } else { + col_stats.has_extra_stats = false; + } stats.column_stats.push_back(std::move(col_stats)); } stats.record_count = new_stats.record_count; @@ -949,6 +955,11 @@ DuckLakeFileInfo DuckLakeTransaction::GetNewDataFile(DuckLakeDataFile &file, Duc } else { column_stats.contains_nan = "NULL"; } + if (stats.extra_stats) { + column_stats.extra_stats = stats.extra_stats->Serialize(); + } else { + column_stats.extra_stats = "NULL"; + } data_file.column_stats.push_back(std::move(column_stats)); } diff --git a/test/configs/attach_ducklake.json b/test/configs/attach_ducklake.json index fb7512e45dc..b75897cacf2 100644 --- a/test/configs/attach_ducklake.json +++ 
b/test/configs/attach_ducklake.json @@ -212,6 +212,17 @@ "test/sql/catalog/view/test_view_sql.test" ] }, + { + "reason": "FIXME: Deserialization field id failure? Fixed on v1.4 branch..?", + "paths": [ + "test/sql/storage/compression/simple_compression.test", + "test/sql/storage/test_large_commits.test", + "test/sql/storage/multiple_clients_checkpoint_pending_updates.test", + "test/sql/storage/test_store_nulls_strings.test", + "test/sql/storage/compression/roaring/fetch_row.test", + "test/sql/storage/test_store_integers.test" + ] + }, { "reason": "Hugeint is unsupported", "paths": [ diff --git a/test/sql/geo/ducklake_geometry.test b/test/sql/geo/ducklake_geometry.test new file mode 100644 index 00000000000..981e5e38ff5 --- /dev/null +++ b/test/sql/geo/ducklake_geometry.test @@ -0,0 +1,101 @@ +# name: test/sql/geo/ducklake_geometry.test +# group: [geo] + +require ducklake + +require parquet + +require spatial + +statement ok +ATTACH 'ducklake:__TEST_DIR__/ducklake.db' AS ducklake (DATA_PATH '__TEST_DIR__/ducklake_files', METADATA_CATALOG 'ducklake_meta') + +statement ok +USE ducklake; + +statement ok +create table t1 (g GEOMETRY); + +statement ok +insert into t1 VALUES (ST_POINT(1,2)); + +query I +select * from t1; +---- +POINT (1 2) + +# Inspect file stats +query I +select extra_stats from ducklake_meta.ducklake_file_column_stats; +---- +{"bbox": {"xmin": 1.000000, "xmax": 1.000000, "ymin": 2.000000, "ymax": 2.000000, "zmin": null, "zmax": null, "mmin": null, "mmax": null}, "types": ["point"]} + +query I +select extra_stats from ducklake_meta.ducklake_table_column_stats; +---- +{"bbox": {"xmin": 1.000000, "xmax": 1.000000, "ymin": 2.000000, "ymax": 2.000000, "zmin": null, "zmax": null, "mmin": null, "mmax": null}, "types": ["point"]} + +statement ok +insert into t1 VALUES ('LINESTRING Z (5 5 5, 10 10 10)'::GEOMETRY); + +query I +select * from t1; +---- +POINT (1 2) +LINESTRING Z (5 5 5, 10 10 10) + +# Inspect file stats - should be separate for each file +query I +select extra_stats from ducklake_meta.ducklake_file_column_stats order by all; +---- +{"bbox": {"xmin": 1.000000, "xmax": 1.000000, "ymin": 2.000000, "ymax": 2.000000, "zmin": null, "zmax": null, "mmin": null, "mmax": null}, "types": ["point"]} +{"bbox": {"xmin": 5.000000, "xmax": 10.000000, "ymin": 5.000000, "ymax": 10.000000, "zmin": 5.000000, "zmax": 10.000000, "mmin": null, "mmax": null}, "types": ["linestring_z"]} + +# Check table stats are a union of the two files +query I +select extra_stats from ducklake_meta.ducklake_table_column_stats order by all; +---- +{"bbox": {"xmin": 1.000000, "xmax": 10.000000, "ymin": 2.000000, "ymax": 10.000000, "zmin": 5.000000, "zmax": 10.000000, "mmin": null, "mmax": null}, "types": ["linestring_z", "point"]} + +# Merge files into a single file and check stats remain the same +statement ok +CALL ducklake_merge_adjacent_files('ducklake') + +query I +select extra_stats from ducklake_meta.ducklake_file_column_stats order by all; +---- +{"bbox": {"xmin": 1.000000, "xmax": 10.000000, "ymin": 2.000000, "ymax": 10.000000, "zmin": 5.000000, "zmax": 10.000000, "mmin": null, "mmax": null}, "types": ["linestring_z", "point"]} + +# Insert a geometry with M dimension +statement ok +insert into t1 VALUES ('POINT M (20 20 5)'::GEOMETRY); + +# Insert a geometry with a ZM dimension +statement ok +insert into t1 VALUES ('POINT ZM (-30 -30 -30 -30)'::GEOMETRY); + +query I +select * from t1 ORDER BY ALL; +---- +POINT (1 2) +POINT M (20 20 5) +POINT ZM (-30 -30 -30 -30) +LINESTRING Z (5 5 5, 10 10 10) + +# Check 
stats +query I +select extra_stats from ducklake_meta.ducklake_file_column_stats order by all; +---- +{"bbox": {"xmin": -30.000000, "xmax": -30.000000, "ymin": -30.000000, "ymax": -30.000000, "zmin": -30.000000, "zmax": -30.000000, "mmin": -30.000000, "mmax": -30.000000}, "types": ["point_zm"]} +{"bbox": {"xmin": 1.000000, "xmax": 10.000000, "ymin": 2.000000, "ymax": 10.000000, "zmin": 5.000000, "zmax": 10.000000, "mmin": null, "mmax": null}, "types": ["linestring_z", "point"]} +{"bbox": {"xmin": 20.000000, "xmax": 20.000000, "ymin": 20.000000, "ymax": 20.000000, "zmin": null, "zmax": null, "mmin": 5.000000, "mmax": 5.000000}, "types": ["point_m"]} + +# Now merge all again +statement ok +CALL ducklake_merge_adjacent_files('ducklake') + +# Check merged stats +query I +select extra_stats from ducklake_meta.ducklake_file_column_stats order by all; +---- +{"bbox": {"xmin": -30.000000, "xmax": 20.000000, "ymin": -30.000000, "ymax": 20.000000, "zmin": -30.000000, "zmax": 10.000000, "mmin": -30.000000, "mmax": 5.000000}, "types": ["linestring_z", "point", "point_m", "point_zm"]} diff --git a/test/sql/geo/ducklake_geometry_add_files.test b/test/sql/geo/ducklake_geometry_add_files.test new file mode 100644 index 00000000000..be3e3354430 --- /dev/null +++ b/test/sql/geo/ducklake_geometry_add_files.test @@ -0,0 +1,38 @@ +# name: test/sql/geo/ducklake_geometry_add_files.test +# group: [geo] + +require ducklake + +require parquet + +require spatial + +statement ok +ATTACH 'ducklake:__TEST_DIR__/ducklake.db' AS ducklake ( + DATA_PATH '__TEST_DIR__/ducklake_files', + METADATA_CATALOG 'ducklake_meta' +) + +# Copy over a test parquet file with geometry column +statement ok +COPY (select st_point(5, 5) as g) TO '__TEST_DIR__/points.parquet' (FORMAT PARQUET); + +statement ok +USE ducklake; + +statement ok +create table t1 (g GEOMETRY); + +# Now add the file +statement ok +CALL ducklake_add_data_files('ducklake', 't1', '__TEST_DIR__/points.parquet'); + +query I +select * from t1; +---- +POINT (5 5) + +query I +select extra_stats from ducklake_meta.ducklake_file_column_stats order by all; +---- +{"bbox": {"xmin": 5.000000, "xmax": 5.000000, "ymin": 5.000000, "ymax": 5.000000, "zmin": null, "zmax": null, "mmin": null, "mmax": null}, "types": ["point"]} \ No newline at end of file diff --git a/test/sql/geo/ducklake_geometry_inlining.test b/test/sql/geo/ducklake_geometry_inlining.test new file mode 100644 index 00000000000..a5cc8b135a9 --- /dev/null +++ b/test/sql/geo/ducklake_geometry_inlining.test @@ -0,0 +1,31 @@ +# name: test/sql/geo/ducklake_geometry_inlining.test +# group: [geo] + +require ducklake + +require parquet + +require spatial + +statement ok +ATTACH 'ducklake:__TEST_DIR__/ducklake.db' AS ducklake ( + DATA_PATH '__TEST_DIR__/ducklake_files', + METADATA_CATALOG 'ducklake_meta', + DATA_INLINING_ROW_LIMIT 5 +) + +statement ok +USE ducklake; + +statement ok +create table t1 (g GEOMETRY); + +statement error +insert into t1 VALUES (ST_POINT(1,2)); +---- +Not implemented Error: DuckLake does not yet support data-inlining of GEOMETRY columns + +statement error +create table t2 as select ST_POINT(1,2) as g; +---- +Not implemented Error: DuckLake does not yet support data-inlining of GEOMETRY columns \ No newline at end of file
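A minimal standalone sketch of the bounding-box/type-set merge semantics that DuckLakeColumnGeoStats::Merge implements in the patch above: per-dimension min/max folding where "no value" is encoded as +max/lowest sentinels, plus a set union of geometry type names, mirroring how per-file stats roll up into table-level stats (as exercised by test/sql/geo/ducklake_geometry.test). The GeoStatsSketch name, the main() driver, and the use of plain std types in place of DuckDB's NumericLimits are illustrative assumptions, not the extension's API; only the X/Y dimensions are shown.

#include <algorithm>
#include <iostream>
#include <limits>
#include <set>
#include <string>

struct GeoStatsSketch {
	// Sentinels: mins start at +max, maxes at lowest, so merging an empty
	// (untouched) sketch is a no-op for the bounding box.
	double xmin = std::numeric_limits<double>::max();
	double xmax = std::numeric_limits<double>::lowest();
	double ymin = std::numeric_limits<double>::max();
	double ymax = std::numeric_limits<double>::lowest();
	std::set<std::string> geo_types;

	void Merge(const GeoStatsSketch &other) {
		// widen the bounding box and union the observed geometry types
		xmin = std::min(xmin, other.xmin);
		xmax = std::max(xmax, other.xmax);
		ymin = std::min(ymin, other.ymin);
		ymax = std::max(ymax, other.ymax);
		geo_types.insert(other.geo_types.begin(), other.geo_types.end());
	}
};

int main() {
	GeoStatsSketch file1; // e.g. a file containing POINT (1 2)
	file1.xmin = file1.xmax = 1.0;
	file1.ymin = file1.ymax = 2.0;
	file1.geo_types = {"point"};

	GeoStatsSketch file2; // e.g. a file containing LINESTRING (5 5, 10 10)
	file2.xmin = 5.0;
	file2.xmax = 10.0;
	file2.ymin = 5.0;
	file2.ymax = 10.0;
	file2.geo_types = {"linestring"};

	GeoStatsSketch table; // table-level stats are the union of the file-level stats
	table.Merge(file1);
	table.Merge(file2);

	std::cout << "bbox x: [" << table.xmin << ", " << table.xmax << "]\n"; // [1, 10]
	std::cout << "bbox y: [" << table.ymin << ", " << table.ymax << "]\n"; // [2, 10]
	for (const auto &t : table.geo_types) {
		std::cout << "type: " << t << "\n"; // linestring, point
	}
	return 0;
}

A dimension that is never touched stays at its sentinel values, which is why the patch's Serialize() emits null for unused Z/M bounds in the stored extra_stats JSON.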