diff --git a/Cargo.lock b/Cargo.lock index f3fb73786b4..51c6903895f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5305,15 +5305,19 @@ dependencies = [ "base64 0.22.1", "bytes", "chrono", + "datafusion-functions", "futures", "google-cloud-auth", "lance", + "lance-arrow", "lance-core", "lance-index", "lance-io", "lance-namespace", + "lance-namespace-reqwest-client", "lance-table", "log", + "murmur3", "object_store", "rand 0.9.2", "reqwest", @@ -6027,6 +6031,12 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + [[package]] name = "murmurhash32" version = "0.3.1" diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 9beaccc254a..42a0eaff193 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3746,6 +3746,7 @@ dependencies = [ "lance-linalg", "lance-namespace", "lance-namespace-impls", + "lance-namespace-reqwest-client", "lance-table", "log", "object_store", @@ -3798,14 +3799,18 @@ dependencies = [ "axum", "bytes", "chrono", + "datafusion-functions", "futures", "lance", + "lance-arrow", "lance-core", "lance-index", "lance-io", "lance-namespace", + "lance-namespace-reqwest-client", "lance-table", "log", + "murmur3", "object_store", "rand 0.9.2", "reqwest", @@ -4199,6 +4204,12 @@ version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + [[package]] name = "murmurhash32" version = "0.3.1" diff --git a/java/lance-jni/Cargo.toml 
b/java/lance-jni/Cargo.toml index e839028d5f0..f00bedf1b32 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -24,11 +24,12 @@ lance-index = { path = "../../rust/lance-index" } lance-io = { path = "../../rust/lance-io" } lance-namespace = { path = "../../rust/lance-namespace" } lance-namespace-impls = { path = "../../rust/lance-namespace-impls", features = ["rest", "rest-adapter"] } +lance-namespace-reqwest-client = "0.5.2" lance-core = { path = "../../rust/lance-core" } lance-file = { path = "../../rust/lance-file" } lance-table = { path = "../../rust/lance-table" } arrow = { version = "57.1", features = ["ffi"] } -arrow-schema = "57.1" +arrow-schema = "57.3.0" object_store = { version = "0.12.2" } tokio = { version = "1.23", features = [ "rt-multi-thread", diff --git a/java/lance-jni/src/lib.rs b/java/lance-jni/src/lib.rs index 53ce125aca8..77fcbd416b1 100644 --- a/java/lance-jni/src/lib.rs +++ b/java/lance-jni/src/lib.rs @@ -52,6 +52,7 @@ mod index; mod merge_insert; mod namespace; mod optimize; +mod partition; mod schema; mod session; mod sql; diff --git a/java/lance-jni/src/namespace.rs b/java/lance-jni/src/namespace.rs index 856904f343a..9f83e6580cd 100644 --- a/java/lance-jni/src/namespace.rs +++ b/java/lance-jni/src/namespace.rs @@ -5,6 +5,7 @@ use std::collections::HashMap; use std::sync::Arc; use async_trait::async_trait; + use bytes::Bytes; use jni::JNIEnv; use jni::objects::{GlobalRef, JByteArray, JMap, JObject, JString, JValue}; diff --git a/java/lance-jni/src/partition.rs b/java/lance-jni/src/partition.rs new file mode 100644 index 00000000000..1f817baf646 --- /dev/null +++ b/java/lance-jni/src/partition.rs @@ -0,0 +1,1003 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +use std::io::Cursor; +use std::sync::Arc; + +use arrow::datatypes::Schema as ArrowSchema; +use arrow::ipc::reader::StreamReader; +use bytes::Bytes; +use jni::objects::JValue; +use jni::objects::{JByteArray, 
JMap, JObject, JString}; +use jni::sys::{jbyteArray, jlong, jstring}; +use jni::JNIEnv; +use lance_namespace::models::*; +use lance_namespace::schema::convert_json_arrow_schema; +use lance_namespace::LanceNamespace as LanceNamespaceTrait; +use lance_namespace_impls::partition::{ + parse_filter_expr_from_sql, PartitionField, PartitionSpec, PartitionTable, + PartitionedNamespace, PartitionedNamespaceBuilder, +}; +use lance_namespace_impls::DirectoryNamespaceBuilder; +use lance_namespace_reqwest_client::models::{ + PartitionField as JsonPartitionField, PartitionSpec as JsonPartitionSpec, +}; +use serde::{Deserialize, Serialize}; + +use crate::error::{Error, Result}; +use crate::namespace::JavaDynamicContextProvider; +use crate::utils::to_rust_map; +use crate::RT; + +fn java_schema_to_rust_schema( + env: &mut JNIEnv, + partitioned_ns_obj: &JObject, + schema_obj: &JObject, +) -> Result { + let schema_ipc_obj = env + .call_method( + partitioned_ns_obj, + "schemaToIpc", + "(Lorg/apache/arrow/vector/types/pojo/Schema;)[B", + &[JValue::Object(schema_obj)], + ) + .map_err(|e| Error::runtime_error(format!("Failed to call schemaToIpc: {}", e)))? 
+ .l() + .map_err(|e| Error::runtime_error(format!("schemaToIpc did not return object: {}", e)))?; + let schema_ipc = env + .convert_byte_array(JByteArray::from(schema_ipc_obj)) + .map_err(|e| Error::runtime_error(format!("Failed to read schema IPC bytes: {}", e)))?; + + let reader = StreamReader::try_new(Cursor::new(schema_ipc), None) + .map_err(|e| Error::runtime_error(format!("Failed to decode schema IPC: {}", e)))?; + Ok(reader.schema().as_ref().clone()) +} + +fn java_partition_fields_to_rust_partition_fields( + env: &mut JNIEnv, + partition_fields_list: &JObject, +) -> Result> { + if partition_fields_list.is_null() { + return Err(Error::input_error("partitionFields is null".to_string())); + } + + let pf_json = java_object_to_json(env, partition_fields_list)?; + let partition_fields: Vec = serde_json::from_str(&pf_json) + .map_err(|e| Error::input_error(format!("Invalid partition fields JSON: {}", e)))?; + + let mut fields = Vec::with_capacity(partition_fields.len()); + for jf in &partition_fields { + fields.push( + PartitionField::from_json(jf) + .map_err(|e| Error::input_error(format!("Invalid partition field: {}", e)))?, + ); + } + Ok(fields) +} + +fn java_object_to_json(env: &mut JNIEnv, obj: &JObject) -> Result { + let object_mapper_class = env + .find_class("com/fasterxml/jackson/databind/ObjectMapper") + .map_err(|e| Error::runtime_error(format!("Failed to find ObjectMapper class: {}", e)))?; + let object_mapper = env + .new_object(&object_mapper_class, "()V", &[]) + .map_err(|e| Error::runtime_error(format!("Failed to create ObjectMapper: {}", e)))?; + + let json_obj = env + .call_method( + &object_mapper, + "writeValueAsString", + "(Ljava/lang/Object;)Ljava/lang/String;", + &[JValue::Object(obj)], + ) + .map_err(|e| { + Error::runtime_error(format!( + "Failed to serialize object via ObjectMapper: {}", + e + )) + })? 
+ .l() + .map_err(|e| { + Error::runtime_error(format!("writeValueAsString did not return object: {}", e)) + })?; + let json: String = env + .get_string(&JString::from(json_obj)) + .map_err(|e| Error::runtime_error(format!("Failed to convert JSON string: {}", e)))? + .into(); + Ok(json) +} + +/// Blocking wrapper for PartitionedNamespace +pub struct BlockingPartitionedNamespace { + pub(crate) inner: PartitionedNamespace, +} + +#[derive(Debug, Serialize)] +struct JavaPartitionTable { + id: Vec, + #[serde(skip_serializing_if = "Option::is_none")] + read_version: Option, +} + +#[derive(Debug, Serialize)] +struct JavaPlanScanItem { + table: JavaPartitionTable, + refine_expr: String, +} + +fn to_java_partition_table(t: &PartitionTable) -> JavaPartitionTable { + JavaPartitionTable { + id: t.id.clone(), + read_version: t.read_version, + } +} + +/// Helper function to call namespace methods that return a response object (PartitionedNamespace) +fn call_partitioned_namespace_method<'local, Req, Resp, F>( + env: &mut JNIEnv<'local>, + handle: jlong, + request_json: JString, + f: F, +) -> Result> +where + Req: for<'de> Deserialize<'de>, + Resp: Serialize, + F: FnOnce(&BlockingPartitionedNamespace, Req) -> lance_core::Result, +{ + let namespace = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let request_str: String = env.get_string(&request_json)?.into(); + let request: Req = serde_json::from_str(&request_str) + .map_err(|e| Error::input_error(format!("Failed to parse request JSON: {}", e)))?; + + let response = f(namespace, request) + .map_err(|e| Error::runtime_error(format!("Namespace operation failed: {}", e)))?; + + let response_json = serde_json::to_string(&response) + .map_err(|e| Error::runtime_error(format!("Failed to serialize response: {}", e)))?; + + env.new_string(response_json).map_err(Into::into) +} + +/// Helper function for void methods (PartitionedNamespace) +fn call_partitioned_namespace_void_method( + env: &mut JNIEnv, + handle: jlong, + 
request_json: JString, + f: F, +) -> Result<()> +where + Req: for<'de> Deserialize<'de>, + F: FnOnce(&BlockingPartitionedNamespace, Req) -> lance_core::Result<()>, +{ + let namespace = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let request_str: String = env.get_string(&request_json)?.into(); + let request: Req = serde_json::from_str(&request_str) + .map_err(|e| Error::input_error(format!("Failed to parse request JSON: {}", e)))?; + + f(namespace, request) + .map_err(|e| Error::runtime_error(format!("Namespace operation failed: {}", e)))?; + + Ok(()) +} + +/// Helper function for count methods (PartitionedNamespace) +fn call_partitioned_namespace_count_method( + env: &mut JNIEnv, + handle: jlong, + request_json: JString, +) -> Result { + let namespace = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let request_str: String = env.get_string(&request_json)?.into(); + let request: CountTableRowsRequest = serde_json::from_str(&request_str) + .map_err(|e| Error::input_error(format!("Failed to parse request JSON: {}", e)))?; + + let count = RT + .block_on(namespace.inner.count_table_rows(request)) + .map_err(|e| Error::runtime_error(format!("Count table rows failed: {}", e)))?; + + Ok(count) +} + +/// Helper function for methods with data parameter (PartitionedNamespace) +fn call_partitioned_namespace_with_data_method<'local, Req, Resp, F>( + env: &mut JNIEnv<'local>, + handle: jlong, + request_json: JString, + request_data: JByteArray, + f: F, +) -> Result> +where + Req: for<'de> Deserialize<'de>, + Resp: Serialize, + F: FnOnce(&BlockingPartitionedNamespace, Req, Bytes) -> lance_core::Result, +{ + let namespace = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let request_str: String = env.get_string(&request_json)?.into(); + let request: Req = serde_json::from_str(&request_str) + .map_err(|e| Error::input_error(format!("Failed to parse request JSON: {}", e)))?; + + let data_vec = 
env.convert_byte_array(request_data)?; + let data = bytes::Bytes::from(data_vec); + + let response = f(namespace, request, data) + .map_err(|e| Error::runtime_error(format!("Namespace operation failed: {}", e)))?; + + let response_json = serde_json::to_string(&response) + .map_err(|e| Error::runtime_error(format!("Failed to serialize response: {}", e)))?; + + env.new_string(response_json).map_err(Into::into) +} + +/// Helper function for query methods that return byte arrays (PartitionedNamespace) +fn call_partitioned_namespace_query_method<'local>( + env: &mut JNIEnv<'local>, + handle: jlong, + request_json: JString, +) -> Result> { + let namespace = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let request_str: String = env.get_string(&request_json)?.into(); + let request: QueryTableRequest = serde_json::from_str(&request_str) + .map_err(|e| Error::input_error(format!("Failed to parse request JSON: {}", e)))?; + + let result_bytes = RT + .block_on(namespace.inner.query_table(request)) + .map_err(|e| Error::runtime_error(format!("Query table failed: {}", e)))?; + + let byte_array = env.byte_array_from_slice(&result_bytes)?; + Ok(byte_array) +} + +// ============================================================================ +// PartitionedNamespace JNI Functions +// ============================================================================ + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createNative( + mut env: JNIEnv, + _obj: JObject, + properties_map: JObject, +) -> jlong { + ok_or_throw_with_return!( + env, + create_partitioned_namespace_internal(&mut env, properties_map, None), + 0 + ) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createNativeWithProvider( + mut env: JNIEnv, + _obj: JObject, + properties_map: JObject, + context_provider: JObject, +) -> jlong { + ok_or_throw_with_return!( + env, + create_partitioned_namespace_internal(&mut env, 
properties_map, Some(context_provider)), + 0 + ) +} + +fn create_partitioned_namespace_internal( + env: &mut JNIEnv, + properties_map: JObject, + context_provider: Option, +) -> Result { + // Convert Java HashMap to Rust HashMap + let jmap = JMap::from_env(env, &properties_map)?; + let mut properties = to_rust_map(env, &jmap)?; + + // Use the same key as DirectoryNamespace to locate root. + if !properties.contains_key("root") + && let Some(location) = properties.get("location").cloned() + { + properties.insert("root".to_string(), location); + } + let location = properties.get("root").cloned().ok_or_else(|| { + Error::input_error("Missing 'root' (or 'location') in configProperties".to_string()) + })?; + + let schema_json = properties.get("schema").cloned(); + let partition_spec_json = properties.get("partition_spec").cloned(); + + // Build DirectoryNamespace using properties so we can reuse storage options, credential vending, + // and the Java dynamic context provider. + let mut dir_builder = DirectoryNamespaceBuilder::from_properties(properties, None) + .map_err(|e| { + Error::runtime_error(format!("Failed to create DirectoryNamespaceBuilder: {}", e)) + })? 
+ .manifest_enabled(true) + .dir_listing_enabled(false) + .inline_optimization_enabled(true); + + if let Some(provider_obj) = context_provider + && !provider_obj.is_null() + { + let java_provider = JavaDynamicContextProvider::new(env, &provider_obj)?; + dir_builder = dir_builder.context_provider(Arc::new(java_provider)); + } + + let directory = RT + .block_on(dir_builder.build()) + .map_err(|e| Error::runtime_error(format!("Failed to build DirectoryNamespace: {}", e)))?; + + let mut builder = PartitionedNamespaceBuilder::new(location).directory(directory); + + if let Some(schema_json) = schema_json { + let json_schema: JsonArrowSchema = serde_json::from_str(&schema_json).map_err(|e| { + Error::input_error(format!( + "Invalid Arrow schema JSON in configProperties['schema']: {}", + e + )) + })?; + let arrow_schema = convert_json_arrow_schema(&json_schema).map_err(|e| { + Error::input_error(format!( + "Invalid Arrow schema in configProperties['schema']: {}", + e + )) + })?; + builder = builder.schema(arrow_schema); + } + + if let Some(partition_spec_json) = partition_spec_json { + let json_partition_spec: JsonPartitionSpec = serde_json::from_str(&partition_spec_json) + .map_err(|e| { + Error::input_error(format!( + "Invalid partition spec JSON in configProperties['partition_spec']: {}", + e + )) + })?; + let partition_spec = PartitionSpec::from_json(&json_partition_spec).map_err(|e| { + Error::input_error(format!( + "Invalid partition spec in configProperties['partition_spec']: {}", + e + )) + })?; + builder = builder.partition_spec(partition_spec); + } + let ns = RT.block_on(builder.build()).map_err(|e| { + Error::runtime_error(format!("Failed to build PartitionedNamespace: {}", e)) + })?; + + Ok(Box::into_raw(Box::new(BlockingPartitionedNamespace { inner: ns })) as jlong) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_releaseNative( + _env: JNIEnv, + _obj: JObject, + handle: jlong, +) { + if handle != 0 { + unsafe { 
+ let _ = Box::from_raw(handle as *mut BlockingPartitionedNamespace); + } + } +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_namespaceIdNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, +) -> jstring { + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let namespace_id = ns.inner.namespace_id(); + ok_or_throw_with_return!( + env, + env.new_string(namespace_id).map_err(Error::from), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_listNamespacesNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.list_namespaces(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_describeNamespaceNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.describe_namespace(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createNamespaceNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.create_namespace(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_dropNamespaceNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + 
ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.drop_namespace(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_namespaceExistsNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) { + ok_or_throw_without_return!( + env, + call_partitioned_namespace_void_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.namespace_exists(req)) + }) + ) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_listTablesNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.list_tables(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_describeTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.describe_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_registerTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.register_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_tableExistsNative( + mut env: JNIEnv, + _obj: JObject, + handle: 
jlong, + request_json: JString, +) { + ok_or_throw_without_return!( + env, + call_partitioned_namespace_void_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.table_exists(req)) + }) + ) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_dropTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.drop_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_deregisterTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.deregister_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_countTableRowsNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jlong { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_count_method(&mut env, handle, request_json), + 0 + ) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, + request_data: JByteArray, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_with_data_method( + &mut env, + handle, + request_json, + request_data, + |ns, req, data| { RT.block_on(ns.inner.create_table(req, data)) } + ), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_declareTableNative( + mut env: JNIEnv, + 
_obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.declare_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_insertIntoTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, + request_data: JByteArray, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_with_data_method( + &mut env, + handle, + request_json, + request_data, + |ns, req, data| { RT.block_on(ns.inner.insert_into_table(req, data)) } + ), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_mergeInsertIntoTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, + request_data: JByteArray, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_with_data_method( + &mut env, + handle, + request_json, + request_data, + |ns, req, data| { RT.block_on(ns.inner.merge_insert_into_table(req, data)) } + ), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_updateTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.update_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_deleteFromTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, 
request_json, |ns, req| { + RT.block_on(ns.inner.delete_from_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_queryTableNative<'local>( + mut env: JNIEnv<'local>, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jbyteArray { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_query_method(&mut env, handle, request_json), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_createTableIndexNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.create_table_index(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_listTableIndicesNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.list_table_indices(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_describeTableIndexStatsNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.describe_table_index_stats(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_describeTransactionNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> 
jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.describe_transaction(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_alterTransactionNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_partitioned_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.alter_transaction(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_schemaNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, +) -> jbyteArray { + ok_or_throw_with_return!( + env, + (|| { + use arrow::ipc::writer::StreamWriter; + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let schema = Arc::new(ns.inner.schema()?); + + // Write a schema-only Arrow IPC stream. 
+ let mut data = Vec::new(); + let mut writer = StreamWriter::try_new(&mut data, &schema).map_err(|e| { + Error::runtime_error(format!("Failed to create StreamWriter: {}", e)) + })?; + writer.finish().map_err(|e| { + Error::runtime_error(format!("Failed to finish schema stream: {}", e)) + })?; + + Ok::(env.byte_array_from_slice(&data)?.into_raw()) + })(), + std::ptr::null_mut() + ) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_planScanNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + filter: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + (|| { + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let filter: String = env.get_string(&filter)?.into(); + let arrow_schema: ArrowSchema = ns.inner.schema()?; + let expr = RT + .block_on(parse_filter_expr_from_sql(&filter, &arrow_schema)) + .map_err(|e| Error::runtime_error(format!("Failed to parse filter SQL: {}", e)))?; + let planned = RT + .block_on(ns.inner.plan_scan(&expr)) + .map_err(|e| Error::runtime_error(format!("plan_scan failed: {}", e)))?; + + let items: Vec = planned + .into_iter() + .map(|(t, refine)| JavaPlanScanItem { + table: to_java_partition_table(&t), + refine_expr: refine.to_string(), + }) + .collect(); + + let json = serde_json::to_string(&items) + .map_err(|e| Error::runtime_error(format!("Failed to serialize plan: {}", e)))?; + Ok::(env.new_string(json)?.into_raw()) + })(), + std::ptr::null_mut() + ) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_partitionSpecNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, +) -> jstring { + ok_or_throw_with_return!( + env, + (|| { + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let spec = RT + .block_on(ns.inner.partition_spec()) + .map_err(|e| Error::runtime_error(format!("partition_spec failed: {}", e)))? 
+ .to_json()?; + let json = serde_json::to_string(&spec).map_err(|e| { + Error::runtime_error(format!("Failed to serialize partition spec: {}", e)) + })?; + Ok::(env.new_string(json)?.into_raw()) + })(), + std::ptr::null_mut() + ) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_updateTableSpecNative( + mut env: JNIEnv, + obj: JObject, + handle: jlong, + schema_obj: JObject, + partition_fields_obj: JObject, +) { + ok_or_throw_without_return!( + env, + (|| { + let ns = unsafe { &mut *(handle as *mut BlockingPartitionedNamespace) }; + + if schema_obj.is_null() { + return Err(Error::input_error("schema is null".to_string())); + } + + let schema = java_schema_to_rust_schema(&mut env, &obj, &schema_obj)?; + let partition_fields = + java_partition_fields_to_rust_partition_fields(&mut env, &partition_fields_obj)?; + + RT.block_on(ns.inner.update_table_spec(schema, partition_fields)) + .map_err(|e| Error::runtime_error(format!("update_table_spec failed: {}", e)))?; + Ok(()) + })() + ) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_resolveOrCreatePartitionTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + arrow_array_stream_addr: jlong, +) -> jstring { + ok_or_throw_with_return!( + env, + (|| { + use arrow::ffi_stream::{ArrowArrayStreamReader, FFI_ArrowArrayStream}; + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; + let mut reader = + unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }.map_err(|e| { + Error::runtime_error(format!("Failed to import ArrowArrayStream: {}", e)) + })?; + + let batch = reader + .next() + .transpose() + .map_err(|e| Error::runtime_error(format!("Failed to read record batch: {}", e)))? 
+ .ok_or_else(|| Error::input_error("Empty ArrowArrayStream".to_string()))?; + + if batch.num_rows() != 1 { + return Err(Error::input_error(format!( + "resolve_or_create_partition_table expects exactly 1 row, got {}", + batch.num_rows() + ))); + } + + let table = RT + .block_on(ns.inner.resolve_or_create_partition_table(&batch)) + .map_err(|e| { + Error::runtime_error(format!("resolve_or_create_partition_table failed: {}", e)) + })?; + let json = serde_json::to_string(&to_java_partition_table(&table)) + .map_err(|e| Error::runtime_error(format!("Failed to serialize table: {}", e)))?; + Ok::(env.new_string(json)?.into_raw()) + })(), + std::ptr::null_mut() + ) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_tablesNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, +) -> jstring { + ok_or_throw_with_return!( + env, + (|| { + let ns = unsafe { &*(handle as *const BlockingPartitionedNamespace) }; + let tables = RT.block_on(ns.inner.tables()).map_err(|e| { + Error::runtime_error(format!("PartitionedNamespace.tables failed: {}", e)) + })?; + + let java_tables: Vec = + tables.iter().map(to_java_partition_table).collect(); + let json = serde_json::to_string(&java_tables) + .map_err(|e| Error::runtime_error(format!("Failed to serialize tables: {}", e)))?; + + Ok::(env.new_string(json)?.into_raw()) + })(), + std::ptr::null_mut() + ) +} + +#[unsafe(no_mangle)] +pub extern "system" fn Java_org_lance_namespace_PartitionedNamespace_commitNative( + mut env: JNIEnv, + _obj: JObject, + _handle: jlong, + _read_version_json: JObject, + _new_version_json: JObject, +) -> jstring { + let err = Error::runtime_error("PartitionedNamespace.commit is not implemented".to_string()); + err.throw(&mut env); + std::ptr::null_mut() +} diff --git a/java/src/main/java/org/lance/namespace/PartitionedNamespace.java b/java/src/main/java/org/lance/namespace/PartitionedNamespace.java new file mode 100644 index 00000000000..01162db126c --- /dev/null 
+++ b/java/src/main/java/org/lance/namespace/PartitionedNamespace.java @@ -0,0 +1,485 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.lance.namespace; + +import org.lance.JniLoader; +import org.lance.namespace.model.*; + +import com.fasterxml.jackson.annotation.JsonProperty; +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.core.type.TypeReference; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.base.Preconditions; +import org.apache.arrow.c.ArrowArrayStream; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.ipc.ArrowStreamReader; +import org.apache.arrow.vector.ipc.ArrowStreamWriter; +import org.apache.arrow.vector.types.pojo.Schema; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.Closeable; +import java.util.List; +import java.util.Map; + +/** Java wrapper for the native Rust PartitionedNamespace implementation. 
*/ +public final class PartitionedNamespace implements LanceNamespace, Closeable { + static { + JniLoader.ensureLoaded(); + } + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); + + private long nativeHandle; + private BufferAllocator allocator; + + public PartitionedNamespace() {} + + PartitionedNamespace(long nativeHandle) { + Preconditions.checkArgument(nativeHandle != 0, "nativeHandle is 0"); + this.nativeHandle = nativeHandle; + } + + @Override + public void initialize(Map configProperties, BufferAllocator allocator) { + initialize(configProperties, allocator, null); + } + + /** Initialize with a dynamic context provider. */ + public void initialize( + Map configProperties, + BufferAllocator allocator, + DynamicContextProvider contextProvider) { + Preconditions.checkNotNull(configProperties, "configProperties is null"); + Preconditions.checkNotNull(allocator, "allocator is null"); + Preconditions.checkArgument(nativeHandle == 0, "PartitionedNamespace already initialized"); + this.allocator = allocator; + if (contextProvider != null) { + this.nativeHandle = createNativeWithProvider(configProperties, contextProvider); + } else { + this.nativeHandle = createNative(configProperties); + } + } + + @Override + public String namespaceId() { + ensureOpen(); + return namespaceIdNative(nativeHandle); + } + + @Override + public ListNamespacesResponse listNamespaces(ListNamespacesRequest request) { + ensureOpen(); + String json = listNamespacesNative(nativeHandle, toJson(request)); + return fromJson(json, ListNamespacesResponse.class); + } + + @Override + public DescribeNamespaceResponse describeNamespace(DescribeNamespaceRequest request) { + ensureOpen(); + String json = describeNamespaceNative(nativeHandle, toJson(request)); + return fromJson(json, DescribeNamespaceResponse.class); + } + + @Override + public CreateNamespaceResponse createNamespace(CreateNamespaceRequest request) { + ensureOpen(); + String json = createNamespaceNative(nativeHandle, 
toJson(request)); + return fromJson(json, CreateNamespaceResponse.class); + } + + @Override + public DropNamespaceResponse dropNamespace(DropNamespaceRequest request) { + ensureOpen(); + String json = dropNamespaceNative(nativeHandle, toJson(request)); + return fromJson(json, DropNamespaceResponse.class); + } + + @Override + public void namespaceExists(NamespaceExistsRequest request) { + ensureOpen(); + namespaceExistsNative(nativeHandle, toJson(request)); + } + + @Override + public ListTablesResponse listTables(ListTablesRequest request) { + ensureOpen(); + String json = listTablesNative(nativeHandle, toJson(request)); + return fromJson(json, ListTablesResponse.class); + } + + @Override + public DescribeTableResponse describeTable(DescribeTableRequest request) { + ensureOpen(); + String json = describeTableNative(nativeHandle, toJson(request)); + return fromJson(json, DescribeTableResponse.class); + } + + @Override + public RegisterTableResponse registerTable(RegisterTableRequest request) { + ensureOpen(); + String json = registerTableNative(nativeHandle, toJson(request)); + return fromJson(json, RegisterTableResponse.class); + } + + @Override + public void tableExists(TableExistsRequest request) { + ensureOpen(); + tableExistsNative(nativeHandle, toJson(request)); + } + + @Override + public DropTableResponse dropTable(DropTableRequest request) { + ensureOpen(); + String json = dropTableNative(nativeHandle, toJson(request)); + return fromJson(json, DropTableResponse.class); + } + + @Override + public DeregisterTableResponse deregisterTable(DeregisterTableRequest request) { + ensureOpen(); + String json = deregisterTableNative(nativeHandle, toJson(request)); + return fromJson(json, DeregisterTableResponse.class); + } + + @Override + public Long countTableRows(CountTableRowsRequest request) { + ensureOpen(); + return countTableRowsNative(nativeHandle, toJson(request)); + } + + @Override + public CreateTableResponse createTable(CreateTableRequest request, byte[] 
requestData) { + ensureOpen(); + Preconditions.checkNotNull(requestData, "requestData is null"); + String json = createTableNative(nativeHandle, toJson(request), requestData); + return fromJson(json, CreateTableResponse.class); + } + + @Override + public DeclareTableResponse declareTable(DeclareTableRequest request) { + ensureOpen(); + String json = declareTableNative(nativeHandle, toJson(request)); + return fromJson(json, DeclareTableResponse.class); + } + + @Override + public InsertIntoTableResponse insertIntoTable( + InsertIntoTableRequest request, byte[] requestData) { + ensureOpen(); + Preconditions.checkNotNull(requestData, "requestData is null"); + String json = insertIntoTableNative(nativeHandle, toJson(request), requestData); + return fromJson(json, InsertIntoTableResponse.class); + } + + @Override + public MergeInsertIntoTableResponse mergeInsertIntoTable( + MergeInsertIntoTableRequest request, byte[] requestData) { + ensureOpen(); + Preconditions.checkNotNull(requestData, "requestData is null"); + String json = mergeInsertIntoTableNative(nativeHandle, toJson(request), requestData); + return fromJson(json, MergeInsertIntoTableResponse.class); + } + + @Override + public UpdateTableResponse updateTable(UpdateTableRequest request) { + ensureOpen(); + String json = updateTableNative(nativeHandle, toJson(request)); + return fromJson(json, UpdateTableResponse.class); + } + + @Override + public DeleteFromTableResponse deleteFromTable(DeleteFromTableRequest request) { + ensureOpen(); + String json = deleteFromTableNative(nativeHandle, toJson(request)); + return fromJson(json, DeleteFromTableResponse.class); + } + + @Override + public byte[] queryTable(QueryTableRequest request) { + ensureOpen(); + return queryTableNative(nativeHandle, toJson(request)); + } + + @Override + public CreateTableIndexResponse createTableIndex(CreateTableIndexRequest request) { + ensureOpen(); + String json = createTableIndexNative(nativeHandle, toJson(request)); + return fromJson(json, 
CreateTableIndexResponse.class); + } + + @Override + public ListTableIndicesResponse listTableIndices(ListTableIndicesRequest request) { + ensureOpen(); + String json = listTableIndicesNative(nativeHandle, toJson(request)); + return fromJson(json, ListTableIndicesResponse.class); + } + + @Override + public DescribeTableIndexStatsResponse describeTableIndexStats( + DescribeTableIndexStatsRequest request, String indexName) { + ensureOpen(); + String json = describeTableIndexStatsNative(nativeHandle, toJson(request)); + return fromJson(json, DescribeTableIndexStatsResponse.class); + } + + @Override + public DescribeTransactionResponse describeTransaction(DescribeTransactionRequest request) { + ensureOpen(); + String json = describeTransactionNative(nativeHandle, toJson(request)); + return fromJson(json, DescribeTransactionResponse.class); + } + + @Override + public AlterTransactionResponse alterTransaction(AlterTransactionRequest request) { + ensureOpen(); + String json = alterTransactionNative(nativeHandle, toJson(request)); + return fromJson(json, AlterTransactionResponse.class); + } + + /** Shared logical schema enforced across all partition tables. 
*/ + public Schema schema() { + ensureOpen(); + Preconditions.checkNotNull(allocator, "allocator is null"); + byte[] schemaIpc = schemaNative(nativeHandle); + try (ArrowStreamReader reader = + new ArrowStreamReader(new ByteArrayInputStream(schemaIpc), allocator)) { + return reader.getVectorSchemaRoot().getSchema(); + } catch (Exception e) { + throw new RuntimeException("Failed to decode Arrow schema", e); + } + } + + byte[] schemaToIpc(Schema schema) { + Preconditions.checkNotNull(schema, "schema is null"); + Preconditions.checkNotNull(allocator, "allocator is null"); + + try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + ArrowStreamWriter writer = new ArrowStreamWriter(root, null, out)) { + writer.start(); + writer.end(); + return out.toByteArray(); + } catch (Exception e) { + throw new RuntimeException("Failed to serialize Arrow schema", e); + } + } + + /** + * Partition pruning for the given filter expression. + * + * @param filter SQL expression used in a WHERE clause (empty means TRUE) + */ + public List planScan(String filter) { + ensureOpen(); + Preconditions.checkNotNull(filter, "filter is null"); + String json = planScanNative(nativeHandle, filter); + return fromJson(json, new TypeReference>() {}); + } + + /** Get the current partition spec. */ + public PartitionSpec partitionSpec() { + ensureOpen(); + String json = partitionSpecNative(nativeHandle); + return fromJson(json, PartitionSpec.class); + } + + /** + * Update the table spec. 
+ * + * @param schema new table schema + * @param partitionFields new table partition spec + */ + public void updateTableSpec(Schema schema, List partitionFields) { + ensureOpen(); + Preconditions.checkNotNull(schema, "schema is null"); + Preconditions.checkNotNull(partitionFields, "partitionFields is null"); + updateTableSpecNative(nativeHandle, schema, partitionFields); + } + + /** + * Resolve the target partition table for the input row. Create it (as an empty table) if it does not exist. + + *

The stream must contain exactly one record batch with exactly one row. + */ + public PartitionTable resolveOrCreatePartitionTable(ArrowArrayStream recordStream) { + ensureOpen(); + Preconditions.checkNotNull(recordStream, "recordStream is null"); + String json = resolveOrCreatePartitionTableNative(nativeHandle, recordStream.memoryAddress()); + return fromJson(json, new TypeReference<>() {}); + } + + /** List all partition tables in this partitioned namespace. */ + public List tables() { + ensureOpen(); + String json = tablesNative(nativeHandle); + return fromJson(json, new TypeReference<>() {}); + } + + /** Commit (currently not implemented on the Rust side). */ + public String commit(Object readVersionJson, Object newVersionJson) { + ensureOpen(); + return commitNative(nativeHandle, readVersionJson, newVersionJson); + } + + @Override + public void close() { + if (nativeHandle != 0) { + releaseNative(nativeHandle); + nativeHandle = 0; + } + } + + private void ensureOpen() { + Preconditions.checkArgument(nativeHandle != 0, "PartitionedNamespace is closed"); + } + + private static T fromJson(String json, TypeReference typeRef) { + try { + return OBJECT_MAPPER.readValue(json, typeRef); + } catch (JsonProcessingException e) { + throw new RuntimeException("Failed to deserialize JSON", e); + } + } + + private static String toJson(Object obj) { + try { + return OBJECT_MAPPER.writeValueAsString(obj); + } catch (JsonProcessingException e) { + throw new RuntimeException("Failed to serialize request to JSON", e); + } + } + + private static T fromJson(String json, Class clazz) { + try { + return OBJECT_MAPPER.readValue(json, clazz); + } catch (JsonProcessingException e) { + throw new RuntimeException("Failed to deserialize JSON", e); + } + } + + /** PlanScan result item. 
*/ + public static final class PlanScanItem { + @JsonProperty("table") + private PartitionTable table; + + @JsonProperty("refine_expr") + private String refineExpr; + + public PlanScanItem() {} + + public PartitionTable table() { + return table; + } + + public String refineExpr() { + return refineExpr; + } + } + + /** Partition table identifier. */ + public static final class PartitionTable { + @JsonProperty("id") + private List id; + + @JsonProperty("read_version") + private Long readVersion; + + public PartitionTable() {} + + public List id() { + return id; + } + + public Long readVersion() { + return readVersion; + } + } + + // Native methods + private native long createNative(Map configProperties); + + private native long createNativeWithProvider( + Map configProperties, DynamicContextProvider contextProvider); + + private native void releaseNative(long handle); + + private native String namespaceIdNative(long handle); + + private native String listNamespacesNative(long handle, String requestJson); + + private native String describeNamespaceNative(long handle, String requestJson); + + private native String createNamespaceNative(long handle, String requestJson); + + private native String dropNamespaceNative(long handle, String requestJson); + + private native void namespaceExistsNative(long handle, String requestJson); + + private native String listTablesNative(long handle, String requestJson); + + private native String describeTableNative(long handle, String requestJson); + + private native String registerTableNative(long handle, String requestJson); + + private native void tableExistsNative(long handle, String requestJson); + + private native String dropTableNative(long handle, String requestJson); + + private native String deregisterTableNative(long handle, String requestJson); + + private native long countTableRowsNative(long handle, String requestJson); + + private native String createTableNative(long handle, String requestJson, byte[] requestData); + + 
private native String declareTableNative(long handle, String requestJson); + + private native String insertIntoTableNative(long handle, String requestJson, byte[] requestData); + + private native String mergeInsertIntoTableNative( + long handle, String requestJson, byte[] requestData); + + private native String updateTableNative(long handle, String requestJson); + + private native String deleteFromTableNative(long handle, String requestJson); + + private native byte[] queryTableNative(long handle, String requestJson); + + private native String createTableIndexNative(long handle, String requestJson); + + private native String listTableIndicesNative(long handle, String requestJson); + + private native String describeTableIndexStatsNative(long handle, String requestJson); + + private native String describeTransactionNative(long handle, String requestJson); + + private native String alterTransactionNative(long handle, String requestJson); + + private native byte[] schemaNative(long handle); + + private native String planScanNative(long handle, String filter); + + private native String partitionSpecNative(long handle); + + private native void updateTableSpecNative( + long handle, Schema schema, List partitionFields); + + private native String resolveOrCreatePartitionTableNative(long handle, long arrowArrayStreamAddr); + + private native String tablesNative(long handle); + + private native String commitNative(long handle, Object readVersionJson, Object newVersionJson); +} diff --git a/python/Cargo.lock b/python/Cargo.lock index 38579810684..123775ee05d 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4326,14 +4326,18 @@ dependencies = [ "axum", "bytes", "chrono", + "datafusion-functions", "futures", "lance", + "lance-arrow", "lance-core", "lance-index", "lance-io", "lance-namespace", + "lance-namespace-reqwest-client", "lance-table", "log", + "murmur3", "object_store", "rand 0.9.2", "reqwest", @@ -4942,6 +4946,12 @@ version = "0.10.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" +[[package]] +name = "murmur3" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9252111cf132ba0929b6f8e030cac2a24b507f3a4d6db6fb2896f27b354c714b" + [[package]] name = "murmurhash32" version = "0.3.1" diff --git a/rust/lance-arrow/src/schema.rs b/rust/lance-arrow/src/schema.rs index 8ce9442b4e5..54bbf022271 100644 --- a/rust/lance-arrow/src/schema.rs +++ b/rust/lance-arrow/src/schema.rs @@ -4,9 +4,12 @@ //! Extension to arrow schema use arrow_schema::{ArrowError, DataType, Field, FieldRef, Schema}; +use std::cmp::max; use crate::{ARROW_EXT_NAME_KEY, BLOB_META_KEY, BLOB_V2_EXT_NAME}; +pub const LANCE_FIELD_ID_META_KEY: &str = "lance:field_id"; + pub enum Indentation { OneLine, MultiLine(u8), @@ -43,6 +46,18 @@ pub trait FieldExt { /// Check if the field is marked as a blob fn is_blob_v2(&self) -> bool; + + /// Get path and field with the input id + fn path_and_field_by_id( + &self, + id: i32, + prefix: &str, + ) -> Result, ArrowError>; + + /// Get the max field id of itself and all children. + /// + /// If `ignore_id_not_found` is true, ignore field without id. Otherwise, return error. 
+ fn max_id(&self, ignore_id_not_found: bool) -> Result, ArrowError>; } impl FieldExt for Field { @@ -119,6 +134,103 @@ impl FieldExt for Field { .map(|value| value == BLOB_V2_EXT_NAME) .unwrap_or(false) } + + fn path_and_field_by_id( + &self, + id: i32, + prefix: &str, + ) -> Result, ArrowError> { + let path = if prefix.is_empty() { + self.name().to_string() + } else { + format!("{}.{}", prefix, self.name()) + }; + + // find current field id + if let Some(id_str) = self.metadata().get(LANCE_FIELD_ID_META_KEY) { + let field_id: i32 = id_str.parse().map_err(|e| { + ArrowError::CastError(format!( + "Invalid {} metadata value '{}' for field '{}': {}", + LANCE_FIELD_ID_META_KEY, + id_str, + self.name(), + e + )) + })?; + + if id == field_id { + return Ok(Some((path, self.clone()))); + } + }; + + // find in children. + match self.data_type() { + DataType::Struct(fields) => { + for child in fields.iter() { + let v = child.path_and_field_by_id(id, &path)?; + if v.is_some() { + return Ok(v); + } + } + } + DataType::List(child) + | DataType::LargeList(child) + | DataType::FixedSizeList(child, _) => { + return child.path_and_field_by_id(id, &path); + } + DataType::Map(child, _) => { + return child.path_and_field_by_id(id, &path); + } + _ => {} + } + + Ok(None) + } + + fn max_id(&self, ignore_id_not_found: bool) -> Result, ArrowError> { + let id = match self.metadata().get(LANCE_FIELD_ID_META_KEY) { + Some(id_str) => id_str.parse::().ok(), + None => None, + }; + + if id.is_none() && !ignore_id_not_found { + return Err(ArrowError::CastError(format!( + "Invalid {} metadata value or value not found for field '{}'", + LANCE_FIELD_ID_META_KEY, + self.name(), + ))); + } + + let mut max_id = id.unwrap_or(-1); + match self.data_type() { + DataType::Struct(fields) => { + for child in fields.iter() { + if let Some(child_max_id) = child.max_id(ignore_id_not_found)? 
{ + max_id = max(max_id, child_max_id); + } + } + } + DataType::List(child) + | DataType::LargeList(child) + | DataType::FixedSizeList(child, _) => { + if let Some(child_max_id) = child.max_id(ignore_id_not_found)? { + max_id = max(max_id, child_max_id); + } + } + DataType::Map(child, _) => { + if let Some(child_max_id) = child.max_id(ignore_id_not_found)? { + max_id = max(max_id, child_max_id); + } + } + _ => {} + } + + if max_id == -1 { + Ok(None) + } else { + Ok(Some(max_id)) + } + } } /// Extends the functionality of [arrow_schema::Schema]. @@ -140,6 +252,14 @@ pub trait SchemaExt { /// /// This is intended for display purposes and not for serialization fn to_compact_string(&self, indent: Indentation) -> String; + + /// Get path and field with the input id + fn path_and_field_by_id(&self, id: i32) -> Result, ArrowError>; + + /// Get the max field id. + /// + /// If `ignore_id_not_found` is true, ignore field without id. Otherwise, return error. + fn max_id(&self, ignore_id_not_found: bool) -> Result, ArrowError>; } impl SchemaExt for Schema { @@ -204,4 +324,29 @@ impl SchemaExt for Schema { result += "}"; result } + + fn path_and_field_by_id(&self, id: i32) -> Result, ArrowError> { + for f in self.fields().iter() { + let v = f.path_and_field_by_id(id, "")?; + if v.is_some() { + return Ok(v); + } + } + Ok(None) + } + + fn max_id(&self, ignore_id_not_found: bool) -> Result, ArrowError> { + let mut max_id = -1; + for f in self.fields().iter() { + let max_field_id = f.max_id(ignore_id_not_found)?; + if let Some(max_field_id) = max_field_id { + max_id = max(max_id, max_field_id); + } + } + if max_id == -1 { + Ok(None) + } else { + Ok(Some(max_id)) + } + } } diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index 8c84e1bbe8b..3fabadd5408 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -13,8 +13,8 @@ rust-version.workspace = true [features] default = ["dir-aws", "dir-azure", 
"dir-gcp", "dir-oss", "dir-huggingface"] -rest = ["dep:reqwest", "dep:serde"] -rest-adapter = ["dep:axum", "dep:tower", "dep:tower-http", "dep:serde"] +rest = ["dep:reqwest"] +rest-adapter = ["dep:axum", "dep:tower", "dep:tower-http"] # Cloud storage features for directory implementation - align with lance-io dir-gcp = ["lance-io/gcp", "lance/gcp"] dir-aws = ["lance-io/aws", "lance/aws"] @@ -23,12 +23,15 @@ dir-oss = ["lance-io/oss", "lance/oss"] dir-huggingface = ["lance-io/huggingface", "lance/huggingface"] # Credential vending features credential-vendor-aws = ["dep:aws-sdk-sts", "dep:aws-config", "dep:aws-credential-types", "dep:sha2", "dep:base64"] -credential-vendor-gcp = ["dep:google-cloud-auth", "dep:reqwest", "dep:serde", "dep:sha2", "dep:base64"] +credential-vendor-gcp = ["dep:google-cloud-auth", "dep:reqwest", "dep:sha2", "dep:base64"] credential-vendor-azure = ["dep:azure_core", "dep:azure_identity", "dep:azure_storage", "dep:azure_storage_blobs", "dep:time", "dep:sha2", "dep:base64", "dep:reqwest"] [dependencies] lance-namespace.workspace = true lance-core.workspace = true +lance-namespace-reqwest-client.workspace = true +murmur3 = "0.5" +datafusion-functions.workspace = true # REST implementation dependencies (optional, enabled by "rest" feature) reqwest = { version = "0.12", optional = true, default-features = false, features = [ @@ -54,7 +57,7 @@ arrow-schema = { workspace = true } axum = { workspace = true, optional = true } tower = { workspace = true, optional = true } tower-http = { workspace = true, optional = true, features = ["trace", "cors", "normalize-path"] } -serde = { workspace = true, optional = true } +serde = { workspace = true } # Common dependencies async-trait.workspace = true @@ -83,6 +86,7 @@ azure_identity = { version = "0.21", optional = true } azure_storage = { version = "0.21", optional = true } azure_storage_blobs = { version = "0.21", optional = true } time = { version = "0.3", optional = true } +lance-arrow = { workspace = 
true } [dev-dependencies] tokio = { workspace = true, features = ["full"] } diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 73906198961..392cd2636c5 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -8,6 +8,7 @@ pub mod manifest; +use crate::context::DynamicContextProvider; use arrow::record_batch::RecordBatchIterator; use arrow_ipc::reader::StreamReader; use async_trait::async_trait; @@ -24,7 +25,6 @@ use std::collections::HashMap; use std::io::Cursor; use std::sync::Arc; -use crate::context::DynamicContextProvider; use lance_namespace::models::{ BatchDeleteTableVersionsRequest, BatchDeleteTableVersionsResponse, CreateNamespaceRequest, CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, CreateTableVersionRequest, @@ -41,6 +41,7 @@ use lance_core::{Error, Result, box_error}; use lance_namespace::LanceNamespace; use lance_namespace::schema::arrow_schema_to_json; +use crate::ManifestNamespace; use crate::credentials::{ CredentialVendor, create_credential_vendor_for_location, has_credential_vendor_config, }; @@ -599,6 +600,13 @@ impl std::fmt::Display for DirectoryNamespace { } impl DirectoryNamespace { + pub fn manifest_namespace(&self) -> Result> { + match self.manifest_ns { + Some(ref ns) => Ok(ns.clone()), + None => Err(Error::namespace("Not manifest namespace")), + } + } + /// Apply pagination to a list of table names /// /// Sorts the list alphabetically and applies pagination using page_token (start_after) and limit. diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index af78e41856c..89d3a37cabf 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -6,16 +6,22 @@ //! This module provides a namespace implementation that uses a manifest table //! to track tables and nested namespaces. 
-use arrow::array::{Array, RecordBatch, RecordBatchIterator, StringArray}; +use arrow::array::{Array, ArrayRef, RecordBatch, RecordBatchIterator, StringArray}; use arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use arrow_ipc::reader::StreamReader; +use arrow_schema::{FieldRef, Schema, SchemaRef}; use async_trait::async_trait; use bytes::Bytes; use futures::{FutureExt, stream::StreamExt}; use lance::dataset::optimize::{CompactionOptions, compact_files}; -use lance::dataset::{ReadParams, WriteParams, builder::DatasetBuilder}; +use lance::dataset::transaction::UpdateMapEntry; +use lance::dataset::{NewColumnTransform, ReadParams, WriteParams, builder::DatasetBuilder}; +use lance::deps::arrow_array::RecordBatchOptions; +use lance::deps::datafusion::logical_expr::Expr; +use lance::deps::datafusion::scalar::ScalarValue; use lance::session::Session; use lance::{Dataset, dataset::scanner::Scanner}; +use lance_arrow::RecordBatchExt; use lance_core::Error as LanceError; use lance_core::datatypes::LANCE_UNENFORCED_PRIMARY_KEY_POSITION; use lance_core::{Error, Result, box_error}; @@ -37,9 +43,12 @@ use lance_namespace::models::{ }; use lance_namespace::schema::arrow_schema_to_json; use object_store::path::Path; +use std::collections::HashSet; use std::io::Cursor; +use std::str::FromStr; use std::{ collections::HashMap, + f32, f64, hash::{DefaultHasher, Hash, Hasher}, ops::{Deref, DerefMut}, sync::Arc, @@ -47,7 +56,7 @@ use std::{ use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; const MANIFEST_TABLE_NAME: &str = "__manifest"; -const DELIMITER: &str = "$"; +pub(crate) const DELIMITER: &str = "$"; // Index names for the __manifest table /// BTREE index on the object_id column for fast lookups @@ -81,12 +90,18 @@ impl ObjectType { } } +pub enum ManifestObject { + Table(TableInfo), + Namespace(NamespaceInfo), +} + /// Information about a table stored in the manifest #[derive(Debug, Clone)] pub struct TableInfo { pub namespace: Vec, pub name: String, pub 
location: String, + pub properties: Option>, } /// Information about a namespace stored in the manifest @@ -224,23 +239,36 @@ impl DerefMut for DatasetWriteGuard<'_> { } } +/// Extended properties are special properties started with `lance.manifest.extended.` prefix, and +/// stored in the manifest table. +/// +/// For example, a namespace object contains metadata like: +/// ```json +/// { +/// "user_name": "Alice", +/// "lance.manifest.extended.user_id": "123456" +/// } +/// ``` +/// The first one is stored at column named "metadata", the second is stored at column named "user_id". +pub(crate) static EXTENDED_PREFIX: &str = "lance.manifest.extended."; + /// Manifest-based namespace implementation /// /// Uses a special `__manifest` Lance table to track tables and nested namespaces. pub struct ManifestNamespace { - root: String, - storage_options: Option>, + pub(crate) root: String, + pub(crate) storage_options: Option>, #[allow(dead_code)] session: Option>, #[allow(dead_code)] - object_store: Arc, + pub(crate) object_store: Arc, #[allow(dead_code)] - base_path: Path, - manifest_dataset: DatasetConsistencyWrapper, + pub(crate) base_path: Path, + pub(crate) manifest_dataset: DatasetConsistencyWrapper, /// Whether directory listing is enabled in dual mode /// If true, root namespace tables use {table_name}.lance naming /// If false, they use namespace-prefixed names - dir_listing_enabled: bool, + pub(crate) dir_listing_enabled: bool, /// Whether to perform inline optimization (compaction and indexing) on the __manifest table /// after every write. Defaults to true. inline_optimization_enabled: bool, @@ -274,13 +302,15 @@ impl std::fmt::Debug for ManifestNamespace { fn convert_lance_commit_error(e: &LanceError, operation: &str, object_id: Option<&str>) -> Error { match e { // CommitConflict: version collision retries exhausted -> Throttled (safe to retry) - LanceError::CommitConflict { .. 
} => NamespaceError::Throttled { - message: format!("Too many concurrent writes, please retry later: {:?}", e), + // TooMuchWriteContention: RetryableCommitConflict (semantic conflict) retries exhausted -> Throttled (safe to retry) + LanceError::CommitConflict { .. } | LanceError::TooMuchWriteContention { .. } => { + NamespaceError::Throttled { + message: format!("Too many concurrent writes, please retry later: {:?}", e), + } + .into() } - .into(), - // TooMuchWriteContention: RetryableCommitConflict (semantic conflict) retries exhausted -> ConcurrentModification // IncompatibleTransaction: incompatible concurrent change -> ConcurrentModification - LanceError::TooMuchWriteContention { .. } | LanceError::IncompatibleTransaction { .. } => { + LanceError::IncompatibleTransaction { .. } => { let message = if let Some(id) = object_id { format!( "Object '{}' was concurrently modified by another operation: {:?}", @@ -379,8 +409,80 @@ impl ManifestNamespace { } } + /// Add extended properties to the manifest table. + pub async fn add_extended_properties(&self, properties: &Vec<(&str, DataType)>) -> Result<()> { + let full_schema = self.full_manifest_schema().await?; + let fields: Vec = properties + .iter() + .map(|(name, data_type)| { + if !name.starts_with(EXTENDED_PREFIX) { + return Err(Error::io(format!( + "Extended properties key {} must start with prefix: {}", + name, EXTENDED_PREFIX + ))); + } + Ok(Field::new( + name.strip_prefix(EXTENDED_PREFIX).unwrap().to_string(), + data_type.clone(), + true, + )) + }) + .collect::>>()? 
+ .into_iter() + .filter(|f| full_schema.column_with_name(f.name()).is_none()) + .collect(); + + let schema = Schema::new(fields); + let transform = NewColumnTransform::AllNulls(Arc::new(schema)); + + let mut ds = self.manifest_dataset.get_mut().await?; + ds.add_columns(transform, None, None).await?; + + Ok(()) + } + + /// Get all extended properties keys + pub async fn get_extended_properties_keys(&self) -> Result> { + let basic_cols: HashSet = Self::basic_manifest_schema() + .fields + .iter() + .map(|f| f.name().to_string()) + .collect(); + let mut extended_props_keys = vec![]; + for f in self.full_manifest_schema().await?.fields.iter() { + if !basic_cols.contains(f.name().as_str()) { + extended_props_keys.push(format!("{}{}", EXTENDED_PREFIX, f.name())); + } + } + Ok(extended_props_keys) + } + + /// Remove extended properties from the manifest table. + pub async fn remove_extended_properties(&mut self, properties: &Vec<&str>) -> Result<()> { + let full_schema = self.full_manifest_schema().await?; + let to_remove: Vec = properties + .iter() + .map(|name| { + if !name.starts_with(EXTENDED_PREFIX) { + return Err(Error::io(format!( + "Extended properties key {} must start with prefix: {}", + name, EXTENDED_PREFIX + ))); + } + Ok(name.strip_prefix(EXTENDED_PREFIX).unwrap().to_string()) + }) + .collect::>>()? + .into_iter() + .filter(|s| full_schema.column_with_name(s.as_str()).is_some()) + .collect(); + let remove: Vec<&str> = to_remove.iter().map(|s| s.as_str()).collect(); + + let mut ds = self.manifest_dataset.get_mut().await?; + ds.drop_columns(&remove).await + } + /// Split an object ID (table_id as vec of strings) into namespace and table name - fn split_object_id(table_id: &[String]) -> (Vec, String) { + pub(crate) fn split_object_id(table_id: &[String]) -> (Vec, String) { if table_id.len() == 1 { (vec![], table_id[0].clone()) } else { @@ -402,7 +504,7 @@ impl ManifestNamespace { /// failed table creation, delete and create new table of the same name, etc. 
/// The object_id is added after the hash to ensure /// dir name uniqueness and make debugging easier. - fn generate_dir_name(object_id: &str) -> String { + pub(crate) fn generate_dir_name(object_id: &str) -> String { // Generate a random number for uniqueness let random_num: u64 = rand::random(); @@ -452,7 +554,7 @@ impl ManifestNamespace { /// 3. Optimizes existing indices /// /// This is called automatically after writes when inline_optimization_enabled is true. - async fn run_inline_optimization(&self) -> Result<()> { + pub(crate) async fn run_inline_optimization(&self) -> Result<()> { if !self.inline_optimization_enabled { return Ok(()); } @@ -595,8 +697,8 @@ impl ManifestNamespace { Ok(()) } - /// Get the manifest schema - fn manifest_schema() -> Arc { + /// Get the manifest schema of basic fields: object_id, object_type, location, metadata, base_objects + pub(crate) fn basic_manifest_schema() -> Arc { Arc::new(ArrowSchema::new(vec![ // Set unenforced primary key on object_id for bloom filter conflict detection Field::new("object_id", DataType::Utf8, false).with_metadata( @@ -618,6 +720,27 @@ impl ManifestNamespace { ])) } + /// Get the full manifest schema, including basic fields and extended fields. + pub(crate) async fn full_manifest_schema(&self) -> Result { + let dataset_guard = self.manifest_dataset.get().await?; + let schema = ArrowSchema::from(dataset_guard.schema()); + Ok(schema) + } + + /// Get the extended manifest schema, excluding basic fields. 
+ pub(crate) async fn extended_manifest_schema(&self) -> Result { + let full = self.full_manifest_schema().await?; + let basic = Self::basic_manifest_schema(); + let mut fields = vec![]; + for field in full.fields.into_iter() { + let name = field.name(); + if basic.column_with_name(name.as_str()).is_none() { + fields.push(field.clone()); + } + } + Ok(ArrowSchema::new(fields)) + } + /// Get a scanner for the manifest dataset async fn manifest_scanner(&self) -> Result { let dataset_guard = self.manifest_dataset.get().await?; @@ -658,7 +781,7 @@ impl ManifestNamespace { } /// Check if the manifest contains an object with the given ID - async fn manifest_contains_object(&self, object_id: &str) -> Result { + pub(crate) async fn manifest_contains_object(&self, object_id: &str) -> Result { let escaped_id = object_id.replace('\'', "''"); let filter = format!("object_id = '{}'", escaped_id); @@ -693,52 +816,28 @@ impl ManifestNamespace { } /// Query the manifest for a table with the given object ID - async fn query_manifest_for_table(&self, object_id: &str) -> Result> { + pub(crate) async fn query_manifest_for_table( + &self, + object_id: &str, + ) -> Result> { let escaped_id = object_id.replace('\'', "''"); let filter = format!("object_id = '{}' AND object_type = 'table'", escaped_id); - let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { - Error::io_source(box_error(std::io::Error::other(format!( - "Failed to filter: {}", - e - )))) - })?; - scanner.project(&["object_id", "location"]).map_err(|e| { - Error::io_source(box_error(std::io::Error::other(format!( - "Failed to project: {}", - e - )))) - })?; - let batches = Self::execute_scanner(scanner).await?; - let mut found_result: Option = None; - let mut total_rows = 0; - - for batch in batches { - if batch.num_rows() == 0 { + let objects = self.query_manifest(&filter).await?; + let mut found: Option = None; + for obj in objects { + let ManifestObject::Table(t) = obj else { continue; - 
} - - total_rows += batch.num_rows(); - if total_rows > 1 { + }; + if found.is_some() { return Err(Error::io(format!( - "Expected exactly 1 table with id '{}', found {}", - object_id, total_rows + "Expected exactly 1 table with id '{}', found more than 1", + object_id ))); } - - let object_id_array = Self::get_string_column(&batch, "object_id")?; - let location_array = Self::get_string_column(&batch, "location")?; - let location = location_array.value(0).to_string(); - let (namespace, name) = Self::parse_object_id(object_id_array.value(0)); - found_result = Some(TableInfo { - namespace, - name, - location, - }); + found = Some(t); } - - Ok(found_result) + Ok(found) } /// List all table locations in the manifest (for root namespace only) @@ -782,22 +881,23 @@ impl ManifestNamespace { object_type: ObjectType, location: Option, ) -> Result<()> { - self.insert_into_manifest_with_metadata(object_id, object_type, location, None, None) + self.insert_into_manifest_with_metadata(object_id, object_type, location, None, None, None) .await } /// Insert an entry into the manifest table with metadata and base_objects - async fn insert_into_manifest_with_metadata( + pub(crate) async fn insert_into_manifest_with_metadata( &self, object_id: String, object_type: ObjectType, location: Option, metadata: Option, base_objects: Option>, + extended_batch: Option, ) -> Result<()> { use arrow::array::builder::{ListBuilder, StringBuilder}; - let schema = Self::manifest_schema(); + let basic_schema = Self::basic_manifest_schema(); // Create base_objects array from the provided list let string_builder = StringBuilder::new(); @@ -833,7 +933,7 @@ impl ManifestNamespace { }; let batch = RecordBatch::try_new( - schema.clone(), + basic_schema.clone(), vec![ Arc::new(StringArray::from(vec![object_id.as_str()])), Arc::new(StringArray::from(vec![object_type.as_str()])), @@ -844,7 +944,15 @@ impl ManifestNamespace { ) .map_err(|e| Error::io(format!("Failed to create manifest entry: {}", e)))?; - let 
reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + // Merge extended_batch with basic batch if provided + let batch = if let Some(extended_batch) = extended_batch { + batch.merge(&extended_batch)? + } else { + batch + }; + + let schema = batch.schema(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); // Use MergeInsert to ensure uniqueness on object_id let dataset_guard = self.manifest_dataset.get().await?; @@ -860,6 +968,8 @@ impl ManifestNamespace { )))) })?; + // disable index to enable merge_insert with primary key dedupe + merge_builder.use_index(false); merge_builder.when_matched(lance::dataset::WhenMatched::Fail); merge_builder.when_not_matched(lance::dataset::WhenNotMatched::InsertAll); // conflict_retries=0: no outer loop retry on semantic conflicts (handled by caller) @@ -945,8 +1055,26 @@ impl ManifestNamespace { .await } + /// Get metadata of __manifest table + pub async fn get_metadata(&self) -> Result> { + let ds = self.manifest_dataset.get().await?; + Ok(ds.metadata().clone()) + } + + /// Update metadata to __manifest table + pub async fn update_metadata( + &self, + values: impl IntoIterator>, + ) -> Result> { + let mut ds = self.manifest_dataset.get_mut().await?; + ds.update_metadata(values).await + } + /// Validate that all levels of a namespace path exist - async fn validate_namespace_levels_exist(&self, namespace_path: &[String]) -> Result<()> { + pub(crate) async fn validate_namespace_levels_exist( + &self, + namespace_path: &[String], + ) -> Result<()> { for i in 1..=namespace_path.len() { let partial_path = &namespace_path[..i]; let object_id = partial_path.join(DELIMITER); @@ -963,65 +1091,60 @@ impl ManifestNamespace { async fn query_manifest_for_namespace(&self, object_id: &str) -> Result> { let escaped_id = object_id.replace('\'', "''"); let filter = format!("object_id = '{}' AND object_type = 'namespace'", escaped_id); + + let objects = self.query_manifest(&filter).await?; + let mut found: Option = 
None; + for obj in objects { + let ManifestObject::Namespace(ns) = obj else { + continue; + }; + if found.is_some() { + return Err(Error::io(format!( + "Expected exactly 1 namespace with id '{}', found more than 1", + object_id + ))); + } + found = Some(ns); + } + Ok(found) + } + + pub(crate) async fn query_manifest_expr(&self, expr: Expr) -> Result> { let mut scanner = self.manifest_scanner().await?; - scanner.filter(&filter).map_err(|e| { + scanner.filter_expr(expr); + + let batches = Self::execute_scanner(scanner).await?; + Self::parse_manifest_objects(batches) + } + + pub(crate) async fn query_manifest(&self, filter: &str) -> Result> { + let mut scanner = self.manifest_scanner().await?; + scanner.filter(filter).map_err(|e| { Error::io_source(box_error(std::io::Error::other(format!( "Failed to filter: {}", e )))) })?; - scanner.project(&["object_id", "metadata"]).map_err(|e| { - Error::io_source(box_error(std::io::Error::other(format!( - "Failed to project: {}", - e - )))) - })?; - let batches = Self::execute_scanner(scanner).await?; - - let mut found_result: Option = None; - let mut total_rows = 0; - for batch in batches { - if batch.num_rows() == 0 { - continue; - } + let batches = Self::execute_scanner(scanner).await?; + Self::parse_manifest_objects(batches) + } - total_rows += batch.num_rows(); - if total_rows > 1 { - return Err(Error::io(format!( - "Expected exactly 1 namespace with id '{}', found {}", - object_id, total_rows - ))); + fn parse_manifest_objects(batches: Vec) -> Result> { + let mut objects: Vec = vec![]; + + for batch in batches.iter() { + for row_idx in 0..batch.num_rows() { + let sliced_columns: Vec> = batch + .columns() + .iter() + .map(|col| col.slice(row_idx, 1)) + .collect(); + let row = RecordBatch::try_new(batch.schema(), sliced_columns)?; + objects.push(parse_manifest_object(&row)?); } - - let object_id_array = Self::get_string_column(&batch, "object_id")?; - let metadata_array = Self::get_string_column(&batch, "metadata")?; - - let 
object_id_str = object_id_array.value(0); - let metadata = if !metadata_array.is_null(0) { - let metadata_str = metadata_array.value(0); - match serde_json::from_str::>(metadata_str) { - Ok(map) => Some(map), - Err(e) => { - return Err(Error::io(format!( - "Failed to deserialize metadata for namespace '{}': {}", - object_id, e - ))); - } - } - } else { - None - }; - - let (namespace, name) = Self::parse_object_id(object_id_str); - found_result = Some(NamespaceInfo { - namespace, - name, - metadata, - }); } - - Ok(found_result) + Ok(objects) } /// Create or load the manifest dataset, ensuring it has the latest schema setup. @@ -1090,7 +1213,7 @@ impl ManifestNamespace { Ok(DatasetConsistencyWrapper::new(dataset)) } else { log::info!("Creating new manifest table at {}", manifest_path); - let schema = Self::manifest_schema(); + let schema = Self::basic_manifest_schema(); let empty_batch = RecordBatch::new_empty(schema.clone()); let reader = RecordBatchIterator::new(vec![Ok(empty_batch)], schema.clone()); @@ -1172,6 +1295,77 @@ impl ManifestNamespace { } } } + + pub(crate) fn build_metadata_json( + properties: &Option>, + ) -> Option { + properties.as_ref().and_then(|props| { + if props.is_empty() { + None + } else { + let meta_props = props + .iter() + .filter(|(key, _)| !key.starts_with(EXTENDED_PREFIX)) + .collect::>(); + Some(serde_json::to_string(&meta_props).ok()?) + } + }) + } +} + +/// Parse one row of __manifest table into manifest object. 
+fn parse_manifest_object(batch: &RecordBatch) -> Result { + if batch.num_rows() == 0 { + return Err(Error::invalid_input("batch must have at least one row")); + } + + // Parse properties + let mut merged = batch_to_extended_props(batch); + let metadata_array = ManifestNamespace::get_string_column(batch, "metadata")?; + + if !metadata_array.is_null(0) { + let metadata_str = metadata_array.value(0); + match serde_json::from_str::>(metadata_str) { + Ok(map) => merged.extend(map), + Err(e) => { + return Err(Error::io(format!("Failed to deserialize metadata: {}", e))); + } + } + } + + let properties = if merged.is_empty() { + None + } else { + Some(merged) + }; + + // Parse manifest object + let object_type = ManifestNamespace::get_string_column(batch, "object_type")?; + let object_type = object_type.value(0).to_string(); + match object_type.as_str() { + "namespace" => { + let object_id_array = ManifestNamespace::get_string_column(batch, "object_id")?; + let (namespace, name) = ManifestNamespace::parse_object_id(object_id_array.value(0)); + Ok(ManifestObject::Namespace(NamespaceInfo { + namespace, + name, + metadata: properties, + })) + } + "table" => { + let object_id_array = ManifestNamespace::get_string_column(batch, "object_id")?; + let location_array = ManifestNamespace::get_string_column(batch, "location")?; + let location = location_array.value(0).to_string(); + let (namespace, name) = ManifestNamespace::parse_object_id(object_id_array.value(0)); + Ok(ManifestObject::Table(TableInfo { + namespace, + name, + location, + properties, + })) + } + t => Err(Error::internal(format!("Unknown object type {}", t))), + } } #[async_trait] @@ -1280,6 +1474,7 @@ impl LanceNamespace for ManifestNamespace { location: Some(table_uri.clone()), table_uri: Some(table_uri), storage_options, + properties: info.properties, ..Default::default() }); } @@ -1305,6 +1500,7 @@ impl LanceNamespace for ManifestNamespace { table_uri: Some(table_uri), schema: Some(Box::new(json_schema)), 
storage_options, + properties: info.properties, ..Default::default() }) } @@ -1316,6 +1512,7 @@ impl LanceNamespace for ManifestNamespace { location: Some(table_uri.clone()), table_uri: Some(table_uri), storage_options, + properties: info.properties, ..Default::default() }) } @@ -1356,97 +1553,14 @@ impl LanceNamespace for ManifestNamespace { request: CreateTableRequest, data: Bytes, ) -> Result { - let table_id = request - .id - .as_ref() - .ok_or_else(|| Error::invalid_input_source("Table ID is required".into()))?; - - if table_id.is_empty() { - return Err(Error::invalid_input_source( - "Table ID cannot be empty".into(), - )); - } - - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); - - // Check if table already exists in manifest - if self.manifest_contains_object(&object_id).await? { - return Err(Error::io(format!("Table '{}' already exists", table_name))); - } - - // Create the physical table location with hash-based naming - // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance - // Otherwise, use hash-based naming: {hash}_{object_id} - let dir_name = if namespace.is_empty() && self.dir_listing_enabled { - // Root table with directory listing enabled: use {table_name}.lance - format!("{}.lance", table_name) + let extended_batch = if let Some(props) = &request.properties { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? 
} else { - // Child namespace table or dir listing disabled: use hash-based naming - Self::generate_dir_name(&object_id) + None }; - let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; - - // Validate that request_data is provided - if data.is_empty() { - return Err(Error::namespace_source( - "Request data (Arrow IPC stream) is required for create_table".into(), - )); - } - - // Write the data using Lance Dataset - let cursor = Cursor::new(data.to_vec()); - let stream_reader = StreamReader::try_new(cursor, None) - .map_err(|e| Error::io(format!("Failed to read IPC stream: {}", e)))?; - - let batches: Vec = - stream_reader - .collect::, _>>() - .map_err(|e| Error::io(format!("Failed to collect batches: {}", e)))?; - - if batches.is_empty() { - return Err(Error::io("No data provided for table creation")); - } - - let schema = batches[0].schema(); - let batch_results: Vec> = - batches.into_iter().map(Ok).collect(); - let reader = RecordBatchIterator::new(batch_results, schema); - let store_params = ObjectStoreParams { - storage_options_accessor: self.storage_options.as_ref().map(|opts| { - Arc::new( - lance_io::object_store::StorageOptionsAccessor::with_static_options( - opts.clone(), - ), - ) - }), - ..Default::default() - }; - let write_params = WriteParams { - session: self.session.clone(), - store_params: Some(store_params), - ..Default::default() - }; - let _dataset = Dataset::write(Box::new(reader), &table_uri, Some(write_params)) + self.create_table_extended(request, data, extended_batch) .await - .map_err(|e| { - Error::io_source(box_error(std::io::Error::other(format!( - "Failed to write dataset: {}", - e - )))) - })?; - - // Register in manifest (store dir_name, not full URI) - self.insert_into_manifest(object_id, ObjectType::Table, Some(dir_name)) - .await?; - - Ok(CreateTableResponse { - version: Some(1), - location: Some(table_uri), - storage_options: self.storage_options.clone(), - ..Default::default() - }) } async fn drop_table(&self, 
request: DropTableRequest) -> Result { @@ -1594,74 +1708,28 @@ impl LanceNamespace for ManifestNamespace { &self, request: CreateNamespaceRequest, ) -> Result { + self.create_namespace_extended(request, None).await + } + + async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result { let namespace_id = request .id .as_ref() .ok_or_else(|| Error::invalid_input_source("Namespace ID is required".into()))?; - // Root namespace always exists and cannot be created + // Root namespace always exists and cannot be dropped if namespace_id.is_empty() { return Err(Error::namespace_source( - "Root namespace already exists and cannot be created".into(), + "Root namespace cannot be dropped".into(), )); } - // Validate parent namespaces exist (but not the namespace being created) - if namespace_id.len() > 1 { - self.validate_namespace_levels_exist(&namespace_id[..namespace_id.len() - 1]) - .await?; - } - let object_id = namespace_id.join(DELIMITER); - if self.manifest_contains_object(&object_id).await? { + + // Check if namespace exists + if !self.manifest_contains_object(&object_id).boxed().await? { return Err(Error::namespace_source( - format!("Namespace '{}' already exists", object_id).into(), - )); - } - - // Serialize properties if provided - let metadata = request.properties.as_ref().and_then(|props| { - if props.is_empty() { - None - } else { - Some(serde_json::to_string(props).ok()?) 
- } - }); - - self.insert_into_manifest_with_metadata( - object_id, - ObjectType::Namespace, - None, - metadata, - None, - ) - .await?; - - Ok(CreateNamespaceResponse { - properties: request.properties, - ..Default::default() - }) - } - - async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result { - let namespace_id = request - .id - .as_ref() - .ok_or_else(|| Error::invalid_input_source("Namespace ID is required".into()))?; - - // Root namespace always exists and cannot be dropped - if namespace_id.is_empty() { - return Err(Error::namespace_source( - "Root namespace cannot be dropped".into(), - )); - } - - let object_id = namespace_id.join(DELIMITER); - - // Check if namespace exists - if !self.manifest_contains_object(&object_id).boxed().await? { - return Err(Error::namespace_source( - format!("Namespace '{}' not found", object_id).into(), + format!("Namespace '{}' not found", object_id).into(), )); } @@ -1727,105 +1795,12 @@ impl LanceNamespace for ManifestNamespace { } async fn declare_table(&self, request: DeclareTableRequest) -> Result { - let table_id = request - .id - .as_ref() - .ok_or_else(|| Error::invalid_input_source("Table ID is required".into()))?; - - if table_id.is_empty() { - return Err(Error::invalid_input_source( - "Table ID cannot be empty".into(), - )); - } - - let (namespace, table_name) = Self::split_object_id(table_id); - let object_id = Self::build_object_id(&namespace, &table_name); - - // Check if table already exists in manifest - let existing = self.query_manifest_for_table(&object_id).await?; - if existing.is_some() { - return Err(Error::namespace_source( - format!("Table '{}' already exists", table_name).into(), - )); - } - - // Create table location path with hash-based naming - // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance - // Otherwise, use hash-based naming: {hash}_{object_id} - let dir_name = if namespace.is_empty() && self.dir_listing_enabled { - // Root 
table with directory listing enabled: use {table_name}.lance - format!("{}.lance", table_name) - } else { - // Child namespace table or dir listing disabled: use hash-based naming - Self::generate_dir_name(&object_id) - }; - let table_path = self.base_path.child(dir_name.as_str()); - let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; - - // Validate location if provided - if let Some(req_location) = &request.location { - let req_location = req_location.trim_end_matches('/'); - if req_location != table_uri { - return Err(Error::namespace_source( - format!( - "Cannot declare table {} at location {}, must be at location {}", - table_name, req_location, table_uri - ) - .into(), - )); - } - } - - // Create the .lance-reserved file to mark the table as existing - let reserved_file_path = table_path.child(".lance-reserved"); - - self.object_store - .create(&reserved_file_path) - .await - .map_err(|e| { - Error::namespace_source( - format!( - "Failed to create .lance-reserved file for table {}: {}", - table_name, e - ) - .into(), - ) - })? - .shutdown() - .await - .map_err(|e| { - Error::namespace_source( - format!( - "Failed to finalize .lance-reserved file for table {}: {}", - table_name, e - ) - .into(), - ) - })?; - - // Add entry to manifest marking this as a declared table (store dir_name, not full path) - self.insert_into_manifest(object_id, ObjectType::Table, Some(dir_name)) - .await?; - - log::info!( - "Declared table '{}' in manifest at {}", - table_name, - table_uri - ); - - // For backwards compatibility, only skip vending credentials when explicitly set to false - let vend_credentials = request.vend_credentials.unwrap_or(true); - let storage_options = if vend_credentials { - self.storage_options.clone() + let extended_batch = if let Some(props) = &request.properties { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? 
} else { None }; - - Ok(DeclareTableResponse { - location: Some(table_uri), - storage_options, - ..Default::default() - }) + self.declare_table_extended(request, extended_batch).await } async fn register_table(&self, request: RegisterTableRequest) -> Result { @@ -1881,12 +1856,29 @@ impl LanceNamespace for ManifestNamespace { )); } + // Serialize properties and compute extended batch if provided + let metadata = Self::build_metadata_json(&request.properties); + + let extended_batch = if let Some(props) = &request.properties { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? + } else { + None + }; + // Register the table with its location in the manifest - self.insert_into_manifest(object_id, ObjectType::Table, Some(location.clone())) - .await?; + self.insert_into_manifest_with_metadata( + object_id, + ObjectType::Table, + Some(location.clone()), + metadata, + None, + extended_batch, + ) + .await?; Ok(RegisterTableResponse { location: Some(location), + properties: request.properties.clone(), ..Default::default() }) } @@ -1933,128 +1925,1268 @@ impl LanceNamespace for ManifestNamespace { } } -#[cfg(test)] -mod tests { - use crate::{DirectoryNamespaceBuilder, ManifestNamespace}; - use bytes::Bytes; - use lance_core::utils::tempfile::TempStdDir; - use lance_namespace::LanceNamespace; - use lance_namespace::models::{ - CreateNamespaceRequest, CreateTableRequest, DescribeTableRequest, DropTableRequest, - ListTablesRequest, TableExistsRequest, - }; - use rstest::rstest; - - fn create_test_ipc_data() -> Vec { - use arrow::array::{Int32Array, StringArray}; - use arrow::datatypes::{DataType, Field, Schema}; - use arrow::ipc::writer::StreamWriter; - use arrow::record_batch::RecordBatch; - use std::sync::Arc; +impl ManifestNamespace { + /// Create a namespace with extended properties. 
+ pub async fn create_namespace_extended( + &self, + request: CreateNamespaceRequest, + extended_record: Option, + ) -> Result { + let namespace_id = request + .id + .as_ref() + .ok_or_else(|| Error::invalid_input_source("Namespace ID is required".into()))?; - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - ])); + // Root namespace always exists and cannot be created + if namespace_id.is_empty() { + return Err(Error::namespace_source( + "Root namespace already exists and cannot be created".into(), + )); + } - let batch = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![1, 2, 3])), - Arc::new(StringArray::from(vec!["a", "b", "c"])), - ], - ) - .unwrap(); + // Validate parent namespaces exist. + if namespace_id.len() > 1 { + self.validate_namespace_levels_exist(&namespace_id[..namespace_id.len() - 1]) + .await?; + } - let mut buffer = Vec::new(); - { - let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap(); - writer.write(&batch).unwrap(); - writer.finish().unwrap(); + // Fail fast if the namespace already exists. + let object_id = namespace_id.join(DELIMITER); + if self.manifest_contains_object(&object_id).await? 
{ + return Err(Error::namespace_source( + format!("Namespace '{}' already exists", object_id).into(), + )); } - buffer - } - #[rstest] - #[case::with_optimization(true)] - #[case::without_optimization(false)] - #[tokio::test] - async fn test_manifest_namespace_basic_create_and_list(#[case] inline_optimization: bool) { - let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap(); + // Serialize properties and compute extended batch if provided + let metadata = Self::build_metadata_json(&request.properties); - // Create a DirectoryNamespace with manifest enabled (default) - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) - .inline_optimization_enabled(inline_optimization) - .build() - .await - .unwrap(); + // Serialize properties and compute extended batch if provided + let extended_record = match (extended_record, &request.properties) { + (None, None) => None, + (Some(extended_record), None) => Some(extended_record), + (None, Some(props)) => { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? 
+ } + (Some(extended_record), Some(props)) => { + let extended_record = merge_extended_record_and_properties( + Arc::new(self.extended_manifest_schema().await?), + &[Some(extended_record)], + &[Some(props.clone())], + )?; + Some(extended_record) + } + }; - // Verify we can list tables (should be empty) - let mut request = ListTablesRequest::new(); - request.id = Some(vec![]); - let response = dir_namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 0); + self.insert_into_manifest_with_metadata( + object_id, + ObjectType::Namespace, + None, + metadata, + None, + extended_record, + ) + .await?; - // Create a test table - let buffer = create_test_ipc_data(); - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec!["test_table".to_string()]); + Ok(CreateNamespaceResponse { + properties: request.properties, + ..Default::default() + }) + } - let _response = dir_namespace - .create_table(create_request, Bytes::from(buffer)) - .await - .unwrap(); + /// Declare a table with extended properties (metadata only operation). 
+ pub async fn declare_table_extended( + &self, + request: DeclareTableRequest, + extended_record: Option, + ) -> Result { + let table_id = request + .id + .as_ref() + .ok_or_else(|| Error::invalid_input_source("Table ID is required".into()))?; - // List tables again - should see our new table - let mut request = ListTablesRequest::new(); - request.id = Some(vec![]); - let response = dir_namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 1); - assert_eq!(response.tables[0], "test_table"); - } + if table_id.is_empty() { + return Err(Error::invalid_input_source( + "Table ID cannot be empty".into(), + )); + } - #[rstest] - #[case::with_optimization(true)] - #[case::without_optimization(false)] - #[tokio::test] - async fn test_manifest_namespace_table_exists(#[case] inline_optimization: bool) { - let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap(); + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + // Check if table already exists in manifest + let existing = self.query_manifest_for_table(&object_id).await?; + if existing.is_some() { + return Err(Error::namespace_source( + format!("Table '{}' already exists", table_name).into(), + )); + } + + // Serialize properties and compute extended batch if provided + let metadata = Self::build_metadata_json(&request.properties); + let extended_record = match (extended_record, &request.properties) { + (None, None) => None, + (Some(extended_record), None) => Some(extended_record), + (None, Some(props)) => { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? 
+ } + (Some(extended_record), Some(props)) => { + let extended_record = merge_extended_record_and_properties( + Arc::new(self.extended_manifest_schema().await?), + &[Some(extended_record)], + &[Some(props.clone())], + )?; + Some(extended_record) + } + }; + + // Create table location path with hash-based naming + // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance + // Otherwise, use hash-based naming: {hash}_{object_id} + let dir_name = if namespace.is_empty() && self.dir_listing_enabled { + // Root table with directory listing enabled: use {table_name}.lance + format!("{}.lance", table_name) + } else { + // Child namespace table or dir listing disabled: use hash-based naming + Self::generate_dir_name(&object_id) + }; + let table_path = self.base_path.child(dir_name.as_str()); + let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; + + // Validate location if provided + if let Some(req_location) = &request.location { + let req_location = req_location.trim_end_matches('/'); + if req_location != table_uri { + return Err(Error::namespace_source( + format!( + "Cannot declare table {} at location {}, must be at location {}", + table_name, req_location, table_uri + ) + .into(), + )); + } + } + + // Create the .lance-reserved file to mark the table as existing + let reserved_file_path = table_path.child(".lance-reserved"); + + self.object_store + .create(&reserved_file_path) + .await + .map_err(|e| { + Error::namespace_source( + format!( + "Failed to create .lance-reserved file for table {}: {}", + table_name, e + ) + .into(), + ) + })? 
+ .shutdown() + .await + .map_err(|e| { + Error::namespace_source( + format!( + "Failed to finalize .lance-reserved file for table {}: {}", + table_name, e + ) + .into(), + ) + })?; + + // Add entry to manifest marking this as a declared table (store dir_name, not full path) + self.insert_into_manifest_with_metadata( + object_id, + ObjectType::Table, + Some(dir_name), + metadata, + None, + extended_record, + ) + .await?; + + log::info!( + "Declared table '{}' in manifest at {}", + table_name, + table_uri + ); + + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + + Ok(DeclareTableResponse { + location: Some(table_uri), + storage_options, + properties: request.properties.clone(), + ..Default::default() + }) + } + + /// Create a table with extended properties. + pub async fn create_table_extended( + &self, + request: CreateTableRequest, + data: Bytes, + extended_record: Option, + ) -> Result { + let table_id = request + .id + .as_ref() + .ok_or_else(|| Error::invalid_input_source("Table ID is required".into()))?; + + if table_id.is_empty() { + return Err(Error::invalid_input_source( + "Table ID cannot be empty".into(), + )); + } + + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); + + // Serialize properties and compute extended batch if provided + let metadata = Self::build_metadata_json(&request.properties); + let extended_batch = match (extended_record, &request.properties) { + (None, None) => None, + (Some(extended_record), None) => Some(extended_record), + (None, Some(props)) => { + batch_from_extended_props(props, &self.full_manifest_schema().await?)? 
+ } + (Some(extended_record), Some(props)) => { + let extended_record = merge_extended_record_and_properties( + Arc::new(self.extended_manifest_schema().await?), + &[Some(extended_record)], + &[Some(props.clone())], + )?; + Some(extended_record) + } + }; + + // Check if table already exists in manifest + if self.manifest_contains_object(&object_id).await? { + return Err(Error::io(format!("Table '{}' already exists", table_name))); + } + + // Create the physical table location with hash-based naming + // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance + // Otherwise, use hash-based naming: {hash}_{object_id} + let dir_name = if namespace.is_empty() && self.dir_listing_enabled { + // Root table with directory listing enabled: use {table_name}.lance + format!("{}.lance", table_name) + } else { + // Child namespace table or dir listing disabled: use hash-based naming + Self::generate_dir_name(&object_id) + }; + let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; + + // Validate that request_data is provided + if data.is_empty() { + return Err(Error::namespace_source( + "Request data (Arrow IPC stream) is required for create_table".into(), + )); + } + + // Write the data using Lance Dataset. + // + // NOTE: create_table_extended should support creating an empty table (0 rows) + // when the IPC stream contains schema but no record batches. + let cursor = Cursor::new(data.to_vec()); + let stream_reader = StreamReader::try_new(cursor, None) + .map_err(|e| Error::io(format!("Failed to read IPC stream: {}", e)))?; + let schema = stream_reader.schema(); + + let mut batches: Vec = stream_reader + .collect::, _>>() + .map_err(|e| Error::io(format!("Failed to collect batches: {}", e)))?; + + if batches.is_empty() { + // Schema-only stream: create empty dataset. 
+ batches.push(RecordBatch::new_empty(schema.clone())); + } + + let batch_results: Vec> = + batches.into_iter().map(Ok).collect(); + let reader = RecordBatchIterator::new(batch_results, schema); + + let store_params = ObjectStoreParams { + storage_options_accessor: self.storage_options.as_ref().map(|opts| { + Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + opts.clone(), + ), + ) + }), + ..Default::default() + }; + let write_params = WriteParams { + session: self.session.clone(), + store_params: Some(store_params), + ..Default::default() + }; + let _dataset = Dataset::write(Box::new(reader), &table_uri, Some(write_params)) + .await + .map_err(|e| Error::io(format!("Failed to write dataset: {}", e)))?; + + // Register in manifest (store dir_name, not full URI) + self.insert_into_manifest_with_metadata( + object_id, + ObjectType::Table, + Some(dir_name), + metadata, + None, + extended_batch, + ) + .await?; + + Ok(CreateTableResponse { + version: Some(1), + location: Some(table_uri), + storage_options: self.storage_options.clone(), + properties: request.properties.clone(), + ..Default::default() + }) + } +} + +/// Parse the first row of a RecordBatch into a HashMap, excluding specified columns. +fn batch_to_extended_props(batch: &RecordBatch) -> HashMap { + // Collect basic columns to excluded + let basic_schema = ManifestNamespace::basic_manifest_schema(); + let mut excluded: Vec<&str> = vec![]; + for field in basic_schema.fields.iter() { + excluded.push(field.name()); + } + + // Transform batch to properties + let mut result = HashMap::new(); + + if batch.num_rows() == 0 { + return result; + } + + for (i, field) in batch.schema().fields().iter().enumerate() { + let col_name = field.name().to_string(); + if excluded.contains(&col_name.as_str()) { + continue; + } + + let array = batch.column(i); + + if array.is_null(0) { + // skip null properties. 
+ continue; + } + + let Ok(scalar) = ScalarValue::try_from_array(array.as_ref(), 0) else { + continue; + }; + + let Ok(value_str) = scalar_to_str(&scalar) else { + continue; + }; + + if let Some(value) = value_str + && !value.is_empty() + { + result.insert(format!("{}{}", EXTENDED_PREFIX, col_name), value); + } + } + + result +} + +/// Convert a HashMap into a RecordBatch, excluding specified columns. +pub(crate) fn batch_from_extended_props( + map: &HashMap, + schema: &Schema, +) -> Result> { + // Collect basic columns to excluded + let basic_schema = ManifestNamespace::basic_manifest_schema(); + let mut excluded: Vec<&str> = vec![]; + for field in basic_schema.fields.iter() { + excluded.push(field.name()); + } + + fn is_nullish_extended_value(v: &str) -> bool { + v.is_empty() || v.eq_ignore_ascii_case("null") + } + + // All non-null extended properties must be covered in schema. + for (k, v) in map.iter() { + if is_nullish_extended_value(v) { + continue; + } + if let Some(col_name) = k.strip_prefix(EXTENDED_PREFIX) { + if excluded.contains(&col_name) { + return Err(Error::invalid_input(format!( + "Column {} is preserved.", + col_name + ))); + } + if schema.column_with_name(col_name).is_none() { + return Err(Error::invalid_input(format!( + "Column {} does not exist in extended properties", + col_name + ))); + } + } + } + + // Construct record batch + let mut array: Vec = vec![]; + let mut fields: Vec = vec![]; + for field in schema + .fields() + .iter() + .filter(|field| !excluded.contains(&field.name().as_str())) + { + let field_name = field.name().as_str(); + + match map.get(&format!("{}{}", EXTENDED_PREFIX, field_name)) { + Some(value) if !is_nullish_extended_value(value) => { + let scalar = scalar_from_str(field.data_type(), value)?; + let v = scalar.to_array().map_err(|e| { + Error::io(format!( + "Failed to convert scalar for column '{}' to array: {}", + field_name, e + )) + })?; + array.push(v); + fields.push(field.clone()); + } + _ => {} + } + } + + if 
fields.is_empty() { + return Ok(None); + } + + let schema = Schema::new(fields); + Ok(Some(RecordBatch::try_new(Arc::new(schema), array)?)) +} + +pub(crate) fn scalar_to_str(scalar: &ScalarValue) -> Result> { + if scalar.is_null() { + return Ok(None); + } + + match scalar { + ScalarValue::Utf8(Some(v)) + | ScalarValue::Utf8View(Some(v)) + | ScalarValue::LargeUtf8(Some(v)) => Ok(Some(v.clone())), + ScalarValue::Boolean(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Int32(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Int64(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::UInt32(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::UInt64(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Float32(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Float64(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Date32(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Date64(Some(v)) => Ok(Some(v.to_string())), + ScalarValue::Binary(Some(v)) + | ScalarValue::LargeBinary(Some(v)) + | ScalarValue::BinaryView(Some(v)) + | ScalarValue::FixedSizeBinary(_, Some(v)) => Ok(Some(bytes_to_hex(v))), + _ => Err(Error::invalid_input(format!( + "Unsupported extended scalar: {:?}", + scalar + ))), + } +} + +pub(crate) fn scalar_from_str(dt: &DataType, value: &str) -> Result { + match dt { + DataType::Utf8 => Ok(ScalarValue::Utf8(Some(value.to_string()))), + DataType::LargeUtf8 => Ok(ScalarValue::LargeUtf8(Some(value.to_string()))), + DataType::Boolean => Ok(ScalarValue::Boolean(Some(bool::from_str(value).map_err( + |e| Error::invalid_input(format!("Invalid boolean '{}': {}", value, e)), + )?))), + DataType::Int32 => Ok(ScalarValue::Int32(Some(i32::from_str(value).map_err( + |e| Error::invalid_input(format!("Invalid int32 '{}': {}", value, e)), + )?))), + DataType::Int64 => Ok(ScalarValue::Int64(Some(i64::from_str(value).map_err( + |e| Error::invalid_input(format!("Invalid int64 '{}': {}", value, e)), + )?))), + DataType::UInt32 => 
Ok(ScalarValue::UInt32(Some(u32::from_str(value).map_err( + |e| Error::invalid_input(format!("Invalid uint32 '{}': {}", value, e)), + )?))), + DataType::UInt64 => Ok(ScalarValue::UInt64(Some(u64::from_str(value).map_err( + |e| Error::invalid_input(format!("Invalid uint64 '{}': {}", value, e)), + )?))), + DataType::Float32 => Ok(ScalarValue::Float32(Some(f32::from_str(value).map_err( + |e| Error::invalid_input(format!("Invalid float32 '{}': {}", value, e)), + )?))), + DataType::Float64 => Ok(ScalarValue::Float64(Some(f64::from_str(value).map_err( + |e| Error::invalid_input(format!("Invalid float64 '{}': {}", value, e)), + )?))), + DataType::Date32 => Ok(ScalarValue::Date32(Some(i32::from_str(value).map_err( + |e| Error::invalid_input(format!("Invalid date32 '{}': {}", value, e)), + )?))), + DataType::Date64 => Ok(ScalarValue::Date64(Some(i64::from_str(value).map_err( + |e| Error::invalid_input(format!("Invalid date64 '{}': {}", value, e)), + )?))), + DataType::Binary => Ok(ScalarValue::Binary(Some(hex_to_bytes(value)?))), + DataType::LargeBinary => Ok(ScalarValue::LargeBinary(Some(hex_to_bytes(value)?))), + _ => Err(Error::invalid_input(format!( + "Unsupported extended column type: {:?}", + dt + ))), + } +} + +fn bytes_to_hex(bytes: &[u8]) -> String { + let mut out = String::with_capacity(bytes.len() * 2); + for b in bytes { + use std::fmt::Write; + let _ = write!(&mut out, "{:02x}", b); + } + out +} + +fn hex_to_bytes(s: &str) -> Result> { + let s = s.strip_prefix("0x").unwrap_or(s); + if !s.len().is_multiple_of(2) { + return Err(Error::invalid_input(format!( + "Invalid hex string length {}", + s.len() + ))); + } + + let mut out = Vec::with_capacity(s.len() / 2); + let bytes = s.as_bytes(); + for i in (0..bytes.len()).step_by(2) { + let hex = std::str::from_utf8(&bytes[i..i + 2]) + .map_err(|e| Error::invalid_input(format!("Invalid hex string encoding: {}", e)))?; + let v = u8::from_str_radix(hex, 16) + .map_err(|e| Error::invalid_input(format!("Invalid hex byte 
'{}': {}", hex, e)))?; + out.push(v); + } + Ok(out) +} + +fn merge_extended_record_and_properties( + schema: SchemaRef, + extended_records: &[Option], + props_vec: &[Option>], +) -> Result { + let mut columns: Vec = Vec::with_capacity(schema.fields().len()); + + if extended_records.len() != props_vec.len() { + return Err(Error::invalid_input(format!( + "extended_records length {} must match props length {}", + extended_records.len(), + props_vec.len() + ))); + } + + let n = props_vec.len(); + if n == 0 { + return Ok(RecordBatch::new_empty(schema)); + } + + for f in schema.fields() { + let name = f.name().as_str(); + columns.push(build_extended_column_array( + name, + f.data_type(), + extended_records, + props_vec, + )?); + } + + let options = RecordBatchOptions::new().with_row_count(Some(n)); + let batch = RecordBatch::try_new_with_options(schema, columns, &options) + .map_err(|e| Error::io(format!("Failed to create manifest batch: {}", e)))?; + Ok(batch) +} + +fn build_extended_column_array( + col_name: &str, + data_type: &DataType, + extended_records: &[Option], + props_vec: &[Option>], +) -> Result { + if extended_records.len() != props_vec.len() { + return Err(Error::invalid_input(format!( + "extended_records length {} must match props length {}", + extended_records.len(), + props_vec.len() + ))); + } + + let key = format!("{}{}", EXTENDED_PREFIX, col_name); + let null_scalar = ScalarValue::try_from(data_type).map_err(|e| { + Error::io(format!( + "Failed to create null scalar for column {}: {}", + col_name, e + )) + })?; + + let mut scalars: Vec = Vec::with_capacity(extended_records.len()); + for (record_opt, props_opt) in extended_records.iter().zip(props_vec.iter()) { + let from_extended = record_opt + .as_ref() + .map(|record| scalar_from_record(record, col_name, Some(data_type))) + .transpose()? + .flatten(); + + let scalar = if let Some(s) = from_extended { + s + } else if let Some(s) = scalar_from_extended_props(props_opt.as_ref(), &key, data_type)? 
{ + s + } else { + null_scalar.clone() + }; + + scalars.push(scalar); + } + + Ok(ScalarValue::iter_to_array(scalars.into_iter())?) +} + +fn get_column_checked( + batch: &RecordBatch, + name: &str, + expected_type: Option<&DataType>, +) -> Result> { + let Some(col) = batch.column_by_name(name) else { + return Ok(None); + }; + + if let Some(expected_dt) = expected_type + && col.data_type() != expected_dt + { + return Err(Error::invalid_input(format!( + "batch column '{}' has type {:?}, expected {:?}", + name, + col.data_type(), + expected_dt + ))); + } + + Ok(Some(col.clone())) +} + +// Parse first row of record as scalar value +fn scalar_from_record( + record: &RecordBatch, + col_name: &str, + data_type: Option<&DataType>, +) -> Result> { + let Some(col) = get_column_checked(record, col_name, data_type)? else { + return Ok(None); + }; + if col.is_null(0) { + return Ok(None); + } + + ScalarValue::try_from_array(col.as_ref(), 0) + .map(Some) + .map_err(|e| { + Error::internal(format!( + "Failed to convert column '{}' to scalar: {}", + col_name, e + )) + }) +} + +fn scalar_from_extended_props( + props: Option<&HashMap>, + key: &str, + data_type: &DataType, +) -> Result> { + let v = props.and_then(|m| m.get(key)); + match v { + Some(s) if s != "null" && !s.is_empty() => { + Ok(Some(crate::dir::manifest::scalar_from_str(data_type, s)?)) + } + _ => Ok(None), + } +} + +#[cfg(test)] +mod tests { + use crate::dir::manifest::batch_to_extended_props; + use crate::{DirectoryNamespaceBuilder, ManifestNamespace}; + use arrow::array::{Array, Int32Array, StringArray}; + use arrow::datatypes::{Field, Schema}; + use arrow::record_batch::RecordBatch; + use arrow_ipc::writer::StreamWriter; + use arrow_schema::DataType; + use bytes::Bytes; + use lance_core::utils::tempfile::TempStdDir; + use lance_io::object_store::ObjectStore; + use lance_namespace::LanceNamespace; + use lance_namespace::models::{ + CreateNamespaceRequest as ClientCreateNamespaceRequest, CreateTableRequest, + 
DeclareTableRequest, DescribeNamespaceRequest, DescribeTableRequest, DropNamespaceRequest, + DropTableRequest, ListNamespacesRequest, ListTablesRequest, NamespaceExistsRequest, + RegisterTableRequest, TableExistsRequest, + }; + use rstest::rstest; + use std::collections::HashMap; + use std::sync::Arc; + + fn create_test_ipc_data() -> Vec { + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::ipc::writer::StreamWriter; + use arrow::record_batch::RecordBatch; + use std::sync::Arc; + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["a", "b", "c"])), + ], + ) + .unwrap(); + + let mut buffer = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + } + buffer + } + + fn create_empty_ipc_data() -> Vec { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, true), + Field::new("name", DataType::Utf8, true), + ])); + + let mut buffer = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buffer, &schema).unwrap(); + writer.finish().unwrap(); + } + buffer + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_manifest_namespace_basic_create_and_list(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // Create a DirectoryNamespace with manifest enabled (default) + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Verify we can list tables (should be empty) + let mut request = ListTablesRequest::new(); + request.id = 
Some(vec![]); + let response = dir_namespace.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 0); + + // Create a test table + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + + let _response = dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // List tables again - should see our new table + let mut request = ListTablesRequest::new(); + request.id = Some(vec![]); + let response = dir_namespace.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 1); + assert_eq!(response.tables[0], "test_table"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_manifest_namespace_table_exists(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Check non-existent table + let mut request = TableExistsRequest::new(); + request.id = Some(vec!["nonexistent".to_string()]); + let result = dir_namespace.table_exists(request).await; + assert!(result.is_err()); + + // Create table + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // Check existing table + let mut request = TableExistsRequest::new(); + request.id = Some(vec!["test_table".to_string()]); + let result = dir_namespace.table_exists(request).await; + assert!(result.is_ok()); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_manifest_namespace_describe_table(#[case] 
inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Describe non-existent table + let mut request = DescribeTableRequest::new(); + request.id = Some(vec!["nonexistent".to_string()]); + let result = dir_namespace.describe_table(request).await; + assert!(result.is_err()); + + // Create table + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // Describe existing table + let mut request = DescribeTableRequest::new(); + request.id = Some(vec!["test_table".to_string()]); + let response = dir_namespace.describe_table(request).await.unwrap(); + assert!(response.location.is_some()); + assert!(response.location.unwrap().contains("test_table")); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_manifest_namespace_drop_table(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create table + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // Verify table exists + let mut request = ListTablesRequest::new(); + request.id = Some(vec![]); + let response = dir_namespace.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 1); + + // Drop table + let mut 
drop_request = DropTableRequest::new(); + drop_request.id = Some(vec!["test_table".to_string()]); + let _response = dir_namespace.drop_table(drop_request).await.unwrap(); + + // Verify table is gone + let mut request = ListTablesRequest::new(); + request.id = Some(vec![]); + let response = dir_namespace.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 0); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_manifest_namespace_multiple_tables(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create multiple tables + let buffer = create_test_ipc_data(); + for i in 1..=3 { + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec![format!("table{}", i)]); + dir_namespace + .create_table(create_request, Bytes::from(buffer.clone())) + .await + .unwrap(); + } + + // List all tables + let mut request = ListTablesRequest::new(); + request.id = Some(vec![]); + let response = dir_namespace.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 3); + assert!(response.tables.contains(&"table1".to_string())); + assert!(response.tables.contains(&"table2".to_string())); + assert!(response.tables.contains(&"table3".to_string())); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_directory_only_mode(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // Create a DirectoryNamespace with manifest disabled + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .inline_optimization_enabled(inline_optimization) + .build() + .await + 
.unwrap(); + + // Verify we can list tables (should be empty) + let mut request = ListTablesRequest::new(); + request.id = Some(vec![]); + let response = dir_namespace.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 0); + + // Create a test table + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + + // Create table - this should use directory-only mode + let _response = dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // List tables - should see our new table + let mut request = ListTablesRequest::new(); + request.id = Some(vec![]); + let response = dir_namespace.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 1); + assert_eq!(response.tables[0], "test_table"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_dual_mode_merge(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // Create a DirectoryNamespace with both manifest and directory enabled + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(true) + .dir_listing_enabled(true) .inline_optimization_enabled(inline_optimization) .build() .await .unwrap(); - // Check non-existent table - let mut request = TableExistsRequest::new(); - request.id = Some(vec!["nonexistent".to_string()]); - let result = dir_namespace.table_exists(request).await; + // Create tables through manifest + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["table1".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // List tables - should see table from both manifest and directory + let mut request = ListTablesRequest::new(); + request.id = 
Some(vec![]); + let response = dir_namespace.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 1); + assert_eq!(response.tables[0], "table1"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_manifest_only_mode(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // Create a DirectoryNamespace with only manifest enabled + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(true) + .dir_listing_enabled(false) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create table + let buffer = create_test_ipc_data(); + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + dir_namespace + .create_table(create_request, Bytes::from(buffer)) + .await + .unwrap(); + + // List tables - should only use manifest + let mut request = ListTablesRequest::new(); + request.id = Some(vec![]); + let response = dir_namespace.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 1); + assert_eq!(response.tables[0], "test_table"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_drop_nonexistent_table(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Try to drop non-existent table + let mut drop_request = DropTableRequest::new(); + drop_request.id = Some(vec!["nonexistent".to_string()]); + let result = dir_namespace.drop_table(drop_request).await; assert!(result.is_err()); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + 
#[tokio::test] + async fn test_create_duplicate_table_fails(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); // Create table let buffer = create_test_ipc_data(); let mut create_request = CreateTableRequest::new(); create_request.id = Some(vec!["test_table".to_string()]); dir_namespace + .create_table(create_request, Bytes::from(buffer.clone())) + .await + .unwrap(); + + // Try to create table with same name - should fail + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(vec!["test_table".to_string()]); + let result = dir_namespace .create_table(create_request, Bytes::from(buffer)) + .await; + assert!(result.is_err()); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_child_namespace(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() .await .unwrap(); - // Check existing table - let mut request = TableExistsRequest::new(); - request.id = Some(vec!["test_table".to_string()]); - let result = dir_namespace.table_exists(request).await; + // Create a child namespace + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["ns1".to_string()]); + let result = dir_namespace.create_namespace(create_req).await; + assert!( + result.is_ok(), + "Failed to create child namespace: {:?}", + result.err() + ); + + // Verify namespace exists + let exists_req = NamespaceExistsRequest { + id: Some(vec!["ns1".to_string()]), + ..Default::default() + }; + let result = dir_namespace.namespace_exists(exists_req).await; + 
assert!(result.is_ok(), "Namespace should exist"); + + // List child namespaces of root + let list_req = ListNamespacesRequest { + id: Some(vec![]), + page_token: None, + limit: None, + ..Default::default() + }; + let result = dir_namespace.list_namespaces(list_req).await; + assert!(result.is_ok()); + let namespaces = result.unwrap(); + assert_eq!(namespaces.namespaces.len(), 1); + assert_eq!(namespaces.namespaces[0], "ns1"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_nested_namespace(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create parent namespace + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["parent".to_string()]); + dir_namespace.create_namespace(create_req).await.unwrap(); + + // Create nested child namespace + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["parent".to_string(), "child".to_string()]); + let result = dir_namespace.create_namespace(create_req).await; + assert!( + result.is_ok(), + "Failed to create nested namespace: {:?}", + result.err() + ); + + // Verify nested namespace exists + let exists_req = NamespaceExistsRequest { + id: Some(vec!["parent".to_string(), "child".to_string()]), + ..Default::default() + }; + let result = dir_namespace.namespace_exists(exists_req).await; + assert!(result.is_ok(), "Nested namespace should exist"); + + // List child namespaces of parent + let list_req = ListNamespacesRequest { + id: Some(vec!["parent".to_string()]), + page_token: None, + limit: None, + ..Default::default() + }; + let result = dir_namespace.list_namespaces(list_req).await; assert!(result.is_ok()); + let namespaces = result.unwrap(); + 
assert_eq!(namespaces.namespaces.len(), 1); + assert_eq!(namespaces.namespaces[0], "child"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_namespace_without_parent_fails(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Try to create nested namespace without parent + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["nonexistent_parent".to_string(), "child".to_string()]); + let result = dir_namespace.create_namespace(create_req).await; + assert!(result.is_err(), "Should fail when parent doesn't exist"); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_drop_child_namespace(#[case] inline_optimization: bool) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + // Create a child namespace + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["ns1".to_string()]); + dir_namespace.create_namespace(create_req).await.unwrap(); + + // Drop the namespace + let mut drop_req = DropNamespaceRequest::new(); + drop_req.id = Some(vec!["ns1".to_string()]); + let result = dir_namespace.drop_namespace(drop_req).await; + assert!( + result.is_ok(), + "Failed to drop namespace: {:?}", + result.err() + ); + + // Verify namespace no longer exists + let exists_req = NamespaceExistsRequest { + id: Some(vec!["ns1".to_string()]), + ..Default::default() + }; + let result = dir_namespace.namespace_exists(exists_req).await; + assert!(result.is_err(), 
"Namespace should not exist after drop"); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_manifest_namespace_describe_table(#[case] inline_optimization: bool) { + async fn test_drop_namespace_with_children_fails(#[case] inline_optimization: bool) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2064,34 +3196,27 @@ mod tests { .await .unwrap(); - // Describe non-existent table - let mut request = DescribeTableRequest::new(); - request.id = Some(vec!["nonexistent".to_string()]); - let result = dir_namespace.describe_table(request).await; - assert!(result.is_err()); + // Create parent and child namespaces + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["parent".to_string()]); + dir_namespace.create_namespace(create_req).await.unwrap(); - // Create table - let buffer = create_test_ipc_data(); - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec!["test_table".to_string()]); - dir_namespace - .create_table(create_request, Bytes::from(buffer)) - .await - .unwrap(); + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["parent".to_string(), "child".to_string()]); + dir_namespace.create_namespace(create_req).await.unwrap(); - // Describe existing table - let mut request = DescribeTableRequest::new(); - request.id = Some(vec!["test_table".to_string()]); - let response = dir_namespace.describe_table(request).await.unwrap(); - assert!(response.location.is_some()); - assert!(response.location.unwrap().contains("test_table")); + // Try to drop parent namespace - should fail because it has children + let mut drop_req = DropNamespaceRequest::new(); + drop_req.id = Some(vec!["parent".to_string()]); + let result = dir_namespace.drop_namespace(drop_req).await; + assert!(result.is_err(), "Should fail when namespace has children"); } #[rstest] #[case::with_optimization(true)] 
#[case::without_optimization(false)] #[tokio::test] - async fn test_manifest_namespace_drop_table(#[case] inline_optimization: bool) { + async fn test_create_table_in_child_namespace(#[case] inline_optimization: bool) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2101,38 +3226,43 @@ mod tests { .await .unwrap(); - // Create table - let buffer = create_test_ipc_data(); - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec!["test_table".to_string()]); - dir_namespace - .create_table(create_request, Bytes::from(buffer)) - .await - .unwrap(); - - // Verify table exists - let mut request = ListTablesRequest::new(); - request.id = Some(vec![]); - let response = dir_namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 1); + // Create a child namespace + let mut create_ns_req = ClientCreateNamespaceRequest::new(); + create_ns_req.id = Some(vec!["ns1".to_string()]); + dir_namespace.create_namespace(create_ns_req).await.unwrap(); - // Drop table - let mut drop_request = DropTableRequest::new(); - drop_request.id = Some(vec!["test_table".to_string()]); - let _response = dir_namespace.drop_table(drop_request).await.unwrap(); + // Create a table in the child namespace + let buffer = create_test_ipc_data(); + let mut create_table_req = CreateTableRequest::new(); + create_table_req.id = Some(vec!["ns1".to_string(), "table1".to_string()]); + let result = dir_namespace + .create_table(create_table_req, Bytes::from(buffer)) + .await; + assert!( + result.is_ok(), + "Failed to create table in child namespace: {:?}", + result.err() + ); - // Verify table is gone - let mut request = ListTablesRequest::new(); - request.id = Some(vec![]); - let response = dir_namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 0); + // List tables in the namespace + let list_req = ListTablesRequest { + id: Some(vec!["ns1".to_string()]), + page_token: None, + limit: None, + 
..Default::default() + }; + let result = dir_namespace.list_tables(list_req).await; + assert!(result.is_ok()); + let tables = result.unwrap(); + assert_eq!(tables.tables.len(), 1); + assert_eq!(tables.tables[0], "table1"); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_manifest_namespace_multiple_tables(#[case] inline_optimization: bool) { + async fn test_describe_child_namespace(#[case] inline_optimization: bool) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2142,726 +3272,1074 @@ mod tests { .await .unwrap(); - // Create multiple tables - let buffer = create_test_ipc_data(); - for i in 1..=3 { - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec![format!("table{}", i)]); - dir_namespace - .create_table(create_request, Bytes::from(buffer.clone())) + // Create a child namespace with properties + let mut properties = std::collections::HashMap::new(); + properties.insert("key1".to_string(), "value1".to_string()); + + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["ns1".to_string()]); + create_req.properties = Some(properties.clone()); + dir_namespace.create_namespace(create_req).await.unwrap(); + + // Describe the namespace + let describe_req = DescribeNamespaceRequest { + id: Some(vec!["ns1".to_string()]), + ..Default::default() + }; + let result = dir_namespace.describe_namespace(describe_req).await; + assert!( + result.is_ok(), + "Failed to describe namespace: {:?}", + result.err() + ); + let response = result.unwrap(); + assert!(response.properties.is_some()); + assert_eq!( + response.properties.unwrap().get("key1"), + Some(&"value1".to_string()) + ); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_concurrent_create_and_drop_single_instance(#[case] inline_optimization: bool) { + use futures::future::join_all; + 
use std::sync::Arc; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let dir_namespace = Arc::new( + DirectoryNamespaceBuilder::new(temp_path) + .inline_optimization_enabled(inline_optimization) + .build() .await - .unwrap(); + .unwrap(), + ); + + // Initialize namespace first - create parent namespace to ensure __manifest table + // is created before concurrent operations + let mut create_ns_request = ClientCreateNamespaceRequest::new(); + create_ns_request.id = Some(vec!["test_ns".to_string()]); + dir_namespace + .create_namespace(create_ns_request) + .await + .unwrap(); + + let num_tables = 10; + let mut handles = Vec::new(); + + for i in 0..num_tables { + let ns = dir_namespace.clone(); + let handle = async move { + let table_name = format!("concurrent_table_{}", i); + let table_id = vec!["test_ns".to_string(), table_name.clone()]; + let buffer = create_test_ipc_data(); + + // Create table + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(table_id.clone()); + ns.create_table(create_request, Bytes::from(buffer)) + .await + .unwrap_or_else(|e| panic!("Failed to create table {}: {}", table_name, e)); + + // Drop table + let mut drop_request = DropTableRequest::new(); + drop_request.id = Some(table_id); + ns.drop_table(drop_request) + .await + .unwrap_or_else(|e| panic!("Failed to drop table {}: {}", table_name, e)); + + Ok::<_, lance_core::Error>(()) + }; + handles.push(handle); } - // List all tables + let results = join_all(handles).await; + for result in results { + assert!(result.is_ok(), "All concurrent operations should succeed"); + } + + // Verify all tables are dropped let mut request = ListTablesRequest::new(); - request.id = Some(vec![]); + request.id = Some(vec!["test_ns".to_string()]); let response = dir_namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 3); - assert!(response.tables.contains(&"table1".to_string())); - 
assert!(response.tables.contains(&"table2".to_string())); - assert!(response.tables.contains(&"table3".to_string())); + assert_eq!(response.tables.len(), 0, "All tables should be dropped"); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_directory_only_mode(#[case] inline_optimization: bool) { + async fn test_concurrent_create_and_drop_multiple_instances(#[case] inline_optimization: bool) { + use futures::future::join_all; + let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap(); + let temp_path = temp_dir.to_str().unwrap().to_string(); - // Create a DirectoryNamespace with manifest disabled - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) - .manifest_enabled(false) + // Initialize namespace first with a single instance to ensure __manifest + // table is created and parent namespace exists before concurrent operations + let init_ns = DirectoryNamespaceBuilder::new(&temp_path) .inline_optimization_enabled(inline_optimization) .build() .await .unwrap(); + let mut create_ns_request = ClientCreateNamespaceRequest::new(); + create_ns_request.id = Some(vec!["test_ns".to_string()]); + init_ns.create_namespace(create_ns_request).await.unwrap(); - // Verify we can list tables (should be empty) - let mut request = ListTablesRequest::new(); - request.id = Some(vec![]); - let response = dir_namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 0); + let num_tables = 10; + let mut handles = Vec::new(); - // Create a test table - let buffer = create_test_ipc_data(); - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec!["test_table".to_string()]); + for i in 0..num_tables { + let path = temp_path.clone(); + let handle = async move { + // Each task creates its own namespace instance + let ns = DirectoryNamespaceBuilder::new(&path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); 
- // Create table - this should use directory-only mode - let _response = dir_namespace - .create_table(create_request, Bytes::from(buffer)) + let table_name = format!("multi_ns_table_{}", i); + let table_id = vec!["test_ns".to_string(), table_name.clone()]; + let buffer = create_test_ipc_data(); + + // Create table + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(table_id.clone()); + ns.create_table(create_request, Bytes::from(buffer)) + .await + .unwrap_or_else(|e| panic!("Failed to create table {}: {}", table_name, e)); + + // Drop table + let mut drop_request = DropTableRequest::new(); + drop_request.id = Some(table_id); + ns.drop_table(drop_request) + .await + .unwrap_or_else(|e| panic!("Failed to drop table {}: {}", table_name, e)); + + Ok::<_, lance_core::Error>(()) + }; + handles.push(handle); + } + + let results = join_all(handles).await; + for result in results { + assert!(result.is_ok(), "All concurrent operations should succeed"); + } + + // Verify with a fresh namespace instance + let verify_ns = DirectoryNamespaceBuilder::new(&temp_path) + .inline_optimization_enabled(inline_optimization) + .build() .await .unwrap(); - // List tables - should see our new table let mut request = ListTablesRequest::new(); - request.id = Some(vec![]); - let response = dir_namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 1); - assert_eq!(response.tables[0], "test_table"); + request.id = Some(vec!["test_ns".to_string()]); + let response = verify_ns.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 0, "All tables should be dropped"); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_dual_mode_merge(#[case] inline_optimization: bool) { + async fn test_concurrent_create_then_drop_from_different_instance( + #[case] inline_optimization: bool, + ) { + use futures::future::join_all; + let temp_dir = TempStdDir::default(); - let 
temp_path = temp_dir.to_str().unwrap(); + let temp_path = temp_dir.to_str().unwrap().to_string(); - // Create a DirectoryNamespace with both manifest and directory enabled - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) - .manifest_enabled(true) - .dir_listing_enabled(true) + // Initialize namespace first with a single instance to ensure __manifest + // table is created and parent namespace exists before concurrent operations + let init_ns = DirectoryNamespaceBuilder::new(&temp_path) .inline_optimization_enabled(inline_optimization) .build() .await .unwrap(); + let mut create_ns_request = ClientCreateNamespaceRequest::new(); + create_ns_request.id = Some(vec!["test_ns".to_string()]); + init_ns.create_namespace(create_ns_request).await.unwrap(); - // Create tables through manifest - let buffer = create_test_ipc_data(); - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec!["table1".to_string()]); - dir_namespace - .create_table(create_request, Bytes::from(buffer)) + let num_tables = 10; + + // Phase 1: Create all tables concurrently using separate namespace instances + let mut create_handles = Vec::new(); + for i in 0..num_tables { + let path = temp_path.clone(); + let handle = async move { + let ns = DirectoryNamespaceBuilder::new(&path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + let table_name = format!("cross_instance_table_{}", i); + let table_id = vec!["test_ns".to_string(), table_name.clone()]; + let buffer = create_test_ipc_data(); + + let mut create_request = CreateTableRequest::new(); + create_request.id = Some(table_id); + ns.create_table(create_request, Bytes::from(buffer)) + .await + .unwrap_or_else(|e| panic!("Failed to create table {}: {}", table_name, e)); + + Ok::<_, lance_core::Error>(()) + }; + create_handles.push(handle); + } + + let create_results = join_all(create_handles).await; + for result in create_results { + assert!(result.is_ok(), "All create 
operations should succeed"); + } + + // Phase 2: Drop all tables concurrently using NEW namespace instances + let mut drop_handles = Vec::new(); + for i in 0..num_tables { + let path = temp_path.clone(); + let handle = async move { + let ns = DirectoryNamespaceBuilder::new(&path) + .inline_optimization_enabled(inline_optimization) + .build() + .await + .unwrap(); + + let table_name = format!("cross_instance_table_{}", i); + let table_id = vec!["test_ns".to_string(), table_name.clone()]; + + let mut drop_request = DropTableRequest::new(); + drop_request.id = Some(table_id); + ns.drop_table(drop_request) + .await + .unwrap_or_else(|e| panic!("Failed to drop table {}: {}", table_name, e)); + + Ok::<_, lance_core::Error>(()) + }; + drop_handles.push(handle); + } + + let drop_results = join_all(drop_handles).await; + for result in drop_results { + assert!(result.is_ok(), "All drop operations should succeed"); + } + + // Verify all tables are dropped + let verify_ns = DirectoryNamespaceBuilder::new(&temp_path) + .inline_optimization_enabled(inline_optimization) + .build() .await .unwrap(); - // List tables - should see table from both manifest and directory let mut request = ListTablesRequest::new(); - request.id = Some(vec![]); - let response = dir_namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 1); - assert_eq!(response.tables[0], "table1"); + request.id = Some(vec!["test_ns".to_string()]); + let response = verify_ns.list_tables(request).await.unwrap(); + assert_eq!(response.tables.len(), 0, "All tables should be dropped"); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_manifest_only_mode(#[case] inline_optimization: bool) { + async fn test_add_extended_properties_creates_columns_and_idempotent( + #[case] inline_optimization: bool, + ) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = 
create_manifest_namespace_for_test(temp_path, inline_optimization).await; + let schema = manifest_ns.full_manifest_schema().await.unwrap(); + assert_eq!( + ManifestNamespace::basic_manifest_schema().fields().len(), + schema.fields().len() + ); - // Create a DirectoryNamespace with only manifest enabled - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) - .manifest_enabled(true) - .dir_listing_enabled(false) - .inline_optimization_enabled(inline_optimization) - .build() + // Adding extended properties should create new columns + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.score", DataType::Int32), + ]) .await .unwrap(); - // Create table - let buffer = create_test_ipc_data(); - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec!["test_table".to_string()]); - dir_namespace - .create_table(create_request, Bytes::from(buffer)) + let schema = manifest_ns.full_manifest_schema().await.unwrap(); + let user_field = schema.field_with_name("user_id").unwrap(); + assert_eq!(user_field.data_type(), &DataType::Utf8); + let score_field = schema.field_with_name("score").unwrap(); + assert_eq!(score_field.data_type(), &DataType::Int32); + let initial_field_count = schema.fields().len(); + + // Adding the same properties again should be a no-op + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.score", DataType::Int32), + ]) .await .unwrap(); - - // List tables - should only use manifest - let mut request = ListTablesRequest::new(); - request.id = Some(vec![]); - let response = dir_namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 1); - assert_eq!(response.tables[0], "test_table"); + let schema_after = manifest_ns.full_manifest_schema().await.unwrap(); + assert_eq!(schema_after.fields().len(), initial_field_count); } #[rstest] 
#[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_drop_nonexistent_table(#[case] inline_optimization: bool) { + async fn test_add_extended_properties_rejects_missing_prefix( + #[case] inline_optimization: bool, + ) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); - - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) - .inline_optimization_enabled(inline_optimization) - .build() - .await - .unwrap(); - - // Try to drop non-existent table - let mut drop_request = DropTableRequest::new(); - drop_request.id = Some(vec!["nonexistent".to_string()]); - let result = dir_namespace.drop_table(drop_request).await; + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + let result = manifest_ns + .add_extended_properties(&vec![("invalid_key", DataType::Utf8)]) + .await; assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("must start with prefix")); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_create_duplicate_table_fails(#[case] inline_optimization: bool) { + async fn test_remove_extended_properties_drops_specified_columns( + #[case] inline_optimization: bool, + ) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); - - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) - .inline_optimization_enabled(inline_optimization) - .build() + let mut manifest_ns = + create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.group", DataType::Utf8), + ]) .await .unwrap(); + let schema = manifest_ns.full_manifest_schema().await.unwrap(); + assert!(schema.field_with_name("user_id").is_ok()); + assert!(schema.field_with_name("group").is_ok()); - // Create table 
- let buffer = create_test_ipc_data(); - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec!["test_table".to_string()]); - dir_namespace - .create_table(create_request, Bytes::from(buffer.clone())) + manifest_ns + .remove_extended_properties(&vec!["lance.manifest.extended.user_id"]) .await .unwrap(); + let schema_after = manifest_ns.full_manifest_schema().await.unwrap(); + assert!(schema_after.field_with_name("user_id").is_err()); + assert!(schema_after.field_with_name("group").is_ok()); - // Try to create table with same name - should fail - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(vec!["test_table".to_string()]); - let result = dir_namespace - .create_table(create_request, Bytes::from(buffer)) - .await; - assert!(result.is_err()); + // Remove non-existent property should be a no-op + manifest_ns + .remove_extended_properties(&vec!["lance.manifest.extended.user_id"]) + .await + .unwrap(); + let schema_after = manifest_ns.full_manifest_schema().await.unwrap(); + assert!(schema_after.field_with_name("user_id").is_err()); + assert!(schema_after.field_with_name("group").is_ok()); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_create_child_namespace(#[case] inline_optimization: bool) { - use lance_namespace::models::{ - CreateNamespaceRequest, ListNamespacesRequest, NamespaceExistsRequest, - }; - + async fn test_remove_extended_properties_rejects_missing_prefix( + #[case] inline_optimization: bool, + ) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); - - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) - .inline_optimization_enabled(inline_optimization) - .build() - .await - .unwrap(); - - // Create a child namespace - let mut create_req = CreateNamespaceRequest::new(); - create_req.id = Some(vec!["ns1".to_string()]); - let result = dir_namespace.create_namespace(create_req).await; - 
assert!( - result.is_ok(), - "Failed to create child namespace: {:?}", - result.err() - ); - - // Verify namespace exists - let exists_req = NamespaceExistsRequest { - id: Some(vec!["ns1".to_string()]), - ..Default::default() - }; - let result = dir_namespace.namespace_exists(exists_req).await; - assert!(result.is_ok(), "Namespace should exist"); - - // List child namespaces of root - let list_req = ListNamespacesRequest { - id: Some(vec![]), - page_token: None, - limit: None, - ..Default::default() - }; - let result = dir_namespace.list_namespaces(list_req).await; - assert!(result.is_ok()); - let namespaces = result.unwrap(); - assert_eq!(namespaces.namespaces.len(), 1); - assert_eq!(namespaces.namespaces[0], "ns1"); + let mut manifest_ns = + create_manifest_namespace_for_test(temp_path, inline_optimization).await; + let result = manifest_ns + .remove_extended_properties(&vec!["user_id"]) + .await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("must start with prefix")); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_create_nested_namespace(#[case] inline_optimization: bool) { - use lance_namespace::models::{ - CreateNamespaceRequest, ListNamespacesRequest, NamespaceExistsRequest, - }; - + async fn test_create_namespace_with_extended_properties_without_columns_fails( + #[case] inline_optimization: bool, + ) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); - - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) + let namespace = DirectoryNamespaceBuilder::new(temp_path) .inline_optimization_enabled(inline_optimization) .build() - .await - .unwrap(); - - // Create parent namespace - let mut create_req = CreateNamespaceRequest::new(); - create_req.id = Some(vec!["parent".to_string()]); - dir_namespace.create_namespace(create_req).await.unwrap(); - - // Create nested child namespace - let mut create_req = 
CreateNamespaceRequest::new(); - create_req.id = Some(vec!["parent".to_string(), "child".to_string()]); - let result = dir_namespace.create_namespace(create_req).await; - assert!( - result.is_ok(), - "Failed to create nested namespace: {:?}", - result.err() - ); - - // Verify nested namespace exists - let exists_req = NamespaceExistsRequest { - id: Some(vec!["parent".to_string(), "child".to_string()]), - ..Default::default() - }; - let result = dir_namespace.namespace_exists(exists_req).await; - assert!(result.is_ok(), "Nested namespace should exist"); - - // List child namespaces of parent - let list_req = ListNamespacesRequest { - id: Some(vec!["parent".to_string()]), - page_token: None, - limit: None, - ..Default::default() - }; - let result = dir_namespace.list_namespaces(list_req).await; - assert!(result.is_ok()); - let namespaces = result.unwrap(); - assert_eq!(namespaces.namespaces.len(), 1); - assert_eq!(namespaces.namespaces[0], "child"); + .await + .unwrap(); + + let mut properties = std::collections::HashMap::new(); + properties.insert( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); + + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["ns1".to_string()]); + create_req.properties = Some(properties); + + let result = namespace.create_namespace(create_req).await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Column user_id does not exist in extended properties")); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_create_namespace_without_parent_fails(#[case] inline_optimization: bool) { - use lance_namespace::models::CreateNamespaceRequest; - + async fn test_create_namespace_with_extended_properties_succeeds_and_describe_unified( + #[case] inline_optimization: bool, + ) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); - - let dir_namespace = 
DirectoryNamespaceBuilder::new(temp_path) - .inline_optimization_enabled(inline_optimization) - .build() + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![("lance.manifest.extended.user_id", DataType::Utf8)]) .await .unwrap(); - // Try to create nested namespace without parent - let mut create_req = CreateNamespaceRequest::new(); - create_req.id = Some(vec!["nonexistent_parent".to_string(), "child".to_string()]); - let result = dir_namespace.create_namespace(create_req).await; - assert!(result.is_err(), "Should fail when parent doesn't exist"); + let mut properties = std::collections::HashMap::new(); + properties.insert("owner".to_string(), "alice".to_string()); + properties.insert( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["ns1".to_string()]); + create_req.properties = Some(properties); + manifest_ns.create_namespace(create_req).await.unwrap(); + + let describe_req = DescribeNamespaceRequest { + id: Some(vec!["ns1".to_string()]), + ..Default::default() + }; + let response = manifest_ns.describe_namespace(describe_req).await.unwrap(); + let props = response.properties.expect("properties should be present"); + assert_eq!(props.get("owner"), Some(&"alice".to_string())); + assert_eq!( + props.get("lance.manifest.extended.user_id"), + Some(&"123".to_string()) + ); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_drop_child_namespace(#[case] inline_optimization: bool) { - use lance_namespace::models::{ - CreateNamespaceRequest, DropNamespaceRequest, NamespaceExistsRequest, - }; - + async fn test_extended_properties_null_and_empty_values_omitted( + #[case] inline_optimization: bool, + ) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); - - let dir_namespace = 
DirectoryNamespaceBuilder::new(temp_path) - .inline_optimization_enabled(inline_optimization) - .build() + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.null_prop", DataType::Utf8), + ("lance.manifest.extended.empty_prop", DataType::Utf8), + ("lance.manifest.extended.non_existed", DataType::Utf8), + ("lance.manifest.extended.valid_prop", DataType::Utf8), + ]) .await .unwrap(); - // Create a child namespace - let mut create_req = CreateNamespaceRequest::new(); - create_req.id = Some(vec!["ns1".to_string()]); - dir_namespace.create_namespace(create_req).await.unwrap(); - - // Drop the namespace - let mut drop_req = DropNamespaceRequest::new(); - drop_req.id = Some(vec!["ns1".to_string()]); - let result = dir_namespace.drop_namespace(drop_req).await; - assert!( - result.is_ok(), - "Failed to drop namespace: {:?}", - result.err() + let mut properties = std::collections::HashMap::new(); + properties.insert("owner".to_string(), "alice".to_string()); + properties.insert( + "lance.manifest.extended.null_prop".to_string(), + "null".to_string(), + ); + properties.insert( + "lance.manifest.extended.empty_prop".to_string(), + "".to_string(), ); + properties.insert( + "lance.manifest.extended.valid_prop".to_string(), + "42".to_string(), + ); + let mut create_req = ClientCreateNamespaceRequest::new(); + create_req.id = Some(vec!["ns1".to_string()]); + create_req.properties = Some(properties); + manifest_ns.create_namespace(create_req).await.unwrap(); - // Verify namespace no longer exists - let exists_req = NamespaceExistsRequest { + let describe_req = DescribeNamespaceRequest { id: Some(vec!["ns1".to_string()]), ..Default::default() }; - let result = dir_namespace.namespace_exists(exists_req).await; - assert!(result.is_err(), "Namespace should not exist after drop"); + let response = manifest_ns.describe_namespace(describe_req).await.unwrap(); + let props = 
response.properties.expect("properties should be present"); + + assert_eq!(props.get("owner"), Some(&"alice".to_string())); + assert_eq!( + props.get("lance.manifest.extended.valid_prop"), + Some(&"42".to_string()) + ); + assert!(!props.contains_key("lance.manifest.extended.null_prop")); + assert!(!props.contains_key("lance.manifest.extended.empty_prop")); + assert!(!props.contains_key("lance.manifest.extended.non_existed")); + } + + async fn create_manifest_namespace_for_test( + root: &str, + inline_optimization: bool, + ) -> ManifestNamespace { + let (object_store, base_path) = ObjectStore::from_uri(root).await.unwrap(); + ManifestNamespace::from_directory( + root.to_string(), + None, + None, + object_store, + base_path, + true, + inline_optimization, + None, + ) + .await + .unwrap() } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_drop_namespace_with_children_fails(#[case] inline_optimization: bool) { - use lance_namespace::models::{CreateNamespaceRequest, DropNamespaceRequest}; - + async fn test_create_namespace_extended_record_overrides_properties( + #[case] inline_optimization: bool, + ) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); - - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) - .inline_optimization_enabled(inline_optimization) - .build() + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.second_id", DataType::Utf8), + ]) .await .unwrap(); - // Create parent and child namespaces - let mut create_req = CreateNamespaceRequest::new(); - create_req.id = Some(vec!["parent".to_string()]); - dir_namespace.create_namespace(create_req).await.unwrap(); + let mut props = HashMap::new(); + props.insert("owner".to_string(), "alice".to_string()); + props.insert( + 
"lance.manifest.extended.user_id".to_string(), + "111".to_string(), + ); + props.insert( + "lance.manifest.extended.second_id".to_string(), + "123".to_string(), + ); - let mut create_req = CreateNamespaceRequest::new(); - create_req.id = Some(vec!["parent".to_string(), "child".to_string()]); - dir_namespace.create_namespace(create_req).await.unwrap(); + let mut req = ClientCreateNamespaceRequest::new(); + req.id = Some(vec!["ns_ext".to_string()]); + req.properties = Some(props); - // Try to drop parent namespace - should fail because it has children - let mut drop_req = DropNamespaceRequest::new(); - drop_req.id = Some(vec!["parent".to_string()]); - let result = dir_namespace.drop_namespace(drop_req).await; - assert!(result.is_err(), "Should fail when namespace has children"); + let ext_schema = Arc::new(Schema::new(vec![ + Field::new("user_id", DataType::Utf8, true), + Field::new("second_id", DataType::Utf8, true), + ])); + let ext_batch = RecordBatch::try_new( + ext_schema, + vec![ + Arc::new(StringArray::from(vec![Some("999")])), + Arc::new(StringArray::from(vec![None::<&str>])), + ], + ) + .unwrap(); + + // `ext_batch` should override request.properties on the same extended key. 
+ manifest_ns + .create_namespace_extended(req, Some(ext_batch)) + .await + .unwrap(); + + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner + .filter("object_type = 'namespace' AND object_id = 'ns_ext'") + .unwrap(); + scanner + .project(&["user_id", "second_id", "metadata"]) + .unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 1); + let batch = batches + .into_iter() + .find(|b| b.num_rows() > 0) + .expect("expected a non-empty batch"); + + let user_id_array = ManifestNamespace::get_string_column(&batch, "user_id").unwrap(); + assert_eq!(user_id_array.value(0), "999"); + let second_id_array = ManifestNamespace::get_string_column(&batch, "second_id").unwrap(); + assert_eq!(second_id_array.value(0), "123"); + + let metadata_array = ManifestNamespace::get_string_column(&batch, "metadata").unwrap(); + assert!(!metadata_array.is_null(0)); + let metadata_map: HashMap = + serde_json::from_str(metadata_array.value(0)).unwrap(); + assert_eq!(metadata_map.get("owner"), Some(&"alice".to_string())); + // Extended keys should not be stored in metadata. 
+ assert!( + !metadata_map.contains_key("lance.manifest.extended.user_id"), + "extended keys must not be stored in metadata" + ); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_create_table_in_child_namespace(#[case] inline_optimization: bool) { - use lance_namespace::models::{ - CreateNamespaceRequest, CreateTableRequest, ListTablesRequest, - }; - + async fn test_declare_table_extended_writes_extended_record(#[case] inline_optimization: bool) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); - - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) - .inline_optimization_enabled(inline_optimization) - .build() + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.score", DataType::Int32), + ]) .await .unwrap(); - // Create a child namespace - let mut create_ns_req = CreateNamespaceRequest::new(); - create_ns_req.id = Some(vec!["ns1".to_string()]); - dir_namespace.create_namespace(create_ns_req).await.unwrap(); + let ext_schema = Arc::new(Schema::new(vec![ + Field::new("user_id", DataType::Utf8, true), + Field::new("score", DataType::Int32, true), + ])); + let ext_batch = RecordBatch::try_new( + ext_schema, + vec![ + Arc::new(StringArray::from(vec![Some("u1")])), + Arc::new(Int32Array::from(vec![Some(7)])), + ], + ) + .unwrap(); - // Create a table in the child namespace - let buffer = create_test_ipc_data(); - let mut create_table_req = CreateTableRequest::new(); - create_table_req.id = Some(vec!["ns1".to_string(), "table1".to_string()]); - let result = dir_namespace - .create_table(create_table_req, Bytes::from(buffer)) - .await; - assert!( - result.is_ok(), - "Failed to create table in child namespace: {:?}", - result.err() - ); + let mut props = HashMap::new(); + 
props.insert("owner".to_string(), "alice".to_string()); - // List tables in the namespace - let list_req = ListTablesRequest { - id: Some(vec!["ns1".to_string()]), - page_token: None, - limit: None, - ..Default::default() - }; - let result = dir_namespace.list_tables(list_req).await; - assert!(result.is_ok()); - let tables = result.unwrap(); - assert_eq!(tables.tables.len(), 1); - assert_eq!(tables.tables[0], "table1"); + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + declare_req.properties = Some(props); + + let resp = manifest_ns + .declare_table_extended(declare_req, Some(ext_batch)) + .await + .unwrap(); + let resp_loc = resp.location.expect("response location should be present"); + assert!(resp_loc.ends_with("test_table.lance")); + + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner + .filter("object_type = 'table' AND object_id = 'test_table'") + .unwrap(); + scanner + .project(&["metadata", "user_id", "score", "location"]) + .unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 1); + let batch = batches + .into_iter() + .find(|b| b.num_rows() > 0) + .expect("expected a non-empty batch"); + + let metadata_array = ManifestNamespace::get_string_column(&batch, "metadata").unwrap(); + let metadata_str = metadata_array.value(0); + let metadata_map: HashMap<String, String> = serde_json::from_str(metadata_str).unwrap(); + assert_eq!(metadata_map.get("owner"), Some(&"alice".to_string())); + + let user_id_array = ManifestNamespace::get_string_column(&batch, "user_id").unwrap(); + assert_eq!(user_id_array.value(0), "u1"); + let score_array = batch + .column_by_name("score") + .unwrap() + .as_any() + .downcast_ref::<Int32Array>() + .unwrap(); + assert_eq!(score_array.value(0), 7); + + let location_array = ManifestNamespace::get_string_column(&batch, "location").unwrap(); +
assert!(location_array.value(0).ends_with("test_table.lance")); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_describe_child_namespace(#[case] inline_optimization: bool) { - use lance_namespace::models::{CreateNamespaceRequest, DescribeNamespaceRequest}; + async fn test_create_table_with_properties_persisted(#[case] inline_optimization: bool) { + let (_temp_dir, manifest_ns, properties) = + create_manifest_and_persist_properties(inline_optimization).await; - let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap(); + let buffer = create_test_ipc_data(); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + create_req.properties = Some(properties); - let dir_namespace = DirectoryNamespaceBuilder::new(temp_path) - .inline_optimization_enabled(inline_optimization) - .build() + manifest_ns + .create_table(create_req, Bytes::from(buffer)) .await .unwrap(); + verify_persist_properties(&manifest_ns, "test_table").await; + } - // Create a child namespace with properties - let mut properties = std::collections::HashMap::new(); - properties.insert("key1".to_string(), "value1".to_string()); + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_declare_table_with_properties_persisted(#[case] inline_optimization: bool) { + let (_temp_dir, manifest_ns, properties) = + create_manifest_and_persist_properties(inline_optimization).await; - let mut create_req = CreateNamespaceRequest::new(); - create_req.id = Some(vec!["ns1".to_string()]); - create_req.properties = Some(properties.clone()); - dir_namespace.create_namespace(create_req).await.unwrap(); + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + declare_req.properties = Some(properties); - // Describe the namespace - let describe_req = DescribeNamespaceRequest { 
- id: Some(vec!["ns1".to_string()]), - ..Default::default() - }; - let result = dir_namespace.describe_namespace(describe_req).await; - assert!( - result.is_ok(), - "Failed to describe namespace: {:?}", - result.err() - ); - let response = result.unwrap(); - assert!(response.properties.is_some()); - assert_eq!( - response.properties.unwrap().get("key1"), - Some(&"value1".to_string()) - ); + manifest_ns.declare_table(declare_req).await.unwrap(); + verify_persist_properties(&manifest_ns, "test_table").await; } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_concurrent_create_and_drop_single_instance(#[case] inline_optimization: bool) { - use futures::future::join_all; - use std::sync::Arc; + async fn test_register_table_with_properties_persisted(#[case] inline_optimization: bool) { + let (_temp_dir, manifest_ns, properties) = + create_manifest_and_persist_properties(inline_optimization).await; + + let mut register_req = RegisterTableRequest::new("registered_table.lance".to_string()); + register_req.id = Some(vec!["registered_table".to_string()]); + register_req.properties = Some(properties); + + LanceNamespace::register_table(&manifest_ns, register_req) + .await + .unwrap(); + verify_persist_properties(&manifest_ns, "registered_table").await; + } + async fn create_manifest_and_persist_properties( + inline_optimization: bool, + ) -> (TempStdDir, ManifestNamespace, HashMap<String, String>) { let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; - let dir_namespace = Arc::new( - DirectoryNamespaceBuilder::new(temp_path) - .inline_optimization_enabled(inline_optimization) - .build() - .await - .unwrap(), - ); - - // Initialize namespace first - create parent namespace to ensure __manifest table - // is created before concurrent operations - let mut create_ns_request = CreateNamespaceRequest::new(); -
create_ns_request.id = Some(vec!["test_ns".to_string()]); - dir_namespace - .create_namespace(create_ns_request) + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.score", DataType::Int32), + ]) .await .unwrap(); - let num_tables = 10; - let mut handles = Vec::new(); - - for i in 0..num_tables { - let ns = dir_namespace.clone(); - let handle = async move { - let table_name = format!("concurrent_table_{}", i); - let table_id = vec!["test_ns".to_string(), table_name.clone()]; - let buffer = create_test_ipc_data(); + let properties = std::collections::HashMap::from([ + ("owner".to_string(), "alice".to_string()), + ( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ), + ( + "lance.manifest.extended.score".to_string(), + "42".to_string(), + ), + ]); - // Create table - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(table_id.clone()); - ns.create_table(create_request, Bytes::from(buffer)) - .await - .unwrap_or_else(|e| panic!("Failed to create table {}: {}", table_name, e)); + (temp_dir, manifest_ns, properties) + } - // Drop table - let mut drop_request = DropTableRequest::new(); - drop_request.id = Some(table_id); - ns.drop_table(drop_request) - .await - .unwrap_or_else(|e| panic!("Failed to drop table {}: {}", table_name, e)); + async fn verify_persist_properties(manifest_ns: &ManifestNamespace, table: &str) { + let object_id = ManifestNamespace::build_object_id(&[], table); + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + let filter = format!("object_id = '{}'", object_id); + scanner.filter(&filter).unwrap(); + scanner.project(&["metadata", "user_id", "score"]).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + assert_eq!(batch.num_rows(), 1); + + let metadata_array = ManifestNamespace::get_string_column(batch, 
"metadata").unwrap(); + let metadata_str = metadata_array.value(0); + let metadata_map: std::collections::HashMap<String, String> = + serde_json::from_str(metadata_str).unwrap(); + assert_eq!(metadata_map.get("owner"), Some(&"alice".to_string())); + + let user_id_array = ManifestNamespace::get_string_column(batch, "user_id").unwrap(); + assert_eq!(user_id_array.value(0), "123"); + + let score_array = batch + .column_by_name("score") + .unwrap() + .as_any() + .downcast_ref::<Int32Array>() + .unwrap(); + assert_eq!(score_array.value(0), 42); + } - Ok::<_, lance_core::Error>(()) - }; - handles.push(handle); - } + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_table_with_extended_properties_without_columns_fails( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; - let results = join_all(handles).await; - for result in results { - assert!(result.is_ok(), "All concurrent operations should succeed"); - } + let mut properties = std::collections::HashMap::new(); + properties.insert( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); - // Verify all tables are dropped - let mut request = ListTablesRequest::new(); - request.id = Some(vec!["test_ns".to_string()]); - let response = dir_namespace.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 0, "All tables should be dropped"); + let buffer = create_test_ipc_data(); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + create_req.properties = Some(properties); + + let result = manifest_ns + .create_table(create_req, Bytes::from(buffer)) + .await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Column user_id does not exist in extended properties")); } #[rstest]
#[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_concurrent_create_and_drop_multiple_instances(#[case] inline_optimization: bool) { - use futures::future::join_all; - + async fn test_declare_table_with_extended_properties_without_columns_fails( + #[case] inline_optimization: bool, + ) { let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap().to_string(); - - // Initialize namespace first with a single instance to ensure __manifest - // table is created and parent namespace exists before concurrent operations - let init_ns = DirectoryNamespaceBuilder::new(&temp_path) - .inline_optimization_enabled(inline_optimization) - .build() - .await - .unwrap(); - let mut create_ns_request = CreateNamespaceRequest::new(); - create_ns_request.id = Some(vec!["test_ns".to_string()]); - init_ns.create_namespace(create_ns_request).await.unwrap(); - - let num_tables = 10; - let mut handles = Vec::new(); - - for i in 0..num_tables { - let path = temp_path.clone(); - let handle = async move { - // Each task creates its own namespace instance - let ns = DirectoryNamespaceBuilder::new(&path) - .inline_optimization_enabled(inline_optimization) - .build() - .await - .unwrap(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; - let table_name = format!("multi_ns_table_{}", i); - let table_id = vec!["test_ns".to_string(), table_name.clone()]; - let buffer = create_test_ipc_data(); + let mut properties = std::collections::HashMap::new(); + properties.insert( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); - // Create table - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(table_id.clone()); - ns.create_table(create_request, Bytes::from(buffer)) - .await - .unwrap_or_else(|e| panic!("Failed to create table {}: {}", table_name, e)); + let mut declare_req = 
DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + declare_req.properties = Some(properties); - // Drop table - let mut drop_request = DropTableRequest::new(); - drop_request.id = Some(table_id); - ns.drop_table(drop_request) - .await - .unwrap_or_else(|e| panic!("Failed to drop table {}: {}", table_name, e)); + let result = manifest_ns.declare_table(declare_req).await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Column user_id does not exist in extended properties")); + } - Ok::<_, lance_core::Error>(()) - }; - handles.push(handle); - } + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_register_table_with_extended_properties_without_columns_fails( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; - let results = join_all(handles).await; - for result in results { - assert!(result.is_ok(), "All concurrent operations should succeed"); - } + let mut properties = std::collections::HashMap::new(); + properties.insert( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); - // Verify with a fresh namespace instance - let verify_ns = DirectoryNamespaceBuilder::new(&temp_path) - .inline_optimization_enabled(inline_optimization) - .build() - .await - .unwrap(); + let mut register_req = RegisterTableRequest::new("registered_table.lance".to_string()); + register_req.id = Some(vec!["registered_table".to_string()]); + register_req.properties = Some(properties); - let mut request = ListTablesRequest::new(); - request.id = Some(vec!["test_ns".to_string()]); - let response = verify_ns.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 0, "All tables should be dropped"); + let result = 
LanceNamespace::register_table(&manifest_ns, register_req).await; + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("Column user_id does not exist in extended properties")); } #[rstest] #[case::with_optimization(true)] #[case::without_optimization(false)] #[tokio::test] - async fn test_concurrent_create_then_drop_from_different_instance( + async fn test_create_table_extended_properties_null_and_empty_values_omitted( #[case] inline_optimization: bool, ) { - use futures::future::join_all; - let temp_dir = TempStdDir::default(); - let temp_path = temp_dir.to_str().unwrap().to_string(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.null_prop", DataType::Utf8), + ("lance.manifest.extended.empty_prop", DataType::Utf8), + ("lance.manifest.extended.valid_prop", DataType::Utf8), + ("lance.manifest.extended.non_existed_prop", DataType::Utf8), + ]) + .await + .unwrap(); - // Initialize namespace first with a single instance to ensure __manifest - // table is created and parent namespace exists before concurrent operations - let init_ns = DirectoryNamespaceBuilder::new(&temp_path) - .inline_optimization_enabled(inline_optimization) - .build() + let mut properties = std::collections::HashMap::new(); + properties.insert("owner".to_string(), "alice".to_string()); + properties.insert( + "lance.manifest.extended.null_prop".to_string(), + "null".to_string(), + ); + properties.insert( + "lance.manifest.extended.empty_prop".to_string(), + "".to_string(), + ); + properties.insert( + "lance.manifest.extended.valid_prop".to_string(), + "42".to_string(), + ); + + let buffer = create_test_ipc_data(); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + create_req.properties = Some(properties); + manifest_ns + 
.create_table(create_req, Bytes::from(buffer)) .await .unwrap(); - let mut create_ns_request = CreateNamespaceRequest::new(); - create_ns_request.id = Some(vec!["test_ns".to_string()]); - init_ns.create_namespace(create_ns_request).await.unwrap(); - let num_tables = 10; + let object_id = ManifestNamespace::build_object_id(&[], "test_table"); - // Phase 1: Create all tables concurrently using separate namespace instances - let mut create_handles = Vec::new(); - for i in 0..num_tables { - let path = temp_path.clone(); - let handle = async move { - let ns = DirectoryNamespaceBuilder::new(&path) - .inline_optimization_enabled(inline_optimization) - .build() - .await - .unwrap(); + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + let filter = format!("object_id = '{}'", object_id); + scanner.filter(&filter).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); - let table_name = format!("cross_instance_table_{}", i); - let table_id = vec!["test_ns".to_string(), table_name.clone()]; - let buffer = create_test_ipc_data(); + assert_eq!(batches.len(), 1); + let batch = &batches[0]; + assert_eq!(batch.num_rows(), 1); - let mut create_request = CreateTableRequest::new(); - create_request.id = Some(table_id); - ns.create_table(create_request, Bytes::from(buffer)) - .await - .unwrap_or_else(|e| panic!("Failed to create table {}: {}", table_name, e)); + let extended_props = batch_to_extended_props(batch); - Ok::<_, lance_core::Error>(()) - }; - create_handles.push(handle); - } + assert_eq!( + extended_props.get("lance.manifest.extended.valid_prop"), + Some(&"42".to_string()) + ); + assert!(!extended_props.contains_key("lance.manifest.extended.null_prop")); + assert!(!extended_props.contains_key("lance.manifest.extended.empty_prop")); + assert!(!extended_props.contains_key("lance.manifest.extended.non_existed_prop")); + } - let create_results = join_all(create_handles).await; - for result in create_results { - 
assert!(result.is_ok(), "All create operations should succeed"); - } + #[tokio::test] + async fn test_describe_table_unifies_properties() { + let (_temp_dir, manifest_ns, base_properties) = prepare_properties_env().await; - // Phase 2: Drop all tables concurrently using NEW namespace instances - let mut drop_handles = Vec::new(); - for i in 0..num_tables { - let path = temp_path.clone(); - let handle = async move { - let ns = DirectoryNamespaceBuilder::new(&path) - .inline_optimization_enabled(inline_optimization) - .build() - .await - .unwrap(); + // create_table scenario + let buffer = create_test_ipc_data(); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["created_table".to_string()]); + create_req.properties = Some(base_properties.clone()); + manifest_ns + .create_table(create_req, Bytes::from(buffer)) + .await + .unwrap(); - let table_name = format!("cross_instance_table_{}", i); - let table_id = vec!["test_ns".to_string(), table_name.clone()]; + verify_describe_table_props(&manifest_ns, "created_table", Some(true), &base_properties) + .await; + verify_describe_table_props(&manifest_ns, "created_table", Some(false), &base_properties) + .await; - let mut drop_request = DropTableRequest::new(); - drop_request.id = Some(table_id); - ns.drop_table(drop_request) - .await - .unwrap_or_else(|e| panic!("Failed to drop table {}: {}", table_name, e)); + // declare_table scenario + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["declared_table".to_string()]); + declare_req.properties = Some(base_properties.clone()); + manifest_ns.declare_table(declare_req).await.unwrap(); - Ok::<_, lance_core::Error>(()) - }; - drop_handles.push(handle); - } + verify_describe_table_props(&manifest_ns, "declared_table", Some(true), &base_properties) + .await; + verify_describe_table_props( + &manifest_ns, + "declared_table", + Some(false), + &base_properties, + ) + .await; - let drop_results = join_all(drop_handles).await; - 
for result in drop_results { - assert!(result.is_ok(), "All drop operations should succeed"); - } + // register_table scenario + let mut register_req = RegisterTableRequest::new("registered_table.lance".to_string()); + register_req.id = Some(vec!["registered_table".to_string()]); + register_req.properties = Some(base_properties.clone()); + LanceNamespace::register_table(&manifest_ns, register_req) + .await + .unwrap(); - // Verify all tables are dropped - let verify_ns = DirectoryNamespaceBuilder::new(&temp_path) - .inline_optimization_enabled(inline_optimization) - .build() + verify_describe_table_props( + &manifest_ns, + "registered_table", + Some(true), + &base_properties, + ) + .await; + verify_describe_table_props( + &manifest_ns, + "registered_table", + Some(false), + &base_properties, + ) + .await; + } + + async fn prepare_properties_env() -> ( + TempStdDir, + ManifestNamespace, + std::collections::HashMap<String, String>, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, true).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.score", DataType::Int32), + ]) .await .unwrap(); - let mut request = ListTablesRequest::new(); - request.id = Some(vec!["test_ns".to_string()]); - let response = verify_ns.list_tables(request).await.unwrap(); - assert_eq!(response.tables.len(), 0, "All tables should be dropped"); + // prepare base properties + let mut base_properties = std::collections::HashMap::new(); + base_properties.insert("owner".to_string(), "alice".to_string()); + base_properties.insert( + "lance.manifest.extended.user_id".to_string(), + "123".to_string(), + ); + base_properties.insert( + "lance.manifest.extended.score".to_string(), + "42".to_string(), + ); + + (temp_dir, manifest_ns, base_properties) + } + + async fn verify_describe_table_props( + manifest_ns: &ManifestNamespace, +
table_name: &str, + load_detailed_metadata: Option<bool>, + base_properties: &std::collections::HashMap<String, String>, + ) { + let req = DescribeTableRequest { + id: Some(vec![table_name.to_string()]), + load_detailed_metadata, + ..Default::default() + }; + let response = manifest_ns.describe_table(req).await.unwrap(); + let props = response.properties.expect("properties should be present"); + for (k, v) in base_properties.iter() { + assert_eq!(props.get(k), Some(v)); + } } #[test] @@ -2918,4 +4396,167 @@ mod tests { "URL with existing trailing slash should still work" ); } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_declare_table_extended_record_overrides_request_properties_and_null_falls_back( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path, inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.score", DataType::Int32), + ]) + .await + .unwrap(); + + // extended_record should have higher priority than request.properties when non-null. + // When extended_record value is null, it should fall back to request.properties.
+ let ext_schema = Arc::new(Schema::new(vec![ + Field::new("user_id", DataType::Utf8, true), + Field::new("score", DataType::Int32, true), + ])); + let ext_batch = RecordBatch::try_new( + ext_schema, + vec![ + Arc::new(StringArray::from(vec![Some("u_ext")])), + Arc::new(Int32Array::from(vec![None])), + ], + ) + .unwrap(); + + let properties = std::collections::HashMap::from([ + ("owner".to_string(), "alice".to_string()), + ( + "lance.manifest.extended.user_id".to_string(), + "u_req".to_string(), + ), + ("lance.manifest.extended.score".to_string(), "7".to_string()), + ]); + + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + declare_req.properties = Some(properties); + + manifest_ns + .declare_table_extended(declare_req, Some(ext_batch)) + .await + .unwrap(); + + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner + .filter("object_type = 'table' AND object_id = 'test_table'") + .unwrap(); + scanner.project(&["user_id", "score"]).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 1); + let batch = batches + .into_iter() + .find(|b| b.num_rows() > 0) + .expect("expected a non-empty batch"); + + let user_id_array = ManifestNamespace::get_string_column(&batch, "user_id").unwrap(); + assert_eq!(user_id_array.value(0), "u_ext"); + + let score_array = batch + .column_by_name("score") + .unwrap() + .as_any() + .downcast_ref::<Int32Array>() + .unwrap(); + assert_eq!(score_array.value(0), 7); + } + + #[rstest] + #[case::with_optimization(true)] + #[case::without_optimization(false)] + #[tokio::test] + async fn test_create_table_extended_with_extended_record_and_properties( + #[case] inline_optimization: bool, + ) { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let manifest_ns = create_manifest_namespace_for_test(temp_path,
inline_optimization).await; + manifest_ns + .add_extended_properties(&vec![ + ("lance.manifest.extended.user_id", DataType::Utf8), + ("lance.manifest.extended.score", DataType::Int32), + ]) + .await + .unwrap(); + + // extended_record should override request.properties and fallback to request.properties if + // null. + let ext_schema = Arc::new(Schema::new(vec![ + Field::new("user_id", DataType::Utf8, true), + Field::new("score", DataType::Int32, true), + ])); + let ext_batch = RecordBatch::try_new( + ext_schema, + vec![ + Arc::new(StringArray::from(vec![Some("u_ext")])), + Arc::new(Int32Array::from(vec![None])), + ], + ) + .unwrap(); + + let properties = std::collections::HashMap::from([ + ("owner".to_string(), "alice".to_string()), + ( + "lance.manifest.extended.user_id".to_string(), + "u_req".to_string(), + ), + ("lance.manifest.extended.score".to_string(), "7".to_string()), + ]); + + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + create_req.properties = Some(properties); + + let resp = manifest_ns + .create_table_extended( + create_req, + Bytes::from(create_empty_ipc_data()), + Some(ext_batch), + ) + .await + .unwrap(); + let resp_loc = resp.location.expect("response location should be present"); + assert!(resp_loc.ends_with("test_table.lance")); + + let mut scanner = manifest_ns.manifest_scanner().await.unwrap(); + scanner + .filter("object_type = 'table' AND object_id = 'test_table'") + .unwrap(); + scanner.project(&["user_id", "score", "metadata"]).unwrap(); + let batches = ManifestNamespace::execute_scanner(scanner).await.unwrap(); + let total_rows: usize = batches.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 1); + let batch = batches + .into_iter() + .find(|b| b.num_rows() > 0) + .expect("expected a non-empty batch"); + + let user_id_array = ManifestNamespace::get_string_column(&batch, "user_id").unwrap(); + assert_eq!(user_id_array.value(0), "u_ext"); + + let score_array = batch + 
.column_by_name("score") + .unwrap() + .as_any() + .downcast_ref::<Int32Array>() + .unwrap(); + assert_eq!(score_array.value(0), 7); + + let metadata_array = ManifestNamespace::get_string_column(&batch, "metadata").unwrap(); + let metadata_map: HashMap<String, String> = + serde_json::from_str(metadata_array.value(0)).unwrap(); + assert_eq!(metadata_map.get("owner"), Some(&"alice".to_string())); + } } diff --git a/rust/lance-namespace-impls/src/lib.rs b/rust/lance-namespace-impls/src/lib.rs index a67aff1784d..1e1bd118277 100644 --- a/rust/lance-namespace-impls/src/lib.rs +++ b/rust/lance-namespace-impls/src/lib.rs @@ -72,12 +72,15 @@ pub mod connect; pub mod context; pub mod credentials; pub mod dir; +pub mod udf; #[cfg(feature = "rest")] pub mod rest; +pub mod partition; #[cfg(feature = "rest-adapter")] pub mod rest_adapter; +mod util; // Re-export connect builder pub use connect::ConnectBuilder; diff --git a/rust/lance-namespace-impls/src/partition.rs b/rust/lance-namespace-impls/src/partition.rs new file mode 100644 index 00000000000..e2db543d488 --- /dev/null +++ b/rust/lance-namespace-impls/src/partition.rs @@ -0,0 +1,3616 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors +// NOTE: Keep this module warning-clean; avoid `#![allow(unused)]`.
+ +#[cfg(test)] +use crate::dir::manifest::DELIMITER; +use crate::dir::manifest::{EXTENDED_PREFIX, ManifestObject, scalar_to_str}; +use crate::udf::MURMUR3_MULTI_UDF; +use crate::util::{ + check_table_spec_consistency, ensure_all_schema_field_have_id, expr_to_cdf, + is_cdf_always_false, is_comparison_op, +}; +use crate::{DirectoryNamespace, ManifestNamespace, context::DynamicContextProvider}; +use arrow::array::{Array, ArrayRef, RecordBatch, new_null_array}; +use arrow::datatypes::{Field as ArrowField, Schema as ArrowSchema}; +use arrow_ipc::writer::StreamWriter; +use arrow_schema::{DataType, FieldRef, SchemaRef}; +use async_trait::async_trait; +use bytes::Bytes; +#[cfg(test)] +use lance::Dataset; +use lance::deps::datafusion::logical_expr::{Expr, Operator}; +use lance::deps::datafusion::prelude::{SessionContext, col, lit}; +use lance::deps::datafusion::scalar::ScalarValue; +use lance::io::exec::Planner; +use lance_arrow::SchemaExt; +use lance_core::utils::backoff::Backoff; +use lance_core::{Error, Result}; +use lance_namespace::LanceNamespace; +use lance_namespace::error::NamespaceError; +use lance_namespace::models::{ + AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, + AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, + AlterTransactionRequest, AlterTransactionResponse, AnalyzeTableQueryPlanRequest, + CountTableRowsRequest, CreateNamespaceRequest, CreateNamespaceResponse, + CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, + CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, + DeclareTableRequest, DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, + DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, + DescribeTableIndexStatsResponse, DescribeTableRequest, 
DescribeTableResponse, + DescribeTransactionRequest, DescribeTransactionResponse, DropNamespaceRequest, + DropNamespaceResponse, DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, + DropTableResponse, ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, + GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, + InsertIntoTableResponse, JsonArrowSchema, ListNamespacesRequest, ListNamespacesResponse, + ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, + MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, + QueryTableRequest, RegisterTableRequest, RegisterTableResponse, RenameTableRequest, + RenameTableResponse, RestoreTableRequest, RestoreTableResponse, TableExistsRequest, + UpdateTableRequest, UpdateTableResponse, UpdateTableSchemaMetadataRequest, + UpdateTableSchemaMetadataResponse, UpdateTableTagRequest, UpdateTableTagResponse, +}; +use lance_namespace::schema::{ + arrow_schema_to_json, arrow_type_to_json, convert_json_arrow_schema, convert_json_arrow_type, +}; +use lance_namespace_reqwest_client::models::PartitionField as JsonPartitionField; +use lance_namespace_reqwest_client::models::PartitionSpec as JsonPartitionSpec; +use lance_namespace_reqwest_client::models::PartitionTransform as JsonPartitionTransform; +use std::collections::HashMap; +use std::collections::HashSet; +use std::fmt::{Debug, Formatter}; +use std::sync::Arc; +use tokio::time::sleep; + +// Column name of table spec id. +const SPEC_ID_COL: &str = "spec_id"; + +// Keys of table spec schema and partition spec, stored as properties on table spec namespaces. +const NS_PROP_SCHEMA: &str = "schema"; +const NS_PROP_PARTITION_SPEC: &str = "partition_spec"; + +/// A PartitionedNamespace is a directory namespace containing collections of tables that share +/// common schemas. 
These tables are physically separated and independent, but logically related +/// through partition fields definition. +pub struct PartitionedNamespace { + /// Underlying directory namespace used for physical storage. + directory: DirectoryNamespace, + /// Underlying manifest namespace used for metadata and table discovery. + manifest: Arc, + /// Root location URI of this partitioned namespace. + location: String, + /// The definition of the table, each object represents one version of table in a sorted order. + /// The last object is the current table spec. + table_specs: Vec, +} + +impl Debug for PartitionedNamespace { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "PartitionedNamespace({})", self.namespace_id()) + } +} + +impl PartitionedNamespace { + /// Partition pruning for the given filter expression. + /// + /// # Arguments + /// + /// * `filter` - The filter expression to be applied. + /// + /// Returns the list of (partition table, refine expr) that are required to scan. + pub async fn plan_scan(&self, filter: &Expr) -> Result> { + let cdf = expr_to_cdf(filter); + + let mut planned: Vec<(PartitionTable, Expr)> = Vec::new(); + for table in &self.table_specs { + let schema = table.schema.clone(); + + if is_cdf_always_false(&schema, &cdf) { + continue; + } + + // Prune for each partition. + let mut manifest_pred = lit(true); + for field in &table.partition_spec.fields { + let expr = field.partition_prune(schema.clone(), &cdf).await?; + manifest_pred = manifest_pred.and(expr); + } + + // Query manifest to get candidate tables for this table spec. + let table_filter = col("object_type") + .eq(lit("table")) + .and(col(partition_field_col(SPEC_ID_COL)).eq(lit(table.id))); + let objects = self + .manifest + .query_manifest_expr(table_filter.and(manifest_pred)) + .await?; + + // Currently, refine expr is always the original filter. We can return different refine + // expr in future work. + for t in extract_tables(objects)? 
{ + planned.push((t, filter.clone())); + } + } + + Ok(planned) + } + + /// Resolve the target partition table for the input row. Create it (empty table) if not exists. + /// + /// # Arguments + /// + /// * `record` - The record batch to be resolved, it should contain only one row. + /// + /// Returns the partition table that the input row belongs to. + pub async fn resolve_or_create_partition_table( + &self, + record: &RecordBatch, + ) -> Result { + let spec = self.partition_spec().await?; + let partition_values = partition_values(&spec.fields, record).await?; + + // Concurrent writers (e.g. Spark tasks) may race to create the same partition table. + // Some races surface as retryable commit conflicts from lower layers. + let mut backoff = Backoff::default(); + let mut last_retryable: Option = None; + for _attempt in 0..20 { + if let Some(table) = self + .resolve_partition_table(&spec, &partition_values) + .await? + { + return Ok(table); + } + + match self.create_partition_table(&spec, &partition_values).await { + Ok(table) => return Ok(table), + Err(e) if is_retryable_commit_conflict(&e) => { + last_retryable = Some(e); + sleep(backoff.next_backoff()).await; + continue; + } + Err(e) => { + // One last resolve attempt. + return if let Some(table) = self + .resolve_partition_table(&spec, &partition_values) + .await? + { + Ok(table) + } else { + Err(e) + }; + } + } + } + + // Retries exhausted. If another writer won the race, resolve should now succeed. + if let Some(table) = self + .resolve_partition_table(&spec, &partition_values) + .await? + { + return Ok(table); + } + + // Since the latest error is a retryable commit conflict, return TooMuchWriteContention. 
+ Err(last_retryable.unwrap_or_else(|| { + Error::too_much_write_contention( + "Failed to resolve_or_create_partition_table after retries", + ) + })) + } + + async fn create_partition_table( + &self, + spec: &PartitionSpec, + partition_values: &[ScalarValue], + ) -> Result { + self.ensure_partition_fields_exists(&spec.fields).await?; + if partition_values.len() != spec.fields.len() { + return Err(Error::invalid_input(format!( + "partition_values length {} must match partition fields length {}", + partition_values.len(), + spec.fields.len() + ))); + } + + // Create missing namespaces. + let mut table_id = self + .create_missing_partition_namespaces(spec, partition_values) + .await?; + + // Declare the leaf table. + table_id.push("dataset".to_string()); + + let table_record = + build_partition_extended_record(spec.id, &spec.fields, partition_values)?; + + // Handle concurrent creation: if table already exists, resolve and return it. + let create_table_req = CreateTableRequest { + id: Some(table_id.clone()), + mode: Some("Create".to_string()), + ..Default::default() + }; + let arrow_schema = Arc::new(self.schema()?); + let mut data = Vec::new(); + let mut writer = StreamWriter::try_new(&mut data, &arrow_schema)?; + writer.finish()?; + if let Err(e) = self + .manifest + .create_table_extended(create_table_req, Bytes::from(data), Some(table_record)) + .await + { + // Concurrency: if a competitor created it, resolve and return. + if let Some(table) = self.resolve_partition_table(spec, partition_values).await? { + return Ok(table); + } + // Otherwise, propagate the underlying error so callers can decide whether to retry. 
+ return Err(e); + } + + Ok(PartitionTable { + id: table_id, + read_version: None, + }) + } + + async fn resolve_partition_table( + &self, + spec: &PartitionSpec, + partition_values: &[ScalarValue], + ) -> Result> { + let partition_expr = partition_expressions(spec.id, &spec.fields, partition_values)?; + let table_expr = col("object_type").eq(lit("table")).and(partition_expr); + + let objects = self.manifest.query_manifest_expr(table_expr).await?; + let tables = extract_tables(objects)?; + for table in tables.into_iter() { + if table.id.first() == Some(&spec.spec_id_str()) { + return Ok(Some(table)); + } + } + Ok(None) + } + + /// Create all levels of partition namespaces, return the full partition namespace id. + /// + /// Create namespace with name generated from partition value if the level is missing. + async fn create_missing_partition_namespaces( + &self, + spec: &PartitionSpec, + partition_values: &[ScalarValue], + ) -> Result> { + // Always use deterministic namespace ids derived from (spec_id, partition_values prefix). + // For each level, we compute the deterministic id and ensure it exists. + let ns_full_id = create_partition_namespace_id(spec, partition_values)?; + for level in 0..partition_values.len() { + // expected_id = v{n}.{level_0}.{level_1}...{level_i} + let expected_id = ns_full_id[0..level + 2].to_vec(); + let request = DescribeNamespaceRequest { + id: Some(expected_id.clone()), + ..Default::default() + }; + if self.describe_namespace(request).await.is_ok() { + continue; + } + + // Missing + let create_req = CreateNamespaceRequest { + id: Some(expected_id.clone()), + ..Default::default() + }; + let batch = build_partition_extended_record( + spec.id, + &spec.fields, + &partition_values[0..level + 1], + )?; + + if let Err(e) = self + .manifest + .create_namespace_extended(create_req, Some(batch)) + .await + { + // Concurrency: re-query and reuse if created by competitor. 
+ let request = DescribeNamespaceRequest { + id: Some(expected_id), + ..Default::default() + }; + if self.describe_namespace(request).await.is_err() { + return Err(e); + } + } + } + + Ok(ns_full_id) + } + + /// Commit the partition table changes. + /// + /// If ACID is disabled, commit does nothing. + /// Otherwise, if the partition namespace is changed after read version, this method will + /// auto-detect the conflicts. + /// + /// # Arguments + /// + /// * `read_version` - The partition tables that are read in the transaction. + /// * `new_version` - The partition tables that are written in the transaction. + /// + /// Returns the new version of the partitioned namespace. + pub fn commit( + &self, + read_version: Option>, + new_version: Option>, + ) -> Result>> { + let _ = (read_version, new_version); + Err(Error::internal( + "PartitionedNamespace.commit is not implemented", + )) + } + + /// Schema of the partitioned namespace. + pub fn schema(&self) -> Result { + let tables = self.latest_table_spec()?; + Ok(tables.schema.clone()) + } + + /// All partition tables of the partitioned namespace. + pub async fn tables(&self) -> Result> { + let objects = self + .manifest + .query_manifest_expr(col("object_type").eq(lit("table"))) + .await?; + extract_tables(objects) + } + + /// Update the schema and partition spec. It will create a new version of table spec for the new + /// definition. + /// + /// This method is used for both schema evolution and partition evolution. + /// + /// # Arguments + /// + /// * `schema` - The new schema. + /// * `partition_spec` - The new partition spec. + /// + /// Returns the new table spec. + pub async fn update_table_spec( + &mut self, + schema: ArrowSchema, + partition_spec: Vec, + ) -> Result { + let schema = ensure_all_schema_field_have_id(schema)?; + check_table_spec_consistency(&schema, &partition_spec)?; + + // Build the new spec fields, reusing existing field_id where possible. 
+ let new_spec_id = self.latest_table_spec()?.id + 1; + + let mut existed_fields = HashMap::new(); + let mut existed_ids = HashMap::new(); + for field in self.all_partition_fields().await? { + existed_fields.insert(field.signature(), field.clone()); + existed_ids.insert(field.field_id.clone(), field); + } + + let mut new_fields: Vec = Vec::with_capacity(partition_spec.len()); + for mut f in partition_spec.into_iter() { + if let Some(field) = existed_fields.get(&f.signature()) { + // Reuse existing field_id for the same signature. + f.field_id = field.field_id.clone(); + } else if let Some(field) = existed_ids.get(&f.field_id) { + // Field IDs must never be reused for a different definition. + if field.signature() != f.signature() { + return Err(Error::invalid_input(format!( + "Partition field_id '{}' is already used by another field; cannot reuse it", + f.field_id + ))); + } + } + new_fields.push(f); + } + + let new_spec = PartitionSpec { + id: new_spec_id, + fields: new_fields, + }; + let table = TableSpec::new(new_spec_id, schema, new_spec); + self.force_sink_table_spec(&table).await?; + + // Keep in-memory specs consistent with persisted state. + self.table_specs.push(table.clone()); + + Ok(table) + } + + /// Sink table spec to namespace. + pub(crate) async fn force_sink_table_spec(&self, table: &TableSpec) -> Result<()> { + let json_schema = arrow_schema_to_json(&table.schema)?; + let schema_json = serde_json::to_string(&json_schema) + .map_err(|e| Error::internal(format!("Failed to serialize schema: {}", e)))?; + + let spec_json = serde_json::to_string(&table.partition_spec.to_json()?) 
+ .map_err(|e| Error::internal(format!("Failed to serialize partition spec: {}", e)))?; + + let mut props = HashMap::new(); + props.insert(NS_PROP_SCHEMA.to_string(), schema_json); + props.insert(NS_PROP_PARTITION_SPEC.to_string(), spec_json); + + // Sink + let create_req = CreateNamespaceRequest { + id: Some(vec![table.spec_id_str()]), + properties: Some(props), + ..Default::default() + }; + + if let Err(e) = self.create_namespace(create_req).await { + // Handle concurrent sinking error. + match load_table_spec(&self.directory, table.id).await { + Ok(Some(existed_table)) => { + if existed_table.schema != table.schema + || existed_table.partition_spec != table.partition_spec + { + return Err(Error::invalid_input(format!( + "A table spec with id {} already exists and contains different schema or partition spec", + existed_table.id + ))); + } + } + _ => return Err(e), + } + } + + self.ensure_partition_fields_exists(&table.partition_spec.fields) + .await?; + Ok(()) + } + + /// Add a new column to the partitioned namespace. + /// + /// # Arguments + /// + /// * `column` - The column to be added. + /// + /// Returns the new schema. + pub async fn add_column(&mut self, column: &ArrowField) -> Result { + let schema = self.schema()?; + if schema.fields().iter().any(|f| f.name() == column.name()) { + return Err(Error::invalid_input(format!( + "Column '{}' already exists", + column.name() + ))); + } + + let mut fields: Vec = + schema.fields().iter().map(|f| f.as_ref().clone()).collect(); + fields.push(column.clone()); + let new_schema = ArrowSchema::new_with_metadata(fields, schema.metadata().clone()); + + let partition_spec = self.latest_table_spec()?.partition_spec.fields.clone(); + let new_table = self.update_table_spec(new_schema, partition_spec).await?; + Ok(new_table.schema) + } + + /// Drop the given column from the partitioned namespace. + /// + /// # Arguments + /// + /// * `column` - The column to be dropped. + /// + /// Returns the new schema. 
+ pub async fn drop_column(&mut self, column: &str) -> Result { + let schema = self.schema()?; + let mut fields: Vec = + schema.fields().iter().map(|f| f.as_ref().clone()).collect(); + + // TODO: support remove nested column in the future. + let Some(idx) = fields.iter().position(|f| f.name() == column) else { + return Err(Error::invalid_input(format!( + "Column '{}' not found", + column + ))); + }; + + fields.remove(idx); + let new_schema = ArrowSchema::new_with_metadata(fields, schema.metadata().clone()); + + // Drop column might remove column referenced by the current partition spec. + // Since update_table_spec will check whether source_ids exist in schema, we don't check it + // here. + let partition_spec = self.latest_table_spec()?.partition_spec.fields.clone(); + + let new_table = self.update_table_spec(new_schema, partition_spec).await?; + Ok(new_table.schema) + } + + /// Rename the given column in the partitioned namespace. + /// + /// # Arguments + /// + /// * `old_name` - The old name of the column. + /// * `new_name` - The new name of the column. + /// + /// Returns the new schema. 
+ pub async fn rename_column(&mut self, old_name: &str, new_name: &str) -> Result { + if old_name == new_name { + return self.schema(); + } + + let schema = self.schema()?; + if schema.fields().iter().any(|f| f.name() == new_name) { + return Err(Error::invalid_input(format!( + "Column '{}' already exists", + new_name + ))); + } + + let mut fields: Vec = + schema.fields().iter().map(|f| f.as_ref().clone()).collect(); + + let Some(field) = fields.iter_mut().find(|f| f.name() == old_name) else { + return Err(Error::invalid_input(format!( + "Column '{}' not found", + old_name + ))); + }; + + field.set_name(new_name); + + let new_schema = ArrowSchema::new_with_metadata(fields, schema.metadata().clone()); + + let partition_spec = self.latest_table_spec()?.partition_spec.fields.clone(); + let new_table = self.update_table_spec(new_schema, partition_spec).await?; + Ok(new_table.schema) + } + + /// Promote the type of the given column to the new type in the partitioned namespace. + /// + /// # Arguments + /// + /// * `column` - The column to be promoted. + /// * `new_type` - The new type of the column. + /// + /// Returns the new schema. 
+ pub fn type_promotion(&self, _column: &str, _new_type: &DataType) -> Result { + Err(Error::internal( + "PartitionedNamespace.type_promotion is not implemented", + )) + } + + /// Get the current partition spec + pub async fn partition_spec(&self) -> Result { + Ok(self.latest_table_spec()?.partition_spec.clone()) + } + + /// Get the current/latest table spec + fn latest_table_spec(&self) -> Result<&TableSpec> { + let table = self + .table_specs + .last() + .ok_or_else(|| Error::internal("Could not find any table spec"))?; + Ok(table) + } + + /// Get all unique partition fields + async fn all_partition_fields(&self) -> Result> { + let mut id_set = HashSet::new(); + let mut partition_fields = vec![]; + + for table in self.table_specs.iter() { + for field in table.partition_spec.fields.iter() { + if id_set.insert(field.field_id.clone()) { + partition_fields.push(field.clone()); + } + } + } + Ok(partition_fields) + } + + /// Ensure __manifest has columns for all partition fields. + async fn ensure_partition_fields_exists( + &self, + partition_fields: &[PartitionField], + ) -> Result<()> { + let full_schema = self.manifest.full_manifest_schema().await?; + let existing_fields: HashMap = full_schema + .fields() + .iter() + .map(|f| (f.name().to_string(), f.data_type().clone())) + .collect(); + + let mut to_add: Vec<(String, DataType)> = Vec::new(); + collect_non_existed_fields(&existing_fields, &mut to_add, SPEC_ID_COL, &DataType::Int32)?; + for f in partition_fields.iter() { + collect_non_existed_fields(&existing_fields, &mut to_add, &f.field_id, &f.result_type)?; + } + + if !to_add.is_empty() { + let to_add_param: Vec<(&str, DataType)> = to_add + .iter() + .map(|(k, t)| (k.as_str(), t.clone())) + .collect(); + self.manifest.add_extended_properties(&to_add_param).await?; + } + Ok(()) + } +} + +/// Is the error a retryable transaction commit error. 
+/// +/// When commit transaction, the basic retryable errors are: +/// * Error::RetryableCommitConflict (retryable semantic error) +/// * Error::TooMuchWriteContention (retryable semantic error or timeout error) +/// * Error::CommitConflict (commit conflict error) +/// +/// In lance namespace the retryable basic errors are transformed to NamespaceError::Throttled. +/// +fn is_retryable_commit_conflict(e: &Error) -> bool { + match e { + Error::RetryableCommitConflict { .. } => true, + Error::CommitConflict { .. } => true, + Error::TooMuchWriteContention { .. } => true, + Error::Namespace { source, .. } => source + .downcast_ref::() + .is_some_and(|e| matches!(e, NamespaceError::Throttled { .. })), + _ => false, + } +} + +/// Load table specs from namespaces. Table specs are saved in children-namespaces with name v{i} +/// of the root namespace , where `i` represents the table spec id. +async fn load_table_specs(dir: &DirectoryNamespace) -> Result> { + let list_req = ListNamespacesRequest { + id: Some(vec![]), + page_token: None, + limit: None, + ..Default::default() + }; + let list_resp = dir.list_namespaces(list_req).await?; + + let mut versions: Vec = list_resp + .namespaces + .iter() + .filter_map(|name| parse_spec_namespace_version(name)) + .collect(); + versions.sort(); + + let mut table_defs = vec![]; + for v in versions.iter().copied() { + if let Some(table) = load_table_spec(dir, v).await? { + table_defs.push(table); + } + } + + Ok(table_defs) +} + +/// Load specified table specs from namespaces. 
+async fn load_table_spec(dir: &DirectoryNamespace, v: i32) -> Result> { + let describe_req = DescribeNamespaceRequest { + id: Some(vec![format!("v{}", v)]), + ..Default::default() + }; + let desc = dir.describe_namespace(describe_req).await?; + let props = desc.properties.unwrap_or_default(); + + let mut item_spec = None; + if let Some(spec_json) = props.get(NS_PROP_PARTITION_SPEC) { + let json_partition_spec: JsonPartitionSpec = serde_json::from_str(spec_json.as_str()) + .map_err(|e| { + Error::internal(format!( + "Failed to parse partition spec from v{} namespace properties: {}", + v, e + )) + })?; + let spec = PartitionSpec::from_json(&json_partition_spec)?; + item_spec = Some(spec); + } + + let mut item_schema: Option = None; + if let Some(schema_json) = props.get(NS_PROP_SCHEMA) { + let json_schema: JsonArrowSchema = + serde_json::from_str(schema_json.as_str()).map_err(|e| { + Error::internal(format!( + "Failed to parse schema from v{} namespace properties: {}", + v, e + )) + })?; + let arrow_schema = convert_json_arrow_schema(&json_schema)?; + item_schema = Some(arrow_schema); + } + + if let (Some(schema), Some(spec)) = (item_schema, item_spec) { + Ok(Some(TableSpec::new(v, schema, spec))) + } else { + Ok(None) + } +} + +fn extract_tables(objects: Vec) -> Result> { + let mut tables: Vec = Vec::new(); + for obj in objects { + let ManifestObject::Table(t) = obj else { + continue; + }; + // Only consider partitioned namespace leaf tables. + if t.name != "dataset" { + continue; + } + if t.namespace.is_empty() { + continue; + } + if !t.namespace[0].starts_with('v') { + continue; + } + + let mut id = t.namespace; + id.push(t.name); + tables.push(PartitionTable { + id, + read_version: None, + }); + } + Ok(tables) +} + +/// Parse a SQL filter expression into a DataFusion [`Expr`]. 
+pub async fn parse_filter_expr_from_sql(filter: &str, arrow_schema: &ArrowSchema) -> Result { + let filter = filter.trim(); + if filter.is_empty() { + return Ok(lit(true)); + } + let planner = Planner::new(Arc::new(arrow_schema.clone())); + planner.parse_filter(filter) +} + +/// Transform the first row in record into partition values. +async fn partition_values( + fields: &[PartitionField], + record: &RecordBatch, +) -> Result> { + let mut values: Vec = Vec::with_capacity(fields.len()); + for field in fields.iter() { + let partition_value = field.value(record).await?; + let scalar = ScalarValue::try_from_array(&partition_value, 0).map_err(|e| { + Error::internal(format!( + "Failed to convert partition value for field '{}' to scalar: {}", + field.field_id, e + )) + })?; + values.push(scalar); + } + Ok(values) +} + +/// Transform partition values into manifest filter expressions. +/// +/// Partition values are stored in `__manifest` as extended columns named +/// `partition_field_{field_id}`. +fn partition_expressions( + id: i32, + fields: &[PartitionField], + values: &[ScalarValue], +) -> Result { + if fields.len() != values.len() { + return Err(Error::invalid_input(format!( + "fields len {} must be equal to values len {}", + fields.len(), + values.len() + ))); + } + + let mut partition_expr = col(partition_field_col(SPEC_ID_COL)).eq(lit(id)); + for (field, value) in fields.iter().zip(values.iter()) { + let col_name = partition_field_col(&field.field_id); + let expr = col(col_name).eq(lit(value.clone())); + partition_expr = partition_expr.and(expr); + } + Ok(partition_expr) +} + +fn parse_spec_namespace_version(name: &str) -> Option { + let rest = name.strip_prefix('v')?; + if rest.is_empty() { + None + } else { + rest.parse::().ok() + } +} + +/// Build extended record for partition fields. 
+fn build_partition_extended_record( + spec_version: i32, + fields: &[PartitionField], + partition_values: &[ScalarValue], +) -> Result { + let mut arrow_fields: Vec = Vec::with_capacity(fields.len() + 1); + let mut arrays: Vec = Vec::with_capacity(fields.len() + 1); + + arrow_fields.push(ArrowField::new( + partition_field_col(SPEC_ID_COL), + DataType::Int32, + false, + )); + arrays.push( + ScalarValue::Int32(Some(spec_version)) + .to_array() + .map_err(|e| { + Error::internal(format!( + "Failed to convert spec_version {} to array: {}", + spec_version, e + )) + })?, + ); + + let max = partition_values.len() - 1; + for (idx, f) in fields.iter().enumerate() { + let col_name = partition_field_col(&f.field_id); + arrow_fields.push(ArrowField::new(&col_name, f.result_type.clone(), true)); + + let scalar = if idx > max { + ScalarValue::try_from(&f.result_type).map_err(|e| { + Error::internal(format!( + "Failed to create null scalar for partition field '{}': {}", + f.field_id, e + )) + })? + } else { + partition_values.get(idx).cloned().ok_or_else(|| { + Error::invalid_input(format!( + "partition_values length {} is smaller than required index {}", + partition_values.len(), + idx + )) + })? 
+ }; + + let arr = scalar.to_array().map_err(|e| { + Error::internal(format!( + "Failed to convert scalar for manifest column '{}' to array: {}", + col_name, e + )) + })?; + arrays.push(arr); + } + + let schema = Arc::new(ArrowSchema::new(arrow_fields)); + RecordBatch::try_new(schema, arrays) + .map_err(|e| Error::internal(format!("Failed to create extended record batch: {}", e))) +} + +#[async_trait] +impl LanceNamespace for PartitionedNamespace { + async fn list_namespaces( + &self, + request: ListNamespacesRequest, + ) -> Result { + self.directory.list_namespaces(request).await + } + + async fn describe_namespace( + &self, + request: DescribeNamespaceRequest, + ) -> Result { + self.directory.describe_namespace(request).await + } + + async fn create_namespace( + &self, + request: CreateNamespaceRequest, + ) -> Result { + self.directory.create_namespace(request).await + } + + async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result { + self.directory.drop_namespace(request).await + } + + async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> { + self.directory.namespace_exists(request).await + } + + async fn list_tables(&self, request: ListTablesRequest) -> Result { + self.directory.list_tables(request).await + } + + async fn describe_table(&self, request: DescribeTableRequest) -> Result { + self.directory.describe_table(request).await + } + + async fn register_table(&self, request: RegisterTableRequest) -> Result { + self.directory.register_table(request).await + } + + async fn table_exists(&self, request: TableExistsRequest) -> Result<()> { + self.directory.table_exists(request).await + } + + async fn drop_table(&self, request: DropTableRequest) -> Result { + self.directory.drop_table(request).await + } + + async fn deregister_table( + &self, + request: DeregisterTableRequest, + ) -> Result { + self.directory.deregister_table(request).await + } + + async fn count_table_rows(&self, request: CountTableRowsRequest) -> Result 
{ + self.directory.count_table_rows(request).await + } + + async fn create_table( + &self, + request: CreateTableRequest, + request_data: Bytes, + ) -> Result { + self.directory.create_table(request, request_data).await + } + + async fn declare_table(&self, request: DeclareTableRequest) -> Result { + self.directory.declare_table(request).await + } + + async fn insert_into_table( + &self, + request: InsertIntoTableRequest, + request_data: Bytes, + ) -> Result { + self.directory + .insert_into_table(request, request_data) + .await + } + + async fn merge_insert_into_table( + &self, + request: MergeInsertIntoTableRequest, + request_data: Bytes, + ) -> Result { + self.directory + .merge_insert_into_table(request, request_data) + .await + } + + async fn update_table(&self, request: UpdateTableRequest) -> Result { + self.directory.update_table(request).await + } + + async fn delete_from_table( + &self, + request: DeleteFromTableRequest, + ) -> Result { + self.directory.delete_from_table(request).await + } + + async fn query_table(&self, request: QueryTableRequest) -> Result { + self.directory.query_table(request).await + } + + async fn create_table_index( + &self, + request: CreateTableIndexRequest, + ) -> Result { + self.directory.create_table_index(request).await + } + + async fn list_table_indices( + &self, + request: ListTableIndicesRequest, + ) -> Result { + self.directory.list_table_indices(request).await + } + + async fn describe_table_index_stats( + &self, + request: DescribeTableIndexStatsRequest, + ) -> Result { + self.directory.describe_table_index_stats(request).await + } + + async fn describe_transaction( + &self, + request: DescribeTransactionRequest, + ) -> Result { + self.directory.describe_transaction(request).await + } + + async fn alter_transaction( + &self, + request: AlterTransactionRequest, + ) -> Result { + self.directory.alter_transaction(request).await + } + + async fn create_table_scalar_index( + &self, + request: CreateTableIndexRequest, + ) -> 
Result { + self.directory.create_table_scalar_index(request).await + } + + async fn drop_table_index( + &self, + request: DropTableIndexRequest, + ) -> Result { + self.directory.drop_table_index(request).await + } + + async fn list_all_tables(&self, request: ListTablesRequest) -> Result { + self.directory.list_all_tables(request).await + } + + async fn restore_table(&self, request: RestoreTableRequest) -> Result { + self.directory.restore_table(request).await + } + + async fn rename_table(&self, request: RenameTableRequest) -> Result { + self.directory.rename_table(request).await + } + + async fn list_table_versions( + &self, + request: ListTableVersionsRequest, + ) -> Result { + self.directory.list_table_versions(request).await + } + + async fn update_table_schema_metadata( + &self, + request: UpdateTableSchemaMetadataRequest, + ) -> Result { + self.directory.update_table_schema_metadata(request).await + } + + async fn get_table_stats( + &self, + request: GetTableStatsRequest, + ) -> Result { + self.directory.get_table_stats(request).await + } + + async fn explain_table_query_plan( + &self, + request: ExplainTableQueryPlanRequest, + ) -> Result { + self.directory.explain_table_query_plan(request).await + } + + async fn analyze_table_query_plan( + &self, + request: AnalyzeTableQueryPlanRequest, + ) -> Result { + self.directory.analyze_table_query_plan(request).await + } + + async fn alter_table_add_columns( + &self, + request: AlterTableAddColumnsRequest, + ) -> Result { + self.directory.alter_table_add_columns(request).await + } + + async fn alter_table_alter_columns( + &self, + request: AlterTableAlterColumnsRequest, + ) -> Result { + self.directory.alter_table_alter_columns(request).await + } + + async fn alter_table_drop_columns( + &self, + request: AlterTableDropColumnsRequest, + ) -> Result { + self.directory.alter_table_drop_columns(request).await + } + + async fn list_table_tags( + &self, + request: ListTableTagsRequest, + ) -> Result { + 
self.directory.list_table_tags(request).await + } + + async fn get_table_tag_version( + &self, + request: GetTableTagVersionRequest, + ) -> Result { + self.directory.get_table_tag_version(request).await + } + + async fn create_table_tag( + &self, + request: CreateTableTagRequest, + ) -> Result { + self.directory.create_table_tag(request).await + } + + async fn delete_table_tag( + &self, + request: DeleteTableTagRequest, + ) -> Result { + self.directory.delete_table_tag(request).await + } + + async fn update_table_tag( + &self, + request: UpdateTableTagRequest, + ) -> Result { + self.directory.update_table_tag(request).await + } + + fn namespace_id(&self) -> String { + format!("partitioned(root={})", self.location) + } +} + +/// Builder for creating or loading a [`PartitionedNamespace`]. +/// +/// - If the `DirectoryNamespace` of specified location contains table specs, +/// then [`build`](PartitionedNamespaceBuilder::build) loads the existing partitioned namespace. +/// - Otherwise, it creates a new namespace using the provided schema and initial partition spec. +#[derive(Debug, Default)] +pub struct PartitionedNamespaceBuilder { + location: String, + schema: Option, + partition_spec: Option, + directory: Option, + credential_vendor_properties: HashMap, + context_provider: Option>, +} + +impl PartitionedNamespaceBuilder { + pub fn new(location: impl Into) -> Self { + Self { + location: location.into().trim_end_matches('/').to_string(), + schema: None, + partition_spec: None, + directory: None, + credential_vendor_properties: HashMap::new(), + context_provider: None, + } + } + + /// Use an already constructed [`DirectoryNamespace`] when building or loading. + pub fn directory(mut self, directory: DirectoryNamespace) -> Self { + self.directory = Some(directory); + self + } + + /// Add a credential vendor property. 
+ pub fn credential_vendor_property( + mut self, + key: impl Into, + value: impl Into, + ) -> Self { + self.credential_vendor_properties + .insert(key.into(), value.into()); + self + } + + /// Add multiple credential vendor properties. + pub fn credential_vendor_properties(mut self, properties: HashMap) -> Self { + self.credential_vendor_properties.extend(properties); + self + } + + /// Set a dynamic context provider for per-request context. + pub fn context_provider(mut self, provider: Arc) -> Self { + self.context_provider = Some(provider); + self + } + + pub fn schema(mut self, schema: ArrowSchema) -> Self { + self.schema = Some(schema); + self + } + + pub fn partition_spec(mut self, partition_spec: PartitionSpec) -> Self { + self.partition_spec = Some(partition_spec); + self + } + + /// Load [`PartitionedNamespace`] if already initialized, otherwise create. + pub async fn build(self) -> Result { + let (directory, manifest_ns) = Self::open_directory( + &self.location, + self.directory, + &self.credential_vendor_properties, + self.context_provider, + ) + .await?; + + // Load initialized partitioned namespace. + let tables = load_table_specs(&directory).await?; + if !tables.is_empty() { + for table in tables.iter() { + check_table_spec_consistency(&table.schema, &table.partition_spec.fields)?; + } + return Ok(PartitionedNamespace { + directory, + manifest: manifest_ns, + location: self.location.to_string(), + table_specs: tables, + }); + } + + // Create new. + let schema = self.schema.ok_or_else(|| { + Error::invalid_input("schema is required when creating a new partitioned namespace") + })?; + let partition = self.partition_spec.ok_or_else(|| { + Error::invalid_input( + "partition_spec is required when creating a new partitioned namespace", + ) + })?; + Self::create_new(&self.location, directory, manifest_ns, schema, partition).await + } + + /// Load an existing [`PartitionedNamespace`]. + /// + /// Returns an error if the namespace has not been initialized yet. 
+ pub async fn load(self) -> Result { + let (directory, manifest_ns) = Self::open_directory( + &self.location, + self.directory, + &self.credential_vendor_properties, + self.context_provider, + ) + .await?; + + let tables = load_table_specs(&directory).await?; + if !tables.is_empty() { + Ok(PartitionedNamespace { + directory, + manifest: manifest_ns, + location: self.location.to_string(), + table_specs: tables, + }) + } else { + Err(Error::invalid_input( + "PartitionedNamespace is not initialized", + )) + } + } + + async fn open_directory( + location: &str, + directory: Option, + credential_vendor_properties: &HashMap, + context_provider: Option>, + ) -> Result<(DirectoryNamespace, Arc)> { + if directory.is_some() + && (!credential_vendor_properties.is_empty() || context_provider.is_some()) + { + return Err(Error::invalid_input( + "Cannot set credential_vendor/context_provider when directory is explicitly provided", + )); + } + + let directory = match directory { + Some(d) => d, + None => { + let mut builder = crate::DirectoryNamespaceBuilder::new(location) + .manifest_enabled(true) + .dir_listing_enabled(false) + .inline_optimization_enabled(true); + + for (k, v) in credential_vendor_properties.iter() { + builder = builder.credential_vendor_property(k.clone(), v.clone()); + } + if let Some(provider) = context_provider { + builder = builder.context_provider(provider); + } + + builder.build().await? 
+ } + }; + let manifest_ns = directory.manifest_namespace()?; + Ok((directory, manifest_ns)) + } + + async fn create_new( + location: &str, + directory: DirectoryNamespace, + manifest_ns: Arc, + schema: ArrowSchema, + partition: PartitionSpec, + ) -> Result { + if partition.id != 1 { + return Err(Error::invalid_input("initial partition spec id must be 1")); + } + + let schema = ensure_all_schema_field_have_id(schema)?; + check_table_spec_consistency(&schema, &partition.fields)?; + + let table = TableSpec { + id: partition.id, + schema, + partition_spec: partition.clone(), + }; + + let ns = PartitionedNamespace { + directory, + manifest: manifest_ns, + location: location.to_string(), + table_specs: vec![table.clone()], + }; + ns.force_sink_table_spec(&table).await?; + + Ok(ns) + } +} + +/// Partition table of the partitioned namespace. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PartitionTable { + /// Namespace id path for this partition table, e.g. ["v1", "abc123", "def456"] + pub id: Vec, + /// Optional read version used in strong transaction mode + pub read_version: Option, +} + +/// `TableSpec` represents a version of the table metadata state. +/// It contains an id, a schema and a partition spec. +#[derive(Debug, Clone)] +pub struct TableSpec { + id: i32, + schema: ArrowSchema, + partition_spec: PartitionSpec, +} + +impl TableSpec { + pub fn new(id: i32, schema: ArrowSchema, partition_spec: PartitionSpec) -> Self { + assert_eq!(id, partition_spec.id); + Self { + id, + schema, + partition_spec, + } + } + + pub fn spec_id_str(&self) -> String { + format!("v{}", self.id) + } +} + +/// Partition specification defines how to derive partition values from a record in a partitioned +/// namespace. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PartitionSpec { + /// Spec version id, matching the N in `partition_spec_vN`. + pub id: i32, + /// Fields in this spec in evaluation order. 
+ pub fields: Vec, +} + +impl PartitionSpec { + /// Convert from JSON representation stored in __manifest metadata. + pub fn from_json(json: &JsonPartitionSpec) -> Result { + let mut fields = Vec::with_capacity(json.fields.len()); + for f in &json.fields { + fields.push(PartitionField::from_json(f)?); + } + Ok(Self { + id: json.id, + fields, + }) + } + + /// Convert to JSON representation for storing in __manifest metadata. + pub fn to_json(&self) -> Result { + let fields = self + .fields + .iter() + .map(PartitionField::to_json) + .collect::>>()?; + + Ok(JsonPartitionSpec { + id: self.id, + fields, + }) + } + + pub fn spec_id_str(&self) -> String { + format!("v{}", self.id) + } +} + +/// Supported well-known partition transforms. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum PartitionTransform { + Identity, + Year, + Month, + Day, + Hour, + Bucket { num_buckets: i32 }, + MultiBucket { num_buckets: i32 }, + Truncate { width: i32 }, +} + +fn first_id_name(schema: &ArrowSchema, source_ids: &[i32]) -> Result { + Ok(first_id(schema, source_ids)?.1) +} + +fn first_id(schema: &ArrowSchema, source_ids: &[i32]) -> Result<(FieldRef, String)> { + let id = *source_ids + .first() + .ok_or_else(|| Error::invalid_input("source_ids should have at least one element"))?; + + let (col, field) = schema + .path_and_field_by_id(id)? + .ok_or_else(|| Error::invalid_input(format!("Field id {} not found in schema", id)))?; + + Ok((Arc::new(field), col)) +} + +/// Partition field definition. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct PartitionField { + /// Unique identifier for this partition field + pub field_id: String, + /// Field ids of the source columns in the schema + pub source_ids: Vec, + /// Well-known transform to apply. Exactly one of `transform` or `expression` + /// should be set. + pub transform: Option, + /// Custom SQL expression used when `transform` is not set. 
+ pub expression: Option, + /// Result type of the partition value + pub result_type: DataType, +} + +impl PartitionField { + /// Convert this field into its JSON representation. + pub fn to_json(&self) -> Result { + let transform = self.transform.as_ref().map(|t| match t { + PartitionTransform::Identity => Box::new(JsonPartitionTransform { + r#type: "identity".to_string(), + num_buckets: None, + width: None, + }), + PartitionTransform::Year => Box::new(JsonPartitionTransform { + r#type: "year".to_string(), + num_buckets: None, + width: None, + }), + PartitionTransform::Month => Box::new(JsonPartitionTransform { + r#type: "month".to_string(), + num_buckets: None, + width: None, + }), + PartitionTransform::Day => Box::new(JsonPartitionTransform { + r#type: "day".to_string(), + num_buckets: None, + width: None, + }), + PartitionTransform::Hour => Box::new(JsonPartitionTransform { + r#type: "hour".to_string(), + num_buckets: None, + width: None, + }), + PartitionTransform::Bucket { num_buckets } => Box::new(JsonPartitionTransform { + r#type: "bucket".to_string(), + num_buckets: Some(*num_buckets), + width: None, + }), + PartitionTransform::MultiBucket { num_buckets } => Box::new(JsonPartitionTransform { + r#type: "multi_bucket".to_string(), + num_buckets: Some(*num_buckets), + width: None, + }), + PartitionTransform::Truncate { width } => Box::new(JsonPartitionTransform { + r#type: "truncate".to_string(), + num_buckets: None, + width: Some(*width), + }), + }); + + Ok(JsonPartitionField { + field_id: self.field_id.clone(), + source_ids: self.source_ids.clone(), + transform, + expression: self.expression.clone(), + result_type: Box::new(arrow_type_to_json(&self.result_type)?), + }) + } + + /// Construct a `PartitionField` from its JSON representation. 
+ pub fn from_json(json: &JsonPartitionField) -> Result { + let has_transform = json.transform.is_some(); + let has_expression = json + .expression + .as_ref() + .map(|e| !e.trim().is_empty()) + .unwrap_or(false); + + if has_transform == has_expression { + return Err(lance_core::Error::namespace( + "Exactly one of transform or expression must be set", + )); + } + + let transform = json + .transform + .as_ref() + .map(|t| { + let result = match t.r#type.as_str() { + "identity" => PartitionTransform::Identity, + "year" => PartitionTransform::Year, + "month" => PartitionTransform::Month, + "day" => PartitionTransform::Day, + "hour" => PartitionTransform::Hour, + "bucket" => PartitionTransform::Bucket { + num_buckets: t.num_buckets.unwrap_or(0), + }, + "multi_bucket" => PartitionTransform::MultiBucket { + num_buckets: t.num_buckets.unwrap_or(0), + }, + "truncate" => PartitionTransform::Truncate { + width: t.width.unwrap_or(0), + }, + other => { + return Err(lance_core::Error::namespace(format!( + "Unsupported partition transform: {}", + other + ))); + } + }; + Ok(result) + }) + .transpose()?; + + Ok(Self { + field_id: json.field_id.clone(), + source_ids: json.source_ids.clone(), + transform, + expression: json.expression.clone(), + result_type: convert_json_arrow_type(&json.result_type)?, + }) + } + + /// Parse partition value from the record. The record should contain exactly one row. + pub async fn value(&self, record: &RecordBatch) -> Result> { + if record.num_rows() != 1 { + return Err(Error::invalid_input("record must contain exactly one row")); + } + + let array = match (self.expression.as_ref(), self.transform.as_ref()) { + (Some(expr), None) => parse_partition_value_from_expr(record, expr).await?, + (None, Some(transform)) => { + parse_partition_value_from_transform(&self.source_ids, record, transform).await? 
+ } + _ => { + return Err(Error::internal( + "expression and transform can't both be set or unset", + )); + } + }; + + if array.is_empty() { + return Err(Error::internal( + "partition value is empty, the expression might be invalid", + )); + } + Ok(array) + } + + /// Signature of this field + pub fn signature( + &self, + ) -> ( + Vec, + Option, + Option, + DataType, + ) { + ( + self.source_ids.clone(), + self.transform.clone(), + self.expression.clone(), + self.result_type.clone(), + ) + } + + /// Transform the input predicate (in CDF format) into a filter of `__manifest`. + /// + /// This is a best-effort partition pruning rewrite for a single [`PartitionField`]. + /// It must be conservative: if we cannot safely rewrite a clause, we return a + /// literal TRUE for that clause (i.e. keep all partitions). + /// + /// Per OR-clause (AND of atoms): + /// - If we can rewrite atoms directly against the manifest partition column + /// (identity transform, or well-known `date_part`/`year`/`month`/`day`/`hour` + /// predicates for time transforms), we do so. + /// - Otherwise, if *all* source columns are constrained by equality to literals, + /// we synthesize a single-row [`RecordBatch`], reuse [`PartitionField::value`] + /// to compute the partition value, then rewrite into + /// `partition_field_{field_id} == `. + pub async fn partition_prune(&self, schema: ArrowSchema, cdf: &Vec>) -> Result { + // Resolve source column names from schema and source_ids. 
+ let mut source_col_names: Vec = Vec::with_capacity(self.source_ids.len()); + for source_id in &self.source_ids { + let (path, _) = schema.path_and_field_by_id(*source_id)?.ok_or_else(|| { + Error::invalid_input(format!("Field id {} not found in schema", source_id)) + })?; + source_col_names.push(path.clone()); + } + let source_col_set: HashSet = source_col_names.iter().cloned().collect(); + + let manifest_col = partition_field_col(&self.field_id); + let mut manifest_expr = lit(false); + + for clause in cdf { + // Collect atoms that are relevant to this field (they reference any source column). + let mut relevant_atoms: Vec = Vec::new(); + for atom in clause { + if expr_references_any_column(atom, &source_col_set) { + relevant_atoms.push(atom.clone()); + } + } + + // If this OR-clause doesn't restrict this partition field at all, it + // cannot prune partitions. + if relevant_atoms.is_empty() { + return Ok(lit(true)); + } + + // 1) Try direct rewrites (do not require full equality coverage). 
+ let mut clause_pred = lit(true); + let mut rewrote_any = false; + for atom in &relevant_atoms { + let rewritten = match (&self.transform, &self.expression) { + (Some(PartitionTransform::Identity), None) if source_col_names.len() == 1 => { + rewrite_identity_atom(atom, &source_col_names[0], &manifest_col) + } + (Some(PartitionTransform::Year), None) if source_col_names.len() == 1 => { + rewrite_time_transform_atom( + atom, + &source_col_names[0], + &manifest_col, + "year", + ) + } + (Some(PartitionTransform::Month), None) if source_col_names.len() == 1 => { + rewrite_time_transform_atom( + atom, + &source_col_names[0], + &manifest_col, + "month", + ) + } + (Some(PartitionTransform::Day), None) if source_col_names.len() == 1 => { + rewrite_time_transform_atom( + atom, + &source_col_names[0], + &manifest_col, + "day", + ) + } + (Some(PartitionTransform::Hour), None) if source_col_names.len() == 1 => { + rewrite_time_transform_atom( + atom, + &source_col_names[0], + &manifest_col, + "hour", + ) + } + _ => None, + }; + + // If we cannot rewrite this atom, keep it as TRUE (conservative). + if let Some(expr) = rewritten { + rewrote_any = true; + clause_pred = clause_pred.and(expr); + } else { + clause_pred = clause_pred.and(lit(true)); + } + } + + // If we rewrote at least one atom, we can use it for pruning. + if rewrote_any { + manifest_expr = manifest_expr.or(clause_pred); + continue; + } + + // 2) Try equality-driven value computation. + // This requires that every relevant atom is an equality between a source column + // and a literal, and that all source columns are covered. 
+ let mut eq_map: HashMap = HashMap::new(); + let mut all_relevant_are_eq = true; + + for atom in &relevant_atoms { + if let Some((col_name, scalar)) = extract_eq_on_source_column(atom, &source_col_set) + { + eq_map.insert(col_name, scalar); + } else { + all_relevant_are_eq = false; + break; + } + } + + let covers_all_sources = source_col_names + .iter() + .all(|name| eq_map.contains_key(name)); + + if all_relevant_are_eq + && covers_all_sources + && let Some(rb) = build_single_row_record_batch(&schema, &eq_map) + { + // Compute the partition value using the same logic as table creation. + let arr = self.value(&rb).await?; + if let Ok(scalar) = ScalarValue::try_from_array(&arr, 0) { + let base = col(&manifest_col).eq(lit(scalar)); + manifest_expr = manifest_expr.or(base); + continue; + } + } + + // 3) Can't prune. + return Ok(lit(true)); + } + + Ok(manifest_expr) + } +} + +fn expr_references_any_column(expr: &Expr, cols: &HashSet) -> bool { + match expr { + Expr::Column(c) => cols.contains(&c.name), + Expr::BinaryExpr(b) => { + expr_references_any_column(&b.left, cols) || expr_references_any_column(&b.right, cols) + } + Expr::IsNull(e) | Expr::IsNotNull(e) => expr_references_any_column(e, cols), + Expr::Cast(c) => expr_references_any_column(&c.expr, cols), + Expr::TryCast(c) => expr_references_any_column(&c.expr, cols), + Expr::ScalarFunction(fun) => fun.args.iter().any(|a| expr_references_any_column(a, cols)), + _ => false, + } +} + +fn extract_eq_on_source_column( + atom: &Expr, + source_cols: &HashSet, +) -> Option<(String, ScalarValue)> { + let Expr::BinaryExpr(binary) = atom else { + return None; + }; + if binary.op != Operator::Eq { + return None; + } + match (&*binary.left, &*binary.right) { + (Expr::Column(c), Expr::Literal(v, _)) if source_cols.contains(&c.name) => { + Some((c.name.clone(), v.clone())) + } + (Expr::Literal(v, _), Expr::Column(c)) if source_cols.contains(&c.name) => { + Some((c.name.clone(), v.clone())) + } + _ => None, + } +} + +fn 
rewrite_identity_atom(atom: &Expr, source_col: &str, manifest_col: &str) -> Option { + match atom { + Expr::BinaryExpr(binary) if is_comparison_op(binary.op) => { + match (&*binary.left, &*binary.right) { + (Expr::Column(c), Expr::Literal(v, _)) if c.name == source_col => { + let base = + Expr::BinaryExpr(lance::deps::datafusion::logical_expr::BinaryExpr { + left: Box::new(col(manifest_col)), + op: binary.op, + right: Box::new(Expr::Literal(v.clone(), None)), + }); + Some(base.or(col(manifest_col).is_null())) + } + (Expr::Literal(v, _), Expr::Column(c)) if c.name == source_col => { + let base = + Expr::BinaryExpr(lance::deps::datafusion::logical_expr::BinaryExpr { + left: Box::new(Expr::Literal(v.clone(), None)), + op: binary.op, + right: Box::new(col(manifest_col)), + }); + Some(base.or(col(manifest_col).is_null())) + } + _ => None, + } + } + Expr::IsNull(e) if matches!(e.as_ref(), Expr::Column(c) if c.name == source_col) => { + Some(col(manifest_col).is_null()) + } + Expr::IsNotNull(e) if matches!(e.as_ref(), Expr::Column(c) if c.name == source_col) => { + Some(col(manifest_col).is_not_null()) + } + _ => None, + } +} + +fn rewrite_time_transform_atom( + atom: &Expr, + source_col: &str, + manifest_col: &str, + unit: &str, +) -> Option { + let Expr::BinaryExpr(binary) = atom else { + return None; + }; + if !is_comparison_op(binary.op) { + return None; + } + + let is_matching_transform_call = |expr: &Expr| -> bool { + let Expr::ScalarFunction(fun) = expr else { + return false; + }; + + // Accept either `year(col)` style or `date_part('year', col)` style. 
+ let is_unit_fn = fun.name() == unit && fun.args.len() == 1; + let is_date_part = fun.name() == "date_part" + && fun.args.len() == 2 + && matches!(&fun.args[0], Expr::Literal(v, _) if matches!(v, ScalarValue::Utf8(Some(s)) if s == unit)); + + let col_arg = if is_unit_fn { + fun.args.first() + } else if is_date_part { + fun.args.get(1) + } else { + None + }; + let Some(col_arg) = col_arg else { + return false; + }; + matches!(col_arg, Expr::Column(c) if c.name == source_col) + }; + + // func(col) literal + if is_matching_transform_call(&binary.left) { + let base = Expr::BinaryExpr(lance::deps::datafusion::logical_expr::BinaryExpr { + left: Box::new(col(manifest_col)), + op: binary.op, + right: Box::new(binary.right.as_ref().clone()), + }); + return Some(base.or(col(manifest_col).is_null())); + } + + // literal func(col) + if is_matching_transform_call(&binary.right) { + let base = Expr::BinaryExpr(lance::deps::datafusion::logical_expr::BinaryExpr { + left: Box::new(binary.left.as_ref().clone()), + op: binary.op, + right: Box::new(col(manifest_col)), + }); + return Some(base.or(col(manifest_col).is_null())); + } + + None +} + +// Build a single-row RecordBatch for partition computation. +// +// For each top-level field in `schema`: +// - If there is an exact match in `eq_map`, use the scalar value. +// - Otherwise, use a NULL of the field's type. +// +// `eq_map` may contain nested column paths (e.g. "s.city"). In that case, if a top-level +// field is a struct, we will populate its children recursively. 
+fn build_single_row_record_batch( + schema: &ArrowSchema, + eq_map: &HashMap, +) -> Option { + let rb_schema: SchemaRef = Arc::new(schema.clone()); + let mut arrays: Vec = Vec::with_capacity(schema.fields().len()); + + for f in schema.fields().iter() { + if let Some(arr) = build_single_row_field_array(f, "", eq_map) { + arrays.push(arr); + } else { + return None; + } + } + + RecordBatch::try_new(rb_schema, arrays).ok() +} + +fn build_single_row_field_array( + field: &FieldRef, + prefix: &str, + eq_map: &HashMap, +) -> Option { + let name = field.name(); + let full_name = if prefix.is_empty() { + name.to_string() + } else { + format!("{}.{}", prefix, name) + }; + + if let Some(scalar) = eq_map.get(&full_name) { + // Ensure the literal's array type matches the field type; otherwise, None. + let arr = scalar.to_array_of_size(1).ok()?; + if arr.data_type() != field.data_type() { + return None; + } + return Some(arr); + } + + match field.data_type() { + DataType::Struct(fields) => { + let mut cols: Vec<(FieldRef, ArrayRef)> = Vec::with_capacity(fields.len()); + for child in fields.iter() { + let child_arr = build_single_row_field_array(child, &full_name, eq_map); + if let Some(arr) = child_arr { + cols.push((child.clone(), arr)); + } else { + return None; + } + } + Some(Arc::new(arrow::array::StructArray::from(cols)) as ArrayRef) + } + _ => Some(new_null_array(field.data_type(), 1)), + } +} + +/// Evaluate a partition expression using DataFusion and return the resulting +/// Arrow array (single column) for the given record batch. 
async fn parse_partition_value_from_expr(
    record: &RecordBatch,
    expr: &str,
) -> Result<ArrayRef> {
    // Spin up a throwaway DataFusion session over the single-row batch so that
    // arbitrary SQL expressions — including the custom `murmur3_multi` UDF used
    // by the bucket transforms — are all evaluated through one code path.
    let ctx = SessionContext::new();
    ctx.register_udf(MURMUR3_MULTI_UDF.clone());
    ctx.register_batch("record_batch", record.clone())?;
    // NOTE(review): `expr` (and any column names embedded in it) is spliced
    // into the SQL text verbatim; identifiers containing spaces or quotes
    // would break the query — confirm callers only pass schema-derived names
    // and trusted expressions.
    let df = ctx
        .sql(&format!("SELECT {} FROM record_batch", expr))
        .await?;
    let records = df.collect().await?;
    // A one-row input should produce exactly one output batch; absence means
    // the expression evaluated to nothing (treated as an internal error).
    let partition_batch = records
        .first()
        .ok_or_else(|| Error::internal("expect one row of partition value but got nothing"))?;
    let partition_col = partition_batch.column(0);
    Ok(Arc::clone(partition_col))
}

/// Compute partition values using the transform description.
///
/// Each well-known transform is lowered to an equivalent SQL expression and
/// delegated to [`parse_partition_value_from_expr`].
/// TODO: implement parse logic by code instead of datafusion + expr for better performance.
async fn parse_partition_value_from_transform(
    ids: &[i32],
    record: &RecordBatch,
    transform: &PartitionTransform,
) -> Result<ArrayRef> {
    let schema = record.schema();

    // Map transform to an equivalent SQL expression over the record batch.
    let expr = match transform {
        PartitionTransform::Identity => first_id_name(&schema, ids)?,
        PartitionTransform::Year => format!("date_part('year', {})", first_id_name(&schema, ids)?),
        PartitionTransform::Month => {
            format!("date_part('month', {})", first_id_name(&schema, ids)?)
        }
        PartitionTransform::Day => format!("date_part('day', {})", first_id_name(&schema, ids)?),
        PartitionTransform::Hour => format!("date_part('hour', {})", first_id_name(&schema, ids)?),
        PartitionTransform::Bucket { num_buckets } => {
            if *num_buckets <= 0 {
                return Err(Error::invalid_input(format!(
                    "num_buckets must be positive, got {}",
                    num_buckets
                )));
            }
            // abs() maps the signed hash into a non-negative bucket index.
            format!(
                "abs(murmur3_multi({})) % {}",
                first_id_name(&schema, ids)?,
                num_buckets
            )
        }
        PartitionTransform::MultiBucket { num_buckets } => {
            if *num_buckets <= 0 {
                return Err(Error::invalid_input(format!(
                    "num_buckets must be positive, got {}",
                    num_buckets
                )));
            }

            // Collect all columns in multi-bucket.
            let cols: Vec<String> = ids
                .iter()
                .map(|id| {
                    schema
                        .path_and_field_by_id(*id)?
                        .map(|(p, _)| p)
                        .ok_or_else(|| {
                            Error::invalid_input(format!("Field id {} not found in schema", id))
                        })
                })
                .collect::<Result<Vec<String>>>()?;
            if cols.is_empty() {
                return Err(Error::invalid_input(
                    "source_ids should have at least one element",
                ));
            }

            format!("abs(murmur3_multi({})) % {}", cols.join(", "), num_buckets)
        }
        PartitionTransform::Truncate { width } => {
            if *width <= 0 {
                return Err(Error::invalid_input(format!(
                    "truncate width must be positive, got {}",
                    width
                )));
            }

            let (field, col) = first_id(&schema, ids)?;
            match field.data_type() {
                // Strings truncate by character prefix (SQL substring is 1-based).
                DataType::Utf8 | DataType::LargeUtf8 => {
                    format!("substring({}, 1, {})", col, width)
                }
                // NOTE(review): for signed numeric types, `x - (x % w)` with
                // SQL's truncating `%` rounds toward zero for negative values
                // rather than flooring — confirm this matches the intended
                // truncate-transform semantics.
                _ => format!("{} - ({} % {})", col, col, width),
            }
        }
    };

    parse_partition_value_from_expr(record, &expr).await
}

/// Generate partition namespace id based on partition values.
///
/// This function transforms partition values into string and concat with delimiter.
+/// +/// The format is: v{n}${pv_0_str}${pv_1_str}$...${pv_n_str} +fn create_partition_namespace_id( + spec: &PartitionSpec, + partition_values: &[ScalarValue], +) -> Result> { + let mut buf: Vec = Vec::with_capacity(partition_values.len() + 1); + buf.push(spec.spec_id_str()); + + for (i, v) in partition_values.iter().enumerate() { + let v_str = scalar_to_str(v)?.ok_or_else(|| { + Error::invalid_input(format!( + "partition value with index {} should not be null", + i + )) + })?; + buf.push(v_str); + } + + Ok(buf) +} + +fn collect_non_existed_fields( + existing_fields: &HashMap, + to_add: &mut Vec<(String, DataType)>, + col: &str, + data_type: &DataType, +) -> Result<()> { + if let Some(existing_ty) = existing_fields.get(col) { + if existing_ty != data_type { + return Err(Error::invalid_input(format!( + "Manifest column '{}' already exists with type {:?}, expected {:?}", + col, + existing_ty, + DataType::Int32 + ))); + } + } else { + to_add.push(( + format!("{}{}", EXTENDED_PREFIX, partition_field_col(col)), + data_type.clone(), + )); + } + Ok(()) +} + +/// Partition field is saved as extended properties in __manifest table. We add a prefix +/// 'partition_field_' to it to distinguish from other extended properties. 
+fn partition_field_col(field_id: &str) -> String { + format!("partition_field_{}", field_id) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dir::manifest::TableInfo; + use crate::util::{ensure_all_schema_field_have_id, expr_to_cdf}; + use arrow::array::{ + BinaryArray, Date32Array, Int32Array, RecordBatch, StringArray, StructArray, + }; + use arrow_schema::{Field, Field as ArrowField, Fields, Schema as ArrowSchema, Schema}; + use lance_arrow::LANCE_FIELD_ID_META_KEY; + use lance_core::utils::tempfile::TempStdDir; + use lance_namespace::models::JsonArrowDataType; + use std::collections::{HashMap, HashSet}; + use std::sync::Arc; + use tokio::task::JoinSet; + + impl PartitionedNamespace { + /// Resolve partition namespace through spec_id and partition values. + /// + /// The spec_id and partition values should uniquely point to a single namespace. + async fn resolve_partition_namespace( + &self, + spec: &PartitionSpec, + partition_values: &[ScalarValue], + ) -> Result>> { + let level = partition_values.len(); + + let mut ns_expr = col("object_type").eq(lit("namespace")); + ns_expr = ns_expr.and(col(partition_field_col(SPEC_ID_COL)).eq(lit(spec.id))); + + let partition_expr = + partition_expressions(spec.id, &spec.fields[0..level], partition_values)?; + ns_expr = ns_expr.and(partition_expr); + + for i in level..spec.fields.len() { + let col_name = partition_field_col(&spec.fields.get(i).unwrap().field_id); + ns_expr = ns_expr.and(col(&col_name).is_null()); + } + + let mut ns_ids = vec![]; + let objects = self.manifest.query_manifest_expr(ns_expr).await?; + for object in objects.into_iter() { + if let ManifestObject::Namespace(ns) = object { + if ns.namespace.is_empty() { + continue; + } + if ns.namespace.first().unwrap() != &format!("v{}", spec.id) { + continue; + } + let mut ns_id: Vec = vec![]; + ns_id.extend(ns.namespace); + ns_id.push(ns.name); + ns_ids.push(ns_id); + } else { + continue; + } + } + + if ns_ids.is_empty() { + Ok(None) + } else if 
ns_ids.len() == 1 { + Ok(ns_ids.pop()) + } else { + Err(Error::internal(format!( + "Found multiple partition namespace instance with the same spec id and partition values {:?}", + ns_ids + ))) + } + } + } + + fn const_bool(expr: &Expr) -> Option { + match expr { + Expr::Literal(ScalarValue::Boolean(Some(b)), _) => Some(*b), + Expr::BinaryExpr(b) if b.op == Operator::And => { + Some(const_bool(&b.left)? && const_bool(&b.right)?) + } + Expr::BinaryExpr(b) if b.op == Operator::Or => { + Some(const_bool(&b.left)? || const_bool(&b.right)?) + } + _ => None, + } + } + + fn collect_column_names(expr: &Expr, out: &mut Vec) { + match expr { + Expr::Column(c) => out.push(c.name.clone()), + Expr::BinaryExpr(b) => { + collect_column_names(&b.left, out); + collect_column_names(&b.right, out); + } + Expr::IsNull(e) | Expr::IsNotNull(e) => collect_column_names(e, out), + Expr::Cast(c) => collect_column_names(&c.expr, out), + Expr::TryCast(c) => collect_column_names(&c.expr, out), + Expr::ScalarFunction(fun) => fun.args.iter().for_each(|a| collect_column_names(a, out)), + _ => {} + } + } + + async fn setup_multi_version_namespace( + temp_path: &str, + ) -> ( + PartitionedNamespace, + PartitionSpec, + PartitionSpec, + Vec<(PartitionTable, Vec)>, + Vec<(PartitionTable, Vec)>, + ) { + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("ts", DataType::Date32, true), + ArrowField::new("country", DataType::Utf8, true), + ArrowField::new("business_unit", DataType::Int32, true), + ]); + let schema = arrow_schema.clone(); + + let spec_v1 = PartitionSpec { + id: 1, + fields: vec![make_pf_year("event_year", 0), make_pf_country("country", 1)], + }; + + let mut ns = PartitionedNamespaceBuilder::new(temp_path) + .schema(schema.clone()) + .partition_spec(spec_v1.clone()) + .build() + .await + .unwrap(); + + // v1 tables + let v1_vals_1 = vec![ + ScalarValue::Int32(Some(2020)), + ScalarValue::Utf8(Some("US".to_string())), + ]; + let v1_t1 = ns + .create_partition_table(&spec_v1, 
&v1_vals_1) + .await + .unwrap(); + + let v1_vals_2 = vec![ + ScalarValue::Int32(Some(2021)), + ScalarValue::Utf8(Some("CN".to_string())), + ]; + let v1_t2 = ns + .create_partition_table(&spec_v1, &v1_vals_2) + .await + .unwrap(); + + // evolve to v2 + let spec_v2 = ns + .update_table_spec( + schema, + vec![ + make_pf_business_unit("business_unit", 2), + make_pf_country("country", 1), + ], + ) + .await + .unwrap(); + + // v2 tables + let v2_vals_1 = vec![ + ScalarValue::Int32(Some(1)), + ScalarValue::Utf8(Some("US".to_string())), + ]; + let v2_t1 = ns + .create_partition_table(&spec_v2.partition_spec, &v2_vals_1) + .await + .unwrap(); + + let v2_vals_2 = vec![ + ScalarValue::Int32(Some(2)), + ScalarValue::Utf8(Some("FR".to_string())), + ]; + let v2_t2 = ns + .create_partition_table(&spec_v2.partition_spec, &v2_vals_2) + .await + .unwrap(); + + ( + ns, + spec_v1, + spec_v2.partition_spec, + vec![(v1_t1, v1_vals_1), (v1_t2, v1_vals_2)], + vec![(v2_t1, v2_vals_1), (v2_t2, v2_vals_2)], + ) + } + + #[test] + fn test_partition_field_json_transform() { + let field = PartitionField { + field_id: "event_year".to_string(), + source_ids: vec![1], + transform: Some(PartitionTransform::Year), + expression: None, + result_type: DataType::Int32, + }; + + let json = field.to_json().unwrap(); + assert_eq!(json.field_id, "event_year"); + assert!(json.expression.is_none()); + let transform = json.transform.as_ref().expect("transform should be set"); + assert_eq!(transform.r#type, "year"); + + let other_field = PartitionField::from_json(&json).expect("from_json should succeed"); + assert_eq!(other_field.field_id, "event_year"); + assert_eq!(other_field.source_ids, vec![1]); + assert_eq!(other_field.transform, Some(PartitionTransform::Year)); + assert_eq!(other_field.expression, None); + assert_eq!(other_field.result_type, DataType::Int32); + } + + #[test] + fn test_partition_spec_json_transform() { + let field1 = PartitionField { + field_id: "event_date".to_string(), + source_ids: 
vec![1], + transform: Some(PartitionTransform::Identity), + expression: None, + result_type: DataType::Date32, + }; + let field2 = PartitionField { + field_id: "country".to_string(), + source_ids: vec![2], + transform: None, + expression: Some("col0".to_string()), + result_type: DataType::Utf8, + }; + + let spec = PartitionSpec { + id: 1, + fields: vec![field1, field2], + }; + + let json_spec = spec.to_json().unwrap(); + assert_eq!(json_spec.id, 1); + assert_eq!(json_spec.fields.len(), 2); + + let other_spec = PartitionSpec::from_json(&json_spec).expect("from_json should succeed"); + assert_eq!(other_spec.id, spec.id); + assert_eq!(other_spec.fields.len(), spec.fields.len()); + for (a, b) in other_spec.fields.iter().zip(spec.fields.iter()) { + assert_eq!(a.field_id, b.field_id); + assert_eq!(a.source_ids, b.source_ids); + assert_eq!(a.transform, b.transform); + assert_eq!(a.expression, b.expression); + assert_eq!(a.result_type, b.result_type); + } + } + + #[tokio::test] + async fn test_partition_prune_rewrites_to_manifest_column() { + // Case 1: identity transform (direct rewrite) + { + let arrow_schema = + ArrowSchema::new(vec![ArrowField::new("country", DataType::Utf8, true)]); + let schema = ensure_all_schema_field_have_id(arrow_schema).unwrap(); + + let field = make_pf_country("country", 0); + + let filter = col("country").eq(lit("US")); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema, &cdf).await.unwrap(); + + let mut cols = Vec::new(); + collect_column_names(&manifest_filter, &mut cols); + assert!(cols.contains(&"partition_field_country".to_string())); + assert!(!cols.contains(&"country".to_string())); + } + + // Case 2: time transform (direct rewrite via SQL-parsed scalar functions) + { + let arrow_schema = + ArrowSchema::new(vec![ArrowField::new("ts", DataType::Date32, true)]); + let schema = ensure_all_schema_field_have_id(arrow_schema).unwrap(); + + let field = make_pf_year("event_year", 0); + + // date_part('year', 
ts) = 2020 + let filter = parse_filter_expr_from_sql("date_part('year', ts) = 2020", &schema) + .await + .unwrap(); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema, &cdf).await.unwrap(); + + let mut cols = Vec::new(); + collect_column_names(&manifest_filter, &mut cols); + assert!(cols.contains(&"partition_field_event_year".to_string())); + assert!(!cols.contains(&"ts".to_string())); + } + } + + #[tokio::test] + async fn test_partition_prune_bucket_computes_partition_value() { + // Case 1: single source_id bucket transform, can prune by computing partition value. + { + let arrow_schema = + ArrowSchema::new(vec![ArrowField::new("country", DataType::Utf8, true)]); + let schema = ensure_all_schema_field_have_id(arrow_schema).unwrap(); + + let field = PartitionField { + field_id: "country_bucket".to_string(), + source_ids: vec![0], + transform: Some(PartitionTransform::Bucket { num_buckets: 16 }), + expression: None, + result_type: DataType::Int64, + }; + + let filter = col("country").eq(lit("US")); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema, &cdf).await.unwrap(); + + let mut cols = Vec::new(); + collect_column_names(&manifest_filter, &mut cols); + assert!(cols.contains(&"partition_field_country_bucket".to_string())); + assert!(!cols.contains(&"country".to_string())); + } + + // Case 2: multi source_ids, can prune only when all sources are constrained by equality. 
+ { + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("country", DataType::Utf8, true), + ArrowField::new("business_unit", DataType::Int32, true), + ]); + let schema = ensure_all_schema_field_have_id(arrow_schema).unwrap(); + + let field = PartitionField { + field_id: "mb".to_string(), + source_ids: vec![0, 1], + transform: Some(PartitionTransform::MultiBucket { num_buckets: 16 }), + expression: None, + result_type: DataType::Int64, + }; + + // 2.1 can prune + let filter = col("country") + .eq(lit("US")) + .and(col("business_unit").eq(lit(1i32))); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema.clone(), &cdf).await.unwrap(); + + assert_ne!(const_bool(&manifest_filter), Some(true)); + let mut cols = Vec::new(); + collect_column_names(&manifest_filter, &mut cols); + assert!(cols.contains(&"partition_field_mb".to_string())); + assert!(!cols.contains(&"country".to_string())); + assert!(!cols.contains(&"business_unit".to_string())); + + // 2.2 cannot prune: missing equality on one source + let filter = col("country").eq(lit("US")); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema.clone(), &cdf).await.unwrap(); + assert_eq!(const_bool(&manifest_filter), Some(true)); + + // 2.3 cannot prune: non-equality predicate present + let filter = col("country") + .eq(lit("US")) + .and(col("business_unit").gt(lit(1i32))); + let cdf = expr_to_cdf(&filter); + let manifest_filter = field.partition_prune(schema.clone(), &cdf).await.unwrap(); + assert_eq!(const_bool(&manifest_filter), Some(true)); + } + } + + #[test] + fn test_partition_field_json_expression() { + let json = JsonPartitionField { + field_id: "country".to_string(), + source_ids: vec![2], + transform: None, + expression: Some("col0".to_string()), + result_type: Box::new(JsonArrowDataType::new("utf8".to_string())), + }; + + let field = PartitionField::from_json(&json).expect("from_json should succeed"); + assert_eq!(field.field_id, 
"country"); + assert_eq!(field.source_ids, vec![2]); + assert!(field.transform.is_none()); + assert_eq!(field.expression.as_deref(), Some("col0")); + assert_eq!(field.result_type, DataType::Utf8); + + let json2 = field.to_json().unwrap(); + assert!(json2.transform.is_none()); + assert_eq!(json2.expression.as_deref(), Some("col0")); + assert_eq!(json2.result_type.r#type.to_lowercase(), "utf8"); + } + + #[test] + fn test_partition_field_json_requires_exactly_one_of_transform_or_expression() { + let json = JsonPartitionField { + field_id: "bad".to_string(), + source_ids: vec![1], + transform: Some(Box::new(JsonPartitionTransform { + r#type: "identity".to_string(), + num_buckets: None, + width: None, + })), + expression: Some("col0".to_string()), + result_type: Box::new(JsonArrowDataType::new("int32".to_string())), + }; + + let err = PartitionField::from_json(&json).expect_err("should fail"); + assert!( + err.to_string() + .contains("Exactly one of transform or expression") + ); + } + + #[tokio::test] + async fn test_update_partition_spec_successfully() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let schema_v1 = ArrowSchema::new(vec![ + ArrowField::new("c0", DataType::Date32, true), + ArrowField::new("c1", DataType::Utf8, true), + ]); + let mut ns = create_ns(temp_path, schema_v1, vec![make_pf_year("event_year", 0)]).await; + + let schema_v2 = ArrowSchema::new(vec![ + ArrowField::new("c0", DataType::Date32, true), + ArrowField::new("c1", DataType::Utf8, true), + ArrowField::new("c2", DataType::Int32, true), + ]); + let table_v2 = ns + .update_table_spec( + schema_v2.clone(), + vec![ + make_pf_year("ignored_id", 0), + make_pf_business_unit("business_unit", 2), + ], + ) + .await + .unwrap(); + assert_eq!(table_v2.id, 2); + assert_eq!( + table_v2.schema, + ensure_all_schema_field_have_id(schema_v2.clone()).unwrap() + ); + assert_eq!(table_v2.partition_spec.id, 2); + assert_eq!(table_v2.partition_spec.fields.len(), 2); + + // 
Validate persisted state by reloading. + let reloaded = PartitionedNamespaceBuilder::new(temp_path) + .load() + .await + .unwrap(); + assert_eq!( + reloaded.schema().unwrap(), + ensure_all_schema_field_have_id(schema_v2).unwrap() + ); + assert_eq!( + reloaded.partition_spec().await.unwrap(), + table_v2.partition_spec + ); + } + + #[tokio::test] + async fn test_schema_evolution_add_drop_rename_column() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let schema = ArrowSchema::new(vec![ + ArrowField::new("ts", DataType::Date32, true), + ArrowField::new("country", DataType::Utf8, true), + ]); + let mut ns = create_ns( + temp_path, + schema, + vec![make_pf_year("event_year", 0), make_pf_country("country", 1)], + ) + .await; + assert_eq!(ns.latest_table_spec().unwrap().id, 1); + + let new_col = ArrowField::new("business_unit", DataType::Int32, true); + let schema_v2 = ns.add_column(&new_col).await.unwrap(); + assert!( + schema_v2 + .fields() + .iter() + .any(|f| f.name() == "business_unit") + ); + assert_eq!(ns.latest_table_spec().unwrap().id, 2); + + let schema_v3 = ns.rename_column("country", "region").await.unwrap(); + assert!(schema_v3.fields().iter().any(|f| f.name() == "region")); + assert!(!schema_v3.fields().iter().any(|f| f.name() == "country")); + assert_eq!(ns.latest_table_spec().unwrap().id, 3); + + let schema_v4 = ns.drop_column("business_unit").await.unwrap(); + assert!( + !schema_v4 + .fields() + .iter() + .any(|f| f.name() == "business_unit") + ); + assert_eq!(ns.latest_table_spec().unwrap().id, 4); + + // Cannot drop a column that is referenced by the current partition spec. + assert!(ns.drop_column("region").await.is_err()); + } + + // Reuse field_id when signature matches. 
+ #[tokio::test] + async fn test_update_partition_spec_reuse_field_id() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let schema = ArrowSchema::new(vec![ + ArrowField::new("c0", DataType::Date32, true), + ArrowField::new("c1", DataType::Utf8, true), + ArrowField::new("c2", DataType::Int32, true), + ]); + let mut ns = create_ns( + temp_path, + schema.clone(), + vec![ + make_pf_year("event_year", 0), + make_pf_expr_int32("country", 1, "col0"), + ], + ) + .await; + + // Update table spec then verify + let table_v2 = ns + .update_table_spec( + schema, + vec![ + // Same signature as event_year but incoming id should be overridden. + make_pf_year("should_be_overridden", 0), + // Same signature as country but incoming id should be overridden. + make_pf_expr_int32("should_be_overridden_too", 1, "col0"), + // New field keeps requested id. + make_pf_business_unit("business_unit", 2), + ], + ) + .await + .unwrap(); + + assert_eq!(table_v2.id, 2); + + let year = table_v2 + .partition_spec + .fields + .iter() + .find(|f| f.transform == Some(PartitionTransform::Year)) + .unwrap(); + assert_eq!(year.field_id, "event_year"); + + let country = table_v2 + .partition_spec + .fields + .iter() + .find(|f| f.expression.as_deref() == Some("col0")) + .unwrap(); + assert_eq!(country.field_id, "country"); + } + + #[tokio::test] + async fn test_update_partition_spec_reuse_field_id_error() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let schema = ArrowSchema::new(vec![ + ArrowField::new("c0", DataType::Date32, true), + ArrowField::new("c1", DataType::Int32, true), + ]); + let mut ns = create_ns( + temp_path, + schema.clone(), + vec![make_pf_year("event_year", 0)], + ) + .await; + + let err = ns + .update_table_spec( + schema, + vec![ + // Reuse existing field_id but with a different signature. 
+ make_pf_business_unit("event_year", 1), + ], + ) + .await + .unwrap_err(); + let msg = format!("{}", err); + assert!(msg.contains("already used by another field")); + } + + fn make_pf_year(field_id: &str, source_id: i32) -> PartitionField { + PartitionField { + field_id: field_id.to_string(), + source_ids: vec![source_id], + transform: Some(PartitionTransform::Year), + expression: None, + result_type: DataType::Int32, + } + } + + fn make_pf_country(field_id: &str, source_id: i32) -> PartitionField { + PartitionField { + field_id: field_id.to_string(), + source_ids: vec![source_id], + transform: Some(PartitionTransform::Identity), + expression: None, + result_type: DataType::Utf8, + } + } + + fn make_pf_business_unit(field_id: &str, source_id: i32) -> PartitionField { + PartitionField { + field_id: field_id.to_string(), + source_ids: vec![source_id], + transform: Some(PartitionTransform::Identity), + expression: None, + result_type: DataType::Int32, + } + } + + fn make_pf_expr_int32(field_id: &str, source_id: i32, expression: &str) -> PartitionField { + PartitionField { + field_id: field_id.to_string(), + source_ids: vec![source_id], + transform: None, + expression: Some(expression.to_string()), + result_type: DataType::Int32, + } + } + + async fn create_ns( + temp_path: &str, + schema: ArrowSchema, + fields: Vec, + ) -> PartitionedNamespace { + let initial_spec = PartitionSpec { id: 1, fields }; + PartitionedNamespaceBuilder::new(temp_path) + .schema(schema) + .partition_spec(initial_spec) + .build() + .await + .unwrap() + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_identity() { + let array = Int32Array::from(vec![1]); + let schema = Arc::new( + ensure_all_schema_field_have_id(ArrowSchema::new(vec![Field::new( + "col0", + DataType::Int32, + false, + )])) + .unwrap(), + ); + let batch = RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap(); + + let ids = vec![0]; + let transform = PartitionTransform::Identity; + let expr = 
"col0"; + + let v_expr = parse_partition_value_from_expr(&batch, expr).await.unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_parse_partition_value_nested_struct_field_id() { + // Schema: s: struct + // Field ids after `ensure_schema_field_ids`: + // - s => 0 (top-level) + // - s.city => 1 (nested) + let child_field: FieldRef = Arc::new(ArrowField::new("city", DataType::Utf8, false)); + let struct_field = ArrowField::new( + "s", + DataType::Struct(Fields::from(vec![child_field.clone()])), + false, + ); + + let schema = Arc::new( + ensure_all_schema_field_have_id(ArrowSchema::new(vec![struct_field])).unwrap(), + ); + // Use child field from schema, it has id. + let struct_field = schema.field(0); + let child_field = match struct_field.data_type() { + DataType::Struct(fields) => fields[0].clone(), + _ => panic!("expected struct"), + }; + let city_values = Arc::new(StringArray::from(vec![Some("US")])); + let struct_array = StructArray::from(vec![(child_field, city_values as ArrayRef)]); + + let batch = RecordBatch::try_new(schema, vec![Arc::new(struct_array)]).unwrap(); + + let ids = vec![1]; + let transform = PartitionTransform::Identity; + + let v = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + let arr = v.as_any().downcast_ref::().unwrap(); + assert_eq!(arr.value(0), "US"); + } + + #[test] + fn test_build_single_row_record_batch_with_nested_eq_map() { + let child_field_city: FieldRef = Arc::new(ArrowField::new("city", DataType::Utf8, true)); + let child_field_zip: FieldRef = Arc::new(ArrowField::new("zip", DataType::Utf8, true)); + let struct_field = ArrowField::new( + "s", + DataType::Struct(Fields::from(vec![child_field_city, child_field_zip])), + true, + ); + 
let schema = ArrowSchema::new(vec![struct_field]); + + let mut eq_map = HashMap::new(); + eq_map.insert( + "s.city".to_string(), + ScalarValue::Utf8(Some("US".to_string())), + ); + + let batch = build_single_row_record_batch(&schema, &eq_map).unwrap(); + let s_arr = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let city_arr = s_arr + .column_by_name("city") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(city_arr.value(0), "US"); + let zip_arr = s_arr + .column_by_name("zip") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert!(zip_arr.is_null(0)); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_year_date32() { + let value: i32 = 19723; + let array = Date32Array::from(vec![value]); + let schema = Arc::new( + ensure_all_schema_field_have_id(ArrowSchema::new(vec![Field::new( + "col0", + DataType::Date32, + false, + )])) + .unwrap(), + ); + let batch = RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap(); + + let ids = vec![0]; + let transform = PartitionTransform::Year; + let expr = "date_part('year', col0)"; + + let v_expr = parse_partition_value_from_expr(&batch, expr).await.unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_truncate_utf8() { + let array = StringArray::from(vec!["abcdef"]); + let schema = Arc::new( + ensure_all_schema_field_have_id(ArrowSchema::new(vec![Field::new( + "col0", + DataType::Utf8, + false, + )])) + .unwrap(), + ); + let batch = RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap(); + + let ids = vec![0]; + let width = 3; + let transform = PartitionTransform::Truncate { width }; + let expr = "substring(col0, 1, 3)"; + + let v_expr = 
parse_partition_value_from_expr(&batch, expr).await.unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_truncate_int32() { + let array = Int32Array::from(vec![17]); + let schema = Arc::new( + ensure_all_schema_field_have_id(ArrowSchema::new(vec![Field::new( + "col0", + DataType::Int32, + false, + )])) + .unwrap(), + ); + let batch = RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap(); + + let ids = vec![0]; + let width = 5; + let transform = PartitionTransform::Truncate { width }; + let expr = "col0 - (col0 % 5)"; + + let v_expr = parse_partition_value_from_expr(&batch, expr).await.unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_bucket_binary() { + let data: Vec<&[u8]> = vec![b"abc".as_ref()]; + let array = BinaryArray::from(data); + let schema = Arc::new( + ensure_all_schema_field_have_id(ArrowSchema::new(vec![Field::new( + "col0", + DataType::Binary, + false, + )])) + .unwrap(), + ); + let batch = RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap(); + + let ids = vec![0]; + let num_buckets = 8; + let transform = PartitionTransform::Bucket { num_buckets }; + let expr = format!("abs(murmur3_multi(col0)) % {}", num_buckets); + + let v_expr = parse_partition_value_from_expr(&batch, &expr) + .await + .unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), 
v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_parse_partition_value_transform_vs_expr_multi_bucket_utf8() { + let col0 = StringArray::from(vec!["ab"]); + let col1 = StringArray::from(vec!["12"]); + let schema = Arc::new( + ensure_all_schema_field_have_id(ArrowSchema::new(vec![ + Field::new("col0", DataType::Utf8, false), + Field::new("col1", DataType::Utf8, false), + ])) + .unwrap(), + ); + let batch = RecordBatch::try_new(schema, vec![Arc::new(col0), Arc::new(col1)]).unwrap(); + + let ids = vec![0, 1]; + let num_buckets = 16; + let transform = PartitionTransform::MultiBucket { num_buckets }; + let expr = format!("abs(murmur3_multi(col0, col1)) % {}", num_buckets); + + let v_expr = parse_partition_value_from_expr(&batch, &expr) + .await + .unwrap(); + let v_transform = parse_partition_value_from_transform(&ids, &batch, &transform) + .await + .unwrap(); + + assert_eq!(v_expr.len(), v_transform.len()); + assert_eq!(v_expr.data_type(), v_transform.data_type()); + assert_eq!(v_expr.as_ref(), v_transform.as_ref()); + } + + #[tokio::test] + async fn test_resolve_partition_table_multiple_partition_spec_versions() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let (ns, spec_v1, spec_v2, v1_tables, v2_tables) = + setup_multi_version_namespace(temp_path).await; + + for (t, vals) in v1_tables.iter() { + let resolved = ns + .resolve_partition_table(&spec_v1, vals) + .await + .unwrap() + .expect("should resolve v1 table"); + assert_eq!(&resolved, t); + assert_eq!(resolved.id.first().map(|s| s.as_str()), Some("v1")); + } + + for (t, vals) in v2_tables.iter() { + let resolved = ns + .resolve_partition_table(&spec_v2, vals) + .await + .unwrap() + .expect("should resolve v2 table"); + assert_eq!(&resolved, t); + assert_eq!(resolved.id.first().map(|s| s.as_str()), Some("v2")); + } + + // missing partition should 
return None + let missing = vec![ + ScalarValue::Int32(Some(1999)), + ScalarValue::Utf8(Some("US".to_string())), + ]; + let resolved = ns + .resolve_partition_table(&spec_v1, &missing) + .await + .unwrap(); + assert!(resolved.is_none()); + } + + #[tokio::test] + async fn test_create_partition_table_multiple_versions_manifest_properties() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // create partition table + let (ns, _spec_v1, _spec_v2, v1_tables, v2_tables) = + setup_multi_version_namespace(temp_path).await; + + // verify created partition tables + let objects = ns + .manifest + .query_manifest_expr(col("object_type").eq(lit("table"))) + .await + .unwrap(); + + let mut tables_by_id: HashMap = HashMap::new(); + let mut v1_count = 0usize; + let mut v2_count = 0usize; + for obj in objects { + let ManifestObject::Table(t) = obj else { + continue; + }; + if t.name != "dataset" { + continue; + } + if t.namespace.first().map(|s| s.as_str()) == Some("v1") { + v1_count += 1; + } + if t.namespace.first().map(|s| s.as_str()) == Some("v2") { + v2_count += 1; + } + + let mut id = t.namespace.clone(); + id.push(t.name.clone()); + tables_by_id.insert(id.join("."), t); + } + + assert_eq!(v1_count, v1_tables.len()); + assert_eq!(v2_count, v2_tables.len()); + + // Check one v1 table has expected extended props and no v2-only field + let (t_v1, vals_v1) = &v1_tables[0]; + let tbl = tables_by_id + .get(&t_v1.id.join(".")) + .expect("v1 table should exist in manifest"); + let props = tbl.properties.as_ref().expect("properties should exist"); + let v1_year = scalar_to_str(&vals_v1[0]).unwrap().unwrap(); + assert_eq!( + props + .get(&format!("{}partition_field_event_year", EXTENDED_PREFIX)) + .map(|s| s.as_str()), + Some(v1_year.as_str()) + ); + assert_eq!( + props + .get(&format!("{}partition_field_country", EXTENDED_PREFIX)) + .map(|s| s.as_str()), + Some("US") + ); + 
assert!(!props.contains_key(&format!("{}partition_field_business_unit", EXTENDED_PREFIX))); + + // Check one v2 table has expected extended props including v2-only field + let (t_v2, vals_v2) = &v2_tables[0]; + let tbl = tables_by_id + .get(&t_v2.id.join(".")) + .expect("v2 table should exist in manifest"); + let props = tbl.properties.as_ref().expect("properties should exist"); + let v2_bu = crate::dir::manifest::scalar_to_str(&vals_v2[0]) + .unwrap() + .unwrap(); + let v2_co = crate::dir::manifest::scalar_to_str(&vals_v2[1]) + .unwrap() + .unwrap(); + assert_eq!( + props + .get(&format!("{}partition_field_event_year", EXTENDED_PREFIX)) + .map(|s| s.as_str()), + None + ); + assert_eq!( + props + .get(&format!("{}partition_field_business_unit", EXTENDED_PREFIX)) + .map(|s| s.as_str()), + Some(v2_bu.as_str()) + ); + assert_eq!( + props + .get(&format!("{}partition_field_country", EXTENDED_PREFIX)) + .map(|s| s.as_str()), + Some(v2_co.as_str()) + ); + } + + #[tokio::test] + async fn test_plan_scan_on_missing_columns() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // v1 schema does NOT have column business_unit. + let schema_v1 = ArrowSchema::new(vec![ + ArrowField::new("ts", DataType::Date32, true), + ArrowField::new("country", DataType::Utf8, true), + ]); + let spec_v1 = PartitionSpec { + id: 1, + fields: vec![make_pf_year("event_year", 0), make_pf_country("country", 1)], + }; + let mut ns = PartitionedNamespaceBuilder::new(temp_path) + .schema(schema_v1.clone()) + .partition_spec(spec_v1.clone()) + .build() + .await + .unwrap(); + + // Create v1 tables. + let v1_vals_1 = vec![ + ScalarValue::Int32(Some(2020)), + ScalarValue::Utf8(Some("US".to_string())), + ]; + let v1_t1 = ns + .create_partition_table(&spec_v1, &v1_vals_1) + .await + .unwrap(); + + // Evolve schema + partition spec to v2: add business_unit partition field. 
+ let schema_v2 = ArrowSchema::new(vec![ + ArrowField::new("ts", DataType::Date32, true), + ArrowField::new("country", DataType::Utf8, true), + ArrowField::new("business_unit", DataType::Int32, true), + ]); + let spec_v2 = ns + .update_table_spec(schema_v2, spec_v1.fields.clone()) + .await + .unwrap(); + + let v2_vals_1 = vec![ + ScalarValue::Int32(Some(1)), + ScalarValue::Utf8(Some("US".to_string())), + ]; + let v2_t1 = ns + .create_partition_table(&spec_v2.partition_spec, &v2_vals_1) + .await + .unwrap(); + + // business_unit IS NOT NULL should prune ALL v1 tables. + let filter = col("business_unit").is_not_null(); + let planned = ns.plan_scan(&filter).await.unwrap(); + let got: HashSet = planned.into_iter().map(|(t, _)| t.id.join(".")).collect(); + + let expected: HashSet = [v2_t1.id.join(".")].into_iter().collect(); + assert_eq!(got, expected); + + // Explicitly verify v1 tables are pruned. + assert!(!got.contains(&v1_t1.id.join("."))); + } + + #[tokio::test] + async fn test_plan_scan_multiple_partition_spec_versions() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let (ns, _spec_v1, _spec_v2, v1_tables, v2_tables) = + setup_multi_version_namespace(temp_path).await; + + let v1_us_2020 = &v1_tables[0].0; + let v1_cn_2021 = &v1_tables[1].0; + let v2_us_2020_bu1 = &v2_tables[0].0; + let v2_fr_2022_bu2 = &v2_tables[1].0; + + // (country = 'US') should match exactly the two US/2020 tables. + let filter = col("country").eq(lit("US")); + let planned = ns.plan_scan(&filter).await.unwrap(); + let got: HashSet = planned.into_iter().map(|(t, _)| t.id.join(".")).collect(); + + let expected: HashSet = [v1_us_2020.id.join("."), v2_us_2020_bu1.id.join(".")] + .into_iter() + .collect(); + assert_eq!(got, expected); + + // business_unit = 1 should include v2 bu=1 table and all v1 tables (NULL => conservative keep), but not v2 bu=2. 
+ let filter = col("business_unit").eq(lit(1i32)); + let planned = ns.plan_scan(&filter).await.unwrap(); + let got: HashSet = planned.into_iter().map(|(t, _)| t.id.join(".")).collect(); + let expected: HashSet = [ + v2_us_2020_bu1.id.join("."), + v1_us_2020.id.join("."), + v1_cn_2021.id.join("."), + ] + .into_iter() + .collect(); + assert_eq!(got, expected); + + // (business_unit = 1) AND (country = 'US') should prune away v1 CN table. + let filter = col("business_unit") + .eq(lit(1i32)) + .and(col("country").eq(lit("US"))); + let planned = ns.plan_scan(&filter).await.unwrap(); + let got: HashSet = planned.into_iter().map(|(t, _)| t.id.join(".")).collect(); + let expected: HashSet = [v1_us_2020.id.join("."), v2_us_2020_bu1.id.join(".")] + .into_iter() + .collect(); + assert_eq!(got, expected); + + // (year(ts) = 2020) should match v1 2020 and all v2 tables. + let arrow_schema = ns.schema().unwrap(); + let filter = parse_filter_expr_from_sql("date_part('year', ts)=2020", &arrow_schema) + .await + .unwrap(); + let planned = ns.plan_scan(&filter).await.unwrap(); + let got: HashSet = planned.into_iter().map(|(t, _)| t.id.join(".")).collect(); + + let expected: HashSet = [ + v1_us_2020.id.join("."), + v2_fr_2022_bu2.id.join("."), + v2_us_2020_bu1.id.join("."), + ] + .into_iter() + .collect(); + assert_eq!(got, expected); + } + + #[tokio::test] + async fn test_create_partition_table() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let ns = create_partitioned_namespace_country_year(temp_path).await; + let spec = ns.partition_spec().await.unwrap(); + + // Pre-create namespaces for US (level0) and US/2020 (level1), but no table. + let us_2020_values = vec![ + ScalarValue::Utf8(Some("US".to_string())), + ScalarValue::Int32(Some(2020)), + ]; + ns.create_missing_partition_namespaces(&spec, &us_2020_values) + .await + .unwrap(); + + // Before: namespaces exist, but no table. 
+ let before_us = list_ns_ids_by_country_year(&ns, "US", None).await; + assert_eq!(before_us.len(), 1); + assert_eq!(before_us[0].0, vec![spec.spec_id_str()]); + assert_eq!(before_us[0].1, "US"); + + let before_us_2020 = list_ns_ids_by_country_year(&ns, "US", Some(2020)).await; + assert_eq!(before_us_2020.len(), 1); + assert_eq!( + before_us_2020[0].0, + vec![spec.spec_id_str(), "US".to_string()] + ); + assert_eq!(before_us_2020[0].1, "2020"); + + let tables_before = ns + .manifest + .query_manifest_expr(col("object_type").eq(lit("table"))) + .await + .unwrap(); + assert!(tables_before.is_empty()); + + // Create US/2021 table. + let us_2021_values = vec![ + ScalarValue::Utf8(Some("US".to_string())), + ScalarValue::Int32(Some(2021)), + ]; + let resp = ns + .create_partition_table(&spec, &us_2021_values) + .await + .unwrap(); + let location = ns + .describe_table(DescribeTableRequest { + id: Some(resp.id), + ..Default::default() + }) + .await + .unwrap() + .location + .unwrap(); + let ds = Dataset::open(&location).await.unwrap(); + // Dataset schema doesn't carry field-id metadata. Normalize before comparing. + assert_eq!( + ensure_all_schema_field_have_id(ArrowSchema::from(ds.schema())).unwrap(), + ns.schema().unwrap() + ); + + // US namespace should be reused: still exactly one, and same object_id. + let after_us = list_ns_ids_by_country_year(&ns, "US", None).await; + assert_eq!(after_us.len(), 1); + assert_eq!(after_us, before_us); + + // US/2020 namespace should remain unchanged. + let after_us_2020 = list_ns_ids_by_country_year(&ns, "US", Some(2020)).await; + assert_eq!(after_us_2020.len(), 1); + assert_eq!(after_us_2020, before_us_2020); + + // Table for US/2021 exists; US/2020 table still does not exist. 
+ let us_2021_tables = ns + .manifest + .query_manifest_expr( + col("object_type") + .eq(lit("table")) + .and(col("partition_field_country").eq(lit("US"))) + .and(col("partition_field_year").eq(lit(2021i32))), + ) + .await + .unwrap(); + assert_eq!(us_2021_tables.len(), 1); + + let us_2020_tables = ns + .manifest + .query_manifest_expr( + col("object_type") + .eq(lit("table")) + .and(col("partition_field_country").eq(lit("US"))) + .and(col("partition_field_year").eq(lit(2020i32))), + ) + .await + .unwrap(); + assert!(us_2020_tables.is_empty()); + } + + #[tokio::test] + async fn test_create_missing_partition_namespaces() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + let ns = create_partitioned_namespace_country_year(temp_path).await; + let spec = ns.partition_spec().await.unwrap(); + + // When creating missing namespaces, it should use deterministic ids derived from + // (spec_id, partition_values). + let values = vec![ + ScalarValue::Utf8(Some("US".to_string())), + ScalarValue::Int32(Some(2020)), + ]; + let got_1 = ns + .create_missing_partition_namespaces(&spec, &values) + .await + .unwrap(); + let got_2 = ns + .create_missing_partition_namespaces(&spec, &values) + .await + .unwrap(); + assert_eq!(got_1, got_2); + + let expected = vec![spec.spec_id_str(), "US".to_string(), "2020".to_string()]; + assert_eq!(got_1, expected); + + // Deterministic object_id should exist in manifest. 
+ let object_id = got_1.join(DELIMITER); + let objs = ns + .manifest + .query_manifest_expr( + col("object_id") + .eq(lit(object_id)) + .and(col("object_type").eq(lit("namespace"))), + ) + .await + .unwrap(); + assert_eq!(objs.len(), 1); + } + + async fn create_ns_and_record( + temp_path: &str, + values: Vec, + ) -> Result)>> { + let col1 = ScalarValue::iter_to_array(vec![values.first().unwrap().clone()])?; + let col2 = ScalarValue::iter_to_array(vec![values.get(1).unwrap().clone()])?; + let mut col1_meta = HashMap::new(); + col1_meta.insert(LANCE_FIELD_ID_META_KEY.to_string(), "0".to_string()); + let mut col2_meta = HashMap::new(); + col2_meta.insert(LANCE_FIELD_ID_META_KEY.to_string(), "1".to_string()); + let schema = Arc::new(Schema::new(vec![ + Field::new("country", DataType::Utf8, true).with_metadata(col1_meta), + Field::new("year", DataType::Int32, true).with_metadata(col2_meta), + ])); + let batch = RecordBatch::try_new(schema, vec![col1, col2])?; + let mut ns_list: Vec<(PartitionedNamespace, RecordBatch, Vec)> = vec![]; + for _ in 0..8 { + let ns = create_partitioned_namespace_country_year(temp_path).await; + ns_list.push((ns, batch.clone(), values.clone())); + } + Ok(ns_list) + } + + async fn create_missing_table_same_id( + temp_path: &str, + values_list: Vec>, + ) -> Result<()> { + let ns = create_partitioned_namespace_country_year(temp_path).await; + let spec = ns.partition_spec().await.unwrap(); + ns.ensure_partition_fields_exists(&spec.fields).await?; + + let mut ns_list: Vec<(PartitionedNamespace, RecordBatch, Vec)> = vec![]; + for values in values_list.into_iter() { + let ns_record_list = create_ns_and_record(temp_path, values).await?; + ns_list.extend(ns_record_list); + } + + // Execute concurrent create. 
+ let mut set = JoinSet::new(); + for (ns, batch, values) in ns_list.into_iter() { + set.spawn(async move { + let table = ns.resolve_or_create_partition_table(&batch).await.unwrap(); + (table, values) + }); + } + + let mut table_values_res: Vec<(PartitionTable, Vec)> = vec![]; + while let Some(res) = set.join_next().await { + table_values_res.push(res?); + } + + for (created_table, values) in table_values_res.into_iter() { + let table = ns.resolve_partition_table(&spec, &values).await?.unwrap(); + assert_eq!(table.id, created_table.id); + } + + Ok(()) + } + + #[tokio::test] + async fn test_resolve_or_create_missing_table_same_id() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let values = vec![ + vec![ + ScalarValue::Utf8(Some("US".to_string())), + ScalarValue::Int32(Some(2020)), + ], + vec![ + ScalarValue::Utf8(Some("US".to_string())), + ScalarValue::Int32(Some(2021)), + ], + vec![ + ScalarValue::Utf8(Some("US".to_string())), + ScalarValue::Int32(Some(2022)), + ], + ]; + create_missing_table_same_id(temp_path, values) + .await + .unwrap(); + } + + async fn create_missing_partition_namespaces_same_id( + temp_path: &str, + values_list: Vec>, + ) -> Result<()> { + let ns = create_partitioned_namespace_country_year(temp_path).await; + let spec = ns.partition_spec().await?; + ns.ensure_partition_fields_exists(&spec.fields).await?; + + // Create multiple PartitionedNamespace instances per values group. + let mut tasks: Vec<(PartitionedNamespace, Vec)> = Vec::new(); + for values in values_list.into_iter() { + for _ in 0..8 { + let ns_i = create_partitioned_namespace_country_year(temp_path).await; + tasks.push((ns_i, values.clone())); + } + } + + // Execute concurrent create. 
+ let spec = Arc::new(spec); + let mut set = JoinSet::new(); + for (ns_i, values) in tasks.into_iter() { + let spec = Arc::clone(&spec); + set.spawn(async move { + let created = ns_i + .create_missing_partition_namespaces(&spec, &values) + .await + .unwrap(); + (created, values) + }); + } + + let mut created: Vec<(Vec, Vec)> = Vec::new(); + while let Some(res) = set.join_next().await { + created.push(res?); + } + + for (created_id, values) in created.into_iter() { + let expected = create_partition_namespace_id(&spec, &values)?; + assert_eq!(created_id, expected); + + // Resolve partition namespace from __manifest by partition values. + let leaf = ns + .resolve_partition_namespace(&spec, &values) + .await? + .unwrap(); + assert_eq!(leaf, expected); + } + + Ok(()) + } + + #[tokio::test] + async fn test_create_missing_partition_namespaces_concurrent_same_id() { + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let values = vec![ + vec![ + ScalarValue::Utf8(Some("US".to_string())), + ScalarValue::Int32(Some(2020)), + ], + vec![ + ScalarValue::Utf8(Some("US".to_string())), + ScalarValue::Int32(Some(2021)), + ], + vec![ + ScalarValue::Utf8(Some("US".to_string())), + ScalarValue::Int32(Some(2022)), + ], + ]; + + create_missing_partition_namespaces_same_id(temp_path, values) + .await + .unwrap(); + } + + async fn create_partitioned_namespace_country_year(temp_path: &str) -> PartitionedNamespace { + let arrow_schema = ArrowSchema::new(vec![ + ArrowField::new("country", DataType::Utf8, true), + ArrowField::new("year", DataType::Int32, true), + ]); + let schema = arrow_schema.clone(); + + let spec = PartitionSpec { + id: 1, + fields: vec![ + PartitionField { + field_id: "country".to_string(), + source_ids: vec![0], + transform: Some(PartitionTransform::Identity), + expression: None, + result_type: DataType::Utf8, + }, + PartitionField { + field_id: "year".to_string(), + source_ids: vec![1], + transform: 
Some(PartitionTransform::Identity), + expression: None, + result_type: DataType::Int32, + }, + ], + }; + + let ns = PartitionedNamespaceBuilder::new(temp_path) + .schema(schema) + .partition_spec(spec.clone()) + .build() + .await + .unwrap(); + ns.ensure_partition_fields_exists(&spec.fields) + .await + .unwrap(); + + ns + } + + async fn list_ns_ids_by_country_year( + ns: &PartitionedNamespace, + country: &str, + year: Option, + ) -> Vec<(Vec, String)> { + let mut pred = col("object_type") + .eq(lit("namespace")) + .and(col("partition_field_country").eq(lit(country))); + pred = match year { + Some(y) => pred.and(col("partition_field_year").eq(lit(y))), + None => pred.and(col("partition_field_year").is_null()), + }; + + ns.manifest + .query_manifest_expr(pred) + .await + .unwrap() + .into_iter() + .filter_map(|obj| match obj { + ManifestObject::Namespace(n) => Some((n.namespace, n.name)), + _ => None, + }) + .collect() + } +} diff --git a/rust/lance-namespace-impls/src/udf.rs b/rust/lance-namespace-impls/src/udf.rs new file mode 100644 index 00000000000..a50fa3cec68 --- /dev/null +++ b/rust/lance-namespace-impls/src/udf.rs @@ -0,0 +1,163 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +/// DataFusion UDFs used by partitioned namespace implementation. 
+use arrow::array::{ + Array, ArrayRef, BinaryArray, Date32Array, Date64Array, Float32Array, Float64Array, Int8Array, + Int16Array, Int32Array, Int32Builder, Int64Array, LargeBinaryArray, LargeStringArray, + StringArray, TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt8Array, UInt16Array, UInt32Array, UInt64Array, +}; +use arrow::util::display::array_value_to_string; +use arrow_schema::DataType; +use datafusion_functions::utils::make_scalar_function; +use lance::deps::datafusion::error::DataFusionError; +use lance::deps::datafusion::logical_expr::{ScalarUDF, Signature, SimpleScalarUDF, Volatility}; +use lance_core::Error; +use std::sync::{Arc, LazyLock}; + +/// A variadic murmur3 UDF. +/// +/// - Accepts any number of arguments (>= 1) +/// - Accepts any argument types (bytes are derived via `scalar_to_bytes`) +/// - Skips NULL arguments; returns NULL if all arguments are NULL for a row +fn murmur3_multi() -> ScalarUDF { + let function = Arc::new(make_scalar_function( + |args: &[ArrayRef]| { + if args.is_empty() { + return Err(DataFusionError::Execution( + "murmur3_multi expects at least 1 argument".to_string(), + )); + } + + let len = args[0].len(); + for a in args.iter().skip(1) { + if a.len() != len { + return Err(DataFusionError::Execution( + "All arguments to murmur3_multi must have the same length".to_string(), + )); + } + } + + let mut builder = Int32Builder::new(); + for row in 0..len { + let mut buf = Vec::new(); + let mut has_value = false; + + for col in args { + let array = col.as_ref(); + if array.is_null(row) { + continue; + } + has_value = true; + let value_bytes = scalar_to_bytes(array, row) + .map_err(|e| DataFusionError::Execution(e.to_string()))?; + buf.extend_from_slice(&value_bytes); + } + + if !has_value { + builder.append_null(); + continue; + } + + let hash = murmur3::murmur3_32(&mut std::io::Cursor::new(&buf), 0)? 
as i32; + builder.append_value(hash); + } + + Ok(Arc::new(builder.finish()) as ArrayRef) + }, + vec![], + )); + + ScalarUDF::from(SimpleScalarUDF::new_with_signature( + "murmur3_multi", + Signature::variadic_any(Volatility::Immutable), + DataType::Int32, + function, + )) +} + +pub(crate) fn scalar_to_bytes(array: &dyn Array, row: usize) -> lance_core::Result> { + if array.is_null(row) { + return Ok(Vec::new()); + } + + macro_rules! to_bytes_primitive { + ($array_ty:ty, $array:expr, $row:expr) => {{ + let a = $array.as_any().downcast_ref::<$array_ty>().ok_or_else(|| { + Error::invalid_input(format!( + "Expected array type '{}' but got '{:?}'", + stringify!($array_ty), + $array.data_type() + )) + })?; + a.value($row).to_le_bytes().to_vec() + }}; + } + + macro_rules! to_bytes_utf8_like { + ($array_ty:ty, $array:expr, $row:expr) => {{ + let a = $array.as_any().downcast_ref::<$array_ty>().ok_or_else(|| { + Error::invalid_input(format!( + "Expected array type '{}' but got '{:?}'", + stringify!($array_ty), + $array.data_type() + )) + })?; + a.value($row).as_bytes().to_vec() + }}; + } + + macro_rules! 
to_bytes_binary_like { + ($array_ty:ty, $array:expr, $row:expr) => {{ + let a = $array.as_any().downcast_ref::<$array_ty>().ok_or_else(|| { + Error::invalid_input(format!( + "Expected array type '{}' but got '{:?}'", + stringify!($array_ty), + $array.data_type() + )) + })?; + a.value($row).to_vec() + }}; + } + + let dt = array.data_type(); + let bytes = match dt { + DataType::Int8 => to_bytes_primitive!(Int8Array, array, row), + DataType::Int16 => to_bytes_primitive!(Int16Array, array, row), + DataType::Int32 => to_bytes_primitive!(Int32Array, array, row), + DataType::Int64 => to_bytes_primitive!(Int64Array, array, row), + DataType::UInt8 => to_bytes_primitive!(UInt8Array, array, row), + DataType::UInt16 => to_bytes_primitive!(UInt16Array, array, row), + DataType::UInt32 => to_bytes_primitive!(UInt32Array, array, row), + DataType::UInt64 => to_bytes_primitive!(UInt64Array, array, row), + DataType::Float32 => to_bytes_primitive!(Float32Array, array, row), + DataType::Float64 => to_bytes_primitive!(Float64Array, array, row), + DataType::Date32 => to_bytes_primitive!(Date32Array, array, row), + DataType::Date64 => to_bytes_primitive!(Date64Array, array, row), + DataType::Timestamp(arrow_schema::TimeUnit::Second, _) => { + to_bytes_primitive!(TimestampSecondArray, array, row) + } + DataType::Timestamp(arrow_schema::TimeUnit::Millisecond, _) => { + to_bytes_primitive!(TimestampMillisecondArray, array, row) + } + DataType::Timestamp(arrow_schema::TimeUnit::Microsecond, _) => { + to_bytes_primitive!(TimestampMicrosecondArray, array, row) + } + DataType::Timestamp(arrow_schema::TimeUnit::Nanosecond, _) => { + to_bytes_primitive!(TimestampNanosecondArray, array, row) + } + DataType::Utf8 => to_bytes_utf8_like!(StringArray, array, row), + DataType::LargeUtf8 => to_bytes_utf8_like!(LargeStringArray, array, row), + DataType::Binary => to_bytes_binary_like!(BinaryArray, array, row), + DataType::LargeBinary => to_bytes_binary_like!(LargeBinaryArray, array, row), + _ => { + let s 
= array_value_to_string(array, row).map_err(lance_core::Error::from)?; + s.into_bytes() + } + }; + + Ok(bytes) +} + +pub static MURMUR3_MULTI_UDF: LazyLock = LazyLock::new(murmur3_multi); diff --git a/rust/lance-namespace-impls/src/util.rs b/rust/lance-namespace-impls/src/util.rs new file mode 100644 index 00000000000..df7df2df075 --- /dev/null +++ b/rust/lance-namespace-impls/src/util.rs @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors +// NOTE: Keep this module warning-clean; avoid `#![allow(unused)]`. + +use crate::partition::PartitionField; +use arrow::datatypes::{Field as ArrowField, Schema as ArrowSchema}; +use arrow_schema::{DataType, FieldRef, Fields}; +use lance::deps::datafusion::common::ScalarValue; +use lance::deps::datafusion::logical_expr::{Expr, Operator}; +use lance_arrow::{LANCE_FIELD_ID_META_KEY, SchemaExt}; +use lance_core::Error; +use std::collections::HashSet; +use std::sync::Arc; + +#[derive(Clone, Copy, Debug, PartialEq, Eq)] +pub enum Truthiness { + AlwaysTrue, + AlwaysFalse, + Unknown, +} + +impl Truthiness { + fn and(self, other: Self) -> Self { + match (self, other) { + (Self::Unknown, _) | (_, Self::Unknown) => Self::Unknown, + (Self::AlwaysTrue, Self::AlwaysTrue) => Self::AlwaysTrue, + (_, _) => Self::AlwaysFalse, + } + } + + fn or(self, other: Self) -> Self { + match (self, other) { + (Self::Unknown, _) | (_, Self::Unknown) => Self::Unknown, + (Self::AlwaysFalse, Self::AlwaysFalse) => Self::AlwaysFalse, + (_, _) => Self::AlwaysTrue, + } + } + + fn not(self) -> Self { + match self { + Self::AlwaysTrue => Self::AlwaysFalse, + Self::AlwaysFalse => Self::AlwaysTrue, + Self::Unknown => Self::Unknown, + } + } +} + +/// Return `True` if the input cdf is always false. +pub fn is_cdf_always_false(schema: &ArrowSchema, cdf: &Vec>) -> bool { + for clause in cdf { + // If any clause is not provably false, then the whole CDF is not const-false. 
+ // (We are conservative: UNKNOWN => not const-false.) + let mut clause_truth = Truthiness::AlwaysTrue; + for atom in clause { + let atom_t = expr_truthiness(schema, atom); + clause_truth = match (clause_truth, atom_t) { + (Truthiness::AlwaysFalse, _) | (_, Truthiness::AlwaysFalse) => { + Truthiness::AlwaysFalse + } + (Truthiness::AlwaysTrue, Truthiness::AlwaysTrue) => Truthiness::AlwaysTrue, + _ => Truthiness::Unknown, + }; + if clause_truth == Truthiness::AlwaysFalse { + break; + } + } + + if clause_truth != Truthiness::AlwaysFalse { + return false; + } + } + true +} + +/// Static analyze of expr. The result is always_true, always_false or unknown. +/// If the column doesn't exist in schema, it is treated as value null. +pub fn expr_truthiness(schema: &ArrowSchema, expr: &Expr) -> Truthiness { + match expr { + Expr::Literal(v, _) => match v { + ScalarValue::Boolean(Some(true)) => Truthiness::AlwaysTrue, + ScalarValue::Boolean(Some(false)) => Truthiness::AlwaysFalse, + _ if v.is_null() => Truthiness::AlwaysFalse, + _ => Truthiness::Unknown, + }, + Expr::Column(_) if expr_is_always_null(schema, expr) => Truthiness::AlwaysFalse, + Expr::IsNull(e) if expr_is_always_null(schema, e.as_ref()) => Truthiness::AlwaysTrue, + Expr::IsNotNull(e) if expr_is_always_null(schema, e.as_ref()) => Truthiness::AlwaysFalse, + Expr::Not(e) => expr_truthiness(schema, e).not(), + Expr::BinaryExpr(b) if b.op == Operator::And => { + expr_truthiness(schema, &b.left).and(expr_truthiness(schema, &b.right)) + } + Expr::BinaryExpr(b) if b.op == Operator::Or => { + expr_truthiness(schema, &b.left).or(expr_truthiness(schema, &b.right)) + } + Expr::BinaryExpr(b) + if is_comparison_op(b.op) + && (expr_is_always_null(schema, &b.left) + || expr_is_always_null(schema, &b.right)) => + { + Truthiness::AlwaysFalse + } + Expr::Between(b) if expr_is_always_null(schema, &b.expr) => Truthiness::AlwaysFalse, + Expr::InList(inlist) if expr_is_always_null(schema, &inlist.expr) => { + Truthiness::AlwaysFalse + 
} + _ => Truthiness::Unknown, + } +} + +fn expr_is_always_null(schema: &ArrowSchema, expr: &Expr) -> bool { + match expr { + Expr::Literal(v, _) => v.is_null(), + Expr::Column(c) => !schema.fields().iter().any(|f| f.name() == &c.name), + Expr::Cast(c) => expr_is_always_null(schema, &c.expr), + Expr::TryCast(c) => expr_is_always_null(schema, &c.expr), + _ => false, + } +} + +pub fn is_comparison_op(op: Operator) -> bool { + matches!( + op, + Operator::Eq + | Operator::NotEq + | Operator::Lt + | Operator::LtEq + | Operator::Gt + | Operator::GtEq + ) +} + +/// Convert boolean expression into a conservative CDF (OR-of-ANDs) form. +/// +/// The returned structure is `Vec>`, where outer `Vec` is OR, and +/// inner `Vec` is AND of atomic predicates. +pub fn expr_to_cdf(expr: &Expr) -> Vec> { + match expr { + Expr::BinaryExpr(binary) if binary.op == Operator::And => { + let left = expr_to_cdf(&binary.left); + let right = expr_to_cdf(&binary.right); + let mut out = Vec::new(); + for l in left { + for r in &right { + let mut clause = Vec::with_capacity(l.len() + r.len()); + clause.extend(l.iter().cloned()); + clause.extend(r.iter().cloned()); + out.push(clause); + } + } + out + } + Expr::BinaryExpr(binary) if binary.op == Operator::Or => { + let mut left = expr_to_cdf(&binary.left); + let mut right = expr_to_cdf(&binary.right); + left.append(&mut right); + left + } + _ => vec![vec![expr.clone()]], + } +} + +/// Sanity check of table spec. +/// +/// 1. Schema field must have id. +/// 2. Partition field source_ids must not be empty. +/// 3. Partition field source id must exist in schema. +/// 4. Exactly one of transform or expression must be set. +/// 5. Partition field signature should be unique. +/// 6. Partition field id should be unique. +pub fn check_table_spec_consistency( + schema: &ArrowSchema, + partition_spec: &Vec, +) -> lance_core::Result<()> { + // Error if any field doesn't have id. 
+ schema.max_id(false)?; + + // Sanity check + let mut existed_fields = HashSet::new(); + let mut existed_ids = HashSet::new(); + for f in partition_spec { + if f.source_ids.is_empty() { + return Err(Error::invalid_input( + "partition field source_ids must not be empty", + )); + } + for id in &f.source_ids { + if *id < 0 || schema.path_and_field_by_id(*id)?.is_none() { + return Err(Error::invalid_input(format!( + "partition source id {} not found in schema", + *id + ))); + } + } + + let has_transform = f.transform.is_some(); + let has_expression = f + .expression + .as_ref() + .map(|e| !e.trim().is_empty()) + .unwrap_or(false); + if has_transform == has_expression { + return Err(Error::invalid_input( + "Exactly one of transform or expression must be set", + )); + } + if !existed_fields.insert(f.signature()) || !existed_ids.insert(f.field_id.clone()) { + return Err(Error::invalid_input( + "Partition fields signature and field_id should be unique.", + )); + } + } + + Ok(()) +} + +/// Ensure all fields have id by setting id to field if it doesn't have one. +pub fn ensure_all_schema_field_have_id(schema: ArrowSchema) -> lance_core::Result { + // Find next id. + let max_id = schema.max_id(true)?.unwrap_or(-1); + let mut next_id = max_id + 1; + + // Rebuild schema and set id + let mut out: Vec = Vec::with_capacity(schema.fields().len()); + for f in schema.fields().iter() { + out.push(set_field_id(f.as_ref().clone(), &mut next_id)?); + } + Ok(ArrowSchema::new_with_metadata( + out, + schema.metadata().clone(), + )) +} + +/// Set field if for input arrow field. 
+fn set_field_id(field: ArrowField, next_id: &mut i32) -> lance_core::Result { + // Set id to field + let mut field = if field.metadata().get(LANCE_FIELD_ID_META_KEY).is_some() { + field + } else { + let id = *next_id; + *next_id += 1; + + let mut md = field.metadata().clone(); + md.insert(LANCE_FIELD_ID_META_KEY.to_string(), id.to_string()); + + field.with_metadata(md) + }; + + // Set id to children + let new_dt = match field.data_type() { + DataType::Struct(fields) => { + let mut out: Vec = Vec::with_capacity(fields.len()); + for child in fields.iter() { + let child = set_field_id(child.as_ref().clone(), next_id)?; + out.push(Arc::new(child)); + } + DataType::Struct(Fields::from(out)) + } + DataType::List(child) => { + let child = set_field_id(child.as_ref().clone(), next_id)?; + DataType::List(Arc::new(child)) + } + DataType::LargeList(child) => { + let child = set_field_id(child.as_ref().clone(), next_id)?; + DataType::LargeList(Arc::new(child)) + } + DataType::FixedSizeList(child, n) => { + let child = set_field_id(child.as_ref().clone(), next_id)?; + DataType::FixedSizeList(Arc::new(child), *n) + } + DataType::Map(child, sorted) => { + let child = set_field_id(child.as_ref().clone(), next_id)?; + DataType::Map(Arc::new(child), *sorted) + } + other => other.clone(), + }; + field = field.with_data_type(new_dt); + + Ok(field) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::{Field as ArrowField, Schema as ArrowSchema}; + use arrow_schema::DataType; + use lance::deps::datafusion::prelude::{col, lit}; + + #[test] + fn test_expr_truthiness() { + let schema_empty = ArrowSchema::new(Vec::::new()); + let schema_with_a = ArrowSchema::new(vec![ArrowField::new("a", DataType::Int32, true)]); + + // Expr::Literal + assert_eq!( + expr_truthiness(&schema_empty, &lit(true)), + Truthiness::AlwaysTrue + ); + assert_eq!( + expr_truthiness(&schema_empty, &lit(false)), + Truthiness::AlwaysFalse + ); + assert_eq!( + expr_truthiness( + &schema_empty, + 
&Expr::Literal(ScalarValue::Int32(None), None) + ), + Truthiness::AlwaysFalse + ); + assert_eq!( + expr_truthiness( + &schema_empty, + &Expr::Literal(ScalarValue::Int32(Some(1)), None) + ), + Truthiness::Unknown + ); + + // Expr::Column (missing in schema) + assert_eq!( + expr_truthiness(&schema_with_a, &col("missing")), + Truthiness::AlwaysFalse + ); + + // Expr::IsNull / Expr::IsNotNull + assert_eq!( + expr_truthiness(&schema_with_a, &Expr::IsNull(Box::new(col("missing")))), + Truthiness::AlwaysTrue + ); + assert_eq!( + expr_truthiness(&schema_with_a, &Expr::IsNotNull(Box::new(col("missing")))), + Truthiness::AlwaysFalse + ); + + // Expr::Not + assert_eq!( + expr_truthiness(&schema_empty, &Expr::Not(Box::new(lit(true)))), + Truthiness::AlwaysFalse + ); + + // Expr::BinaryExpr + assert_eq!( + expr_truthiness(&schema_empty, &lit(true).and(lit(false))), + Truthiness::AlwaysFalse + ); + assert_eq!( + expr_truthiness(&schema_empty, &lit(false).or(lit(false))), + Truthiness::AlwaysFalse + ); + + // Expr::BinaryExpr + assert_eq!( + expr_truthiness(&schema_with_a, &col("missing").eq(lit(1))), + Truthiness::AlwaysFalse + ); + + // Expr::Between + assert_eq!( + expr_truthiness(&schema_with_a, &col("missing").between(lit(1), lit(2))), + Truthiness::AlwaysFalse + ); + + // Expr::InList + assert_eq!( + expr_truthiness( + &schema_with_a, + &col("missing").in_list(vec![lit(1), lit(2)], false) + ), + Truthiness::AlwaysFalse + ); + + // Default case: unknown + assert_eq!( + expr_truthiness(&schema_with_a, &col("a")), + Truthiness::Unknown + ); + } + + #[test] + fn test_is_cdf_always_false() { + let schema_with_a = ArrowSchema::new(vec![ArrowField::new("a", DataType::Int32, true)]); + + // Always false: every clause is provably false. + let cdf_always_false = vec![vec![lit(false)], vec![col("missing")]]; + assert!(is_cdf_always_false(&schema_with_a, &cdf_always_false)); + + // Not always false: at least one clause is not provably false. 
+ let cdf_not_always_false = vec![vec![lit(false)], vec![lit(true)]]; + assert!(!is_cdf_always_false(&schema_with_a, &cdf_not_always_false)); + } +} diff --git a/rust/lance-namespace/src/schema.rs b/rust/lance-namespace/src/schema.rs index 3f44847bcd4..b2a5953786c 100644 --- a/rust/lance-namespace/src/schema.rs +++ b/rust/lance-namespace/src/schema.rs @@ -47,7 +47,7 @@ fn arrow_field_to_json(arrow_field: &Field) -> Result { } /// Convert Arrow DataType to JsonArrowDataType -fn arrow_type_to_json(data_type: &DataType) -> Result { +pub fn arrow_type_to_json(data_type: &DataType) -> Result { match data_type { // Primitive types DataType::Null => Ok(JsonArrowDataType::new("null".to_string())), @@ -261,10 +261,15 @@ pub fn convert_json_arrow_type(json_type: &JsonArrowDataType) -> Result Ok(DataType::UInt32), "int64" => Ok(DataType::Int64), "uint64" => Ok(DataType::UInt64), + "float16" => Ok(DataType::Float16), "float32" => Ok(DataType::Float32), "float64" => Ok(DataType::Float64), + "date32" => Ok(DataType::Date32), + "date64" => Ok(DataType::Date64), "utf8" => Ok(DataType::Utf8), + "large_utf8" => Ok(DataType::LargeUtf8), "binary" => Ok(DataType::Binary), + "large_binary" => Ok(DataType::LargeBinary), _ => Err(Error::namespace(format!( "Unsupported Arrow type: {}", type_name