From 641e0a94f19b3dd5dd015cae8234748ca67fc7e6 Mon Sep 17 00:00:00 2001 From: BubbleCal Date: Mon, 12 Jan 2026 19:22:04 +0800 Subject: [PATCH 1/8] perf: tighten WAND block score upper bound (#5668) We made some changes that cause the upper bound lose, tighten it to make it 15x faster --- rust/lance-index/src/scalar/inverted/wand.rs | 23 ++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs index ecfb93679cb..25c756950f3 100644 --- a/rust/lance-index/src/scalar/inverted/wand.rs +++ b/rust/lance-index/src/scalar/inverted/wand.rs @@ -148,7 +148,7 @@ impl PostingIterator { num_doc: usize, ) -> Self { let approximate_upper_bound = match list.max_score() { - Some(max_score) => max_score, // the index doesn't include the full BM25 upper bound at indexing time, so we need to multiply it here + Some(max_score) => max_score, None => idf(list.len(), num_doc) * (K1 + 1.0), }; @@ -265,7 +265,7 @@ impl PostingIterator { #[inline] fn block_max_score(&self) -> f32 { match self.list { - PostingList::Compressed(ref list) => list.block_max_score(self.block_idx) * (K1 + 1.0), + PostingList::Compressed(ref list) => list.block_max_score(self.block_idx), PostingList::Plain(_) => self.approximate_upper_bound, } } @@ -978,4 +978,23 @@ mod tests { assert!(result.is_ok()); } + + #[test] + fn test_block_max_score_matches_stored_value() { + let doc_ids = vec![0_u32]; + let block_max_scores = vec![0.7_f32]; + let posting_list = generate_posting_list(doc_ids, 0.7, Some(block_max_scores), true); + let expected = match &posting_list { + PostingList::Compressed(list) => list.block_max_score(0), + PostingList::Plain(_) => unreachable!("expected compressed posting list"), + }; + + let posting = PostingIterator::new(String::from("test"), 0, 0, posting_list, 1); + + let actual = posting.block_max_score(); + assert!( + (actual - expected).abs() < 1e-6, + "block max score should match 
stored value" + ); + } } From 0e03e35a563d8cf9ae68c3153d8ecad2a374aa7f Mon Sep 17 00:00:00 2001 From: Will Jones Date: Fri, 16 Jan 2026 18:07:55 -0800 Subject: [PATCH 2/8] perf: use LRU cache for session contexts in get_session_context (#5736) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We have a performance issue when `LANCE_MEM_POOL_SIZE`: when this is set (to something different than the default), the current cache always misses here: https://github.com/lance-format/lance/blob/445dd5bfe7b0111fd8e146163393e825e5995679/rust/lance-datafusion/src/exec.rs#L398-L410 - Replace static `LazyLock` session contexts with LRU cache (size 4) - Cache key uses resolved configuration values (after env var lookup) - Fixes cache misses when `LANCE_MEM_POOL_SIZE` env var is set - [x] `cargo test -p lance-datafusion` passes - [x] `cargo clippy -p lance-datafusion` passes 🤖 Generated with [Claude Code](https://claude.com/claude-code) --------- Co-authored-by: Claude Opus 4.5 --- rust/lance-datafusion/src/exec.rs | 192 +++++++++++++++++++++++++++--- 1 file changed, 176 insertions(+), 16 deletions(-) diff --git a/rust/lance-datafusion/src/exec.rs b/rust/lance-datafusion/src/exec.rs index 1bac7466700..8cf238f90bc 100644 --- a/rust/lance-datafusion/src/exec.rs +++ b/rust/lance-datafusion/src/exec.rs @@ -6,7 +6,7 @@ use std::{ collections::HashMap, fmt::{self, Formatter}, - sync::{Arc, LazyLock, Mutex}, + sync::{Arc, Mutex, OnceLock}, time::Duration, }; @@ -359,26 +359,78 @@ pub fn new_session_context(options: &LanceExecutionOptions) -> SessionContext { ctx } -static DEFAULT_SESSION_CONTEXT: LazyLock = - LazyLock::new(|| new_session_context(&LanceExecutionOptions::default())); +/// Cache key for session contexts based on resolved configuration values. 
+#[derive(Clone, Debug, PartialEq, Eq, Hash)] +struct SessionContextCacheKey { + mem_pool_size: u64, + target_partition: Option, + use_spilling: bool, +} + +impl SessionContextCacheKey { + fn from_options(options: &LanceExecutionOptions) -> Self { + Self { + mem_pool_size: options.mem_pool_size(), + target_partition: options.target_partition, + use_spilling: options.use_spilling(), + } + } +} -static DEFAULT_SESSION_CONTEXT_WITH_SPILLING: LazyLock = LazyLock::new(|| { - new_session_context(&LanceExecutionOptions { - use_spilling: true, - ..Default::default() +struct CachedSessionContext { + context: SessionContext, + last_access: std::time::Instant, +} + +fn get_session_cache() -> &'static Mutex> { + static SESSION_CACHE: OnceLock>> = + OnceLock::new(); + SESSION_CACHE.get_or_init(|| Mutex::new(HashMap::new())) +} + +fn get_max_cache_size() -> usize { + const DEFAULT_CACHE_SIZE: usize = 4; + static MAX_CACHE_SIZE: OnceLock = OnceLock::new(); + *MAX_CACHE_SIZE.get_or_init(|| { + std::env::var("LANCE_SESSION_CACHE_SIZE") + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(DEFAULT_CACHE_SIZE) }) -}); +} pub fn get_session_context(options: &LanceExecutionOptions) -> SessionContext { - if options.mem_pool_size() == DEFAULT_LANCE_MEM_POOL_SIZE && options.target_partition.is_none() - { - return if options.use_spilling() { - DEFAULT_SESSION_CONTEXT_WITH_SPILLING.clone() - } else { - DEFAULT_SESSION_CONTEXT.clone() - }; + let key = SessionContextCacheKey::from_options(options); + let mut cache = get_session_cache() + .lock() + .unwrap_or_else(|e| e.into_inner()); + + // If key exists, update access time and return + if let Some(entry) = cache.get_mut(&key) { + entry.last_access = std::time::Instant::now(); + return entry.context.clone(); + } + + // Evict least recently used entry if cache is full + if cache.len() >= get_max_cache_size() { + if let Some(lru_key) = cache + .iter() + .min_by_key(|(_, v)| v.last_access) + .map(|(k, _)| k.clone()) + { + cache.remove(&lru_key); 
+ } } - new_session_context(options) + + let context = new_session_context(options); + cache.insert( + key, + CachedSessionContext { + context: context.clone(), + last_access: std::time::Instant::now(), + }, + ); + context } fn get_task_context( @@ -791,3 +843,111 @@ impl ExecutionPlan for StrictBatchSizeExec { true } } + +#[cfg(test)] +mod tests { + use super::*; + + // Serialize cache tests since they share global state + static CACHE_TEST_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(()); + + #[test] + fn test_session_context_cache() { + let _lock = CACHE_TEST_LOCK.lock().unwrap(); + let cache = get_session_cache(); + + // Clear any existing entries from other tests + cache.lock().unwrap().clear(); + + // Create first session with default options + let opts1 = LanceExecutionOptions::default(); + let _ctx1 = get_session_context(&opts1); + + { + let cache_guard = cache.lock().unwrap(); + assert_eq!(cache_guard.len(), 1); + } + + // Same options should reuse cached session (no new entry) + let _ctx1_again = get_session_context(&opts1); + { + let cache_guard = cache.lock().unwrap(); + assert_eq!(cache_guard.len(), 1); + } + + // Different options should create new entry + let opts2 = LanceExecutionOptions { + use_spilling: true, + ..Default::default() + }; + let _ctx2 = get_session_context(&opts2); + { + let cache_guard = cache.lock().unwrap(); + assert_eq!(cache_guard.len(), 2); + } + } + + #[test] + fn test_session_context_cache_lru_eviction() { + let _lock = CACHE_TEST_LOCK.lock().unwrap(); + let cache = get_session_cache(); + + // Clear any existing entries from other tests + cache.lock().unwrap().clear(); + + // Create 4 different configurations to fill the cache + let configs: Vec = (0..4) + .map(|i| LanceExecutionOptions { + mem_pool_size: Some((i + 1) as u64 * 1024 * 1024), + ..Default::default() + }) + .collect(); + + for config in &configs { + let _ctx = get_session_context(config); + } + + { + let cache_guard = cache.lock().unwrap(); + 
assert_eq!(cache_guard.len(), 4); + } + + // Access config[0] to make it more recently used than config[1] + // (config[0] was inserted first, so without this access it would be evicted) + std::thread::sleep(std::time::Duration::from_millis(1)); + let _ctx = get_session_context(&configs[0]); + + // Add a 5th configuration - should evict config[1] (now least recently used) + let opts5 = LanceExecutionOptions { + mem_pool_size: Some(5 * 1024 * 1024), + ..Default::default() + }; + let _ctx5 = get_session_context(&opts5); + + { + let cache_guard = cache.lock().unwrap(); + assert_eq!(cache_guard.len(), 4); + + // config[0] should still be present (was accessed recently) + let key0 = SessionContextCacheKey::from_options(&configs[0]); + assert!( + cache_guard.contains_key(&key0), + "config[0] should still be cached after recent access" + ); + + // config[1] should be evicted (was least recently used) + let key1 = SessionContextCacheKey::from_options(&configs[1]); + assert!( + !cache_guard.contains_key(&key1), + "config[1] should have been evicted" + ); + + // New config should be present + let key5 = SessionContextCacheKey::from_options(&opts5); + assert!( + cache_guard.contains_key(&key5), + "new config should be cached" + ); + } + } +} From e5bdd66fc4b1349110a72af78cae1cab20bfd32d Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 20 Jan 2026 20:56:52 -0800 Subject: [PATCH 3/8] ci: fix known CI failures from main branch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Applies fixes for: 1. cargo-deny: add ignores for RUSTSEC-2025-0141 (bincode) and RUSTSEC-2026-0002 (lru) 2. torch.jit.script deprecation: migrate to torch.compile, add MSVC setup for Windows, add filterwarnings for PyTorch inductor 3. 
Java CI out of disk space: use warp-ubuntu-latest-x64-4x runner 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- .github/workflows/java.yml | 2 +- .github/workflows/run_tests/action.yml | 3 +++ deny.toml | 4 +++- python/pyproject.toml | 8 ++++++-- python/python/lance/torch/distance.py | 12 ++++++------ 5 files changed, 19 insertions(+), 10 deletions(-) diff --git a/.github/workflows/java.yml b/.github/workflows/java.yml index c702df258f8..fcebe39571b 100644 --- a/.github/workflows/java.yml +++ b/.github/workflows/java.yml @@ -47,7 +47,7 @@ jobs: run: cargo clippy --all-targets -- -D warnings build-and-test-java: - runs-on: ubuntu-24.04 + runs-on: warp-ubuntu-latest-x64-4x timeout-minutes: 60 strategy: matrix: diff --git a/.github/workflows/run_tests/action.yml b/.github/workflows/run_tests/action.yml index 14c4b3d6f46..1800ce614fe 100644 --- a/.github/workflows/run_tests/action.yml +++ b/.github/workflows/run_tests/action.yml @@ -12,6 +12,9 @@ inputs: runs: using: "composite" steps: + - name: Setup MSVC for torch.compile + if: runner.os == 'Windows' + uses: ilammy/msvc-dev-cmd@v1 - name: Install dependencies working-directory: python shell: bash diff --git a/deny.toml b/deny.toml index ba5eed05786..cc7cd6d6023 100644 --- a/deny.toml +++ b/deny.toml @@ -85,7 +85,9 @@ ignore = [ { id = "RUSTSEC-2024-0436", reason = "`paste` is used by datafusion" }, { id = "RUSTSEC-2023-0071", reason = "`rsa` is used by opendal via reqsign" }, { id = "RUSTSEC-2025-0119", reason = "`number_prefix` used by hf-hub in examples" }, - { id = "RUSTSEC-2025-0134", reason = "`rustls-pemfile` unmaintained; awaiting upstream object_store/hyper-rustls migration to rustls-pki-types" } + { id = "RUSTSEC-2025-0134", reason = "`rustls-pemfile` unmaintained; awaiting upstream object_store/hyper-rustls migration to rustls-pki-types" }, + { id = "RUSTSEC-2025-0141", reason = "`bincode` is unmaintained and used by tantivy"}, + { id = "RUSTSEC-2026-0002", 
reason = "`lru` is used by tantivy and aws-sdk-s3"}, ] # If this is true, then cargo deny will use the git executable to fetch advisory database. # If this is false, then it uses a built-in git library. diff --git a/python/pyproject.toml b/python/pyproject.toml index 60cf4222978..9bd8cb86489 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -64,7 +64,7 @@ tests = [ ] dev = ["ruff==0.4.1", "pyright"] benchmarks = ["pytest-benchmark"] -torch = ["torch"] +torch = ["torch>=2.0"] geo = [ "geoarrow-rust-core", "geoarrow-rust-io", @@ -112,9 +112,13 @@ filterwarnings = [ 'ignore:.*datetime\.datetime\.utcnow\(\) is deprecated.*:DeprecationWarning', # Pandas 2.2 on Python 2.12 'ignore:.*datetime\.datetime\.utcfromtimestamp\(\) is deprecated.*:DeprecationWarning', - # Pytorch 2.2 on Python 2.12 + # Pytorch 2.2 on Python 3.12 'ignore:.*is deprecated and will be removed in Python 3\.14.*:DeprecationWarning', 'ignore:.*The distutils package is deprecated.*:DeprecationWarning', + # Pytorch inductor uses deprecated load_module() in its code cache + 'ignore:.*the load_module\(\) method is deprecated.*:DeprecationWarning', + # Pytorch uses deprecated jit.script_method internally (torch/utils/mkldnn.py) + 'ignore:.*torch\.jit\.script_method.*is deprecated.*:DeprecationWarning', # TensorFlow/Keras import can emit NumPy deprecation FutureWarnings in some environments. # Keep FutureWarnings as errors generally, but ignore this known-noisy import-time warning. 
'ignore:.*np\.object.*:FutureWarning', diff --git a/python/python/lance/torch/distance.py b/python/python/lance/torch/distance.py index 06388210544..3c9becfd749 100644 --- a/python/python/lance/torch/distance.py +++ b/python/python/lance/torch/distance.py @@ -16,7 +16,7 @@ ] -@torch.jit.script +@torch.compile def _pairwise_cosine( x: torch.Tensor, y: torch.Tensor, y2: torch.Tensor ) -> torch.Tensor: @@ -49,7 +49,7 @@ def pairwise_cosine( return _pairwise_cosine(x, y, y2) -@torch.jit.script +@torch.compile def _cosine_distance( vectors: torch.Tensor, centroids: torch.Tensor, split_size: int ) -> Tuple[torch.Tensor, torch.Tensor]: @@ -114,7 +114,7 @@ def cosine_distance( raise RuntimeError("Cosine distance out of memory") -@torch.jit.script +@torch.compile def argmin_l2(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: x = x.reshape(1, x.shape[0], -1) y = y.reshape(1, y.shape[0], -1) @@ -125,7 +125,7 @@ def argmin_l2(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Ten return min_dists.pow(2), idx -@torch.jit.script +@torch.compile def pairwise_l2( x: torch.Tensor, y: torch.Tensor, y2: Optional[torch.Tensor] = None ) -> torch.Tensor: @@ -170,7 +170,7 @@ def pairwise_l2( return dists.type(origin_dtype) -@torch.jit.script +@torch.compile def _l2_distance( x: torch.Tensor, y: torch.Tensor, @@ -237,7 +237,7 @@ def l2_distance( raise RuntimeError("L2 distance out of memory") -@torch.jit.script +@torch.compile def dot_distance(x: torch.Tensor, y: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: """Pair-wise dot distance between two 2-D Tensors. 
From 12c894635cdca8335b2993d40bd562ded69b2ec4 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 20 Jan 2026 21:05:36 -0800 Subject: [PATCH 4/8] feat: upgrade lance-namespace to 0.3.1 and add missing apis (#5457) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Related to refactorings in the namespace spec: https://github.com/lancedb/sophon/pull/4783 and https://github.com/lance-format/lance-namespace/pull/278 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- Cargo.lock | 4 +- Cargo.toml | 2 +- java/lance-jni/Cargo.lock | 4 +- java/pom.xml | 4 +- python/Cargo.lock | 4 +- python/pyproject.toml | 2 +- .../src/object_store/storage_options.rs | 1 + rust/lance-namespace-impls/src/dir.rs | 68 +- .../lance-namespace-impls/src/dir/manifest.rs | 45 +- rust/lance-namespace-impls/src/rest.rs | 335 +++++++-- .../lance-namespace-impls/src/rest_adapter.rs | 636 ++++++++++++++++-- rust/lance-namespace/src/namespace.rs | 223 +++++- rust/lance/src/dataset.rs | 1 + rust/lance/src/dataset/builder.rs | 1 + 14 files changed, 1138 insertions(+), 192 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e82504e3f32..e834924da59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4918,9 +4918,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.0.18" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ea349999bcda4eea53fc05d334b3775ec314761e6a706555c777d7a29b18d19" +checksum = "b748e89a3a0e5d9fb1b51e4382f783f8aa6b620d755012d4856180968014e619" dependencies = [ "reqwest", "serde", diff --git a/Cargo.toml b/Cargo.toml index 1b8d05eff6f..6cc4aa503a0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,7 +63,7 @@ lance-io = { version = "=1.0.3-beta.0", path = "./rust/lance-io", default-featur lance-linalg = { version = "=1.0.3-beta.0", path = "./rust/lance-linalg" } lance-namespace = { version = "=1.0.3-beta.0", path = 
"./rust/lance-namespace" } lance-namespace-impls = { version = "=1.0.3-beta.0", path = "./rust/lance-namespace-impls" } -lance-namespace-reqwest-client = "0.0.18" +lance-namespace-reqwest-client = "0.3.1" lance-table = { version = "=1.0.3-beta.0", path = "./rust/lance-table" } lance-test-macros = { version = "=1.0.3-beta.0", path = "./rust/lance-test-macros" } lance-testing = { version = "=1.0.3-beta.0", path = "./rust/lance-testing" } diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index e6406cec933..935b414e845 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -3797,9 +3797,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.0.18" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ea349999bcda4eea53fc05d334b3775ec314761e6a706555c777d7a29b18d19" +checksum = "b748e89a3a0e5d9fb1b51e4382f783f8aa6b620d755012d4856180968014e619" dependencies = [ "reqwest", "serde", diff --git a/java/pom.xml b/java/pom.xml index ad757fa8a10..4a24d461f29 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -108,12 +108,12 @@ org.lance lance-namespace-core - 0.2.1 + 0.3.1 org.lance lance-namespace-apache-client - 0.2.1 + 0.3.1 com.fasterxml.jackson.core diff --git a/python/Cargo.lock b/python/Cargo.lock index bc52b611093..99a6979b8ed 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4268,9 +4268,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.0.18" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ea349999bcda4eea53fc05d334b3775ec314761e6a706555c777d7a29b18d19" +checksum = "b748e89a3a0e5d9fb1b51e4382f783f8aa6b620d755012d4856180968014e619" dependencies = [ "reqwest", "serde", diff --git a/python/pyproject.toml b/python/pyproject.toml index 9bd8cb86489..3ad7ccef8f9 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "pylance" dynamic 
= ["version"] -dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.2.1"] +dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.3.1"] description = "python wrapper for Lance columnar format" authors = [{ name = "Lance Devs", email = "dev@lance.org" }] license = { file = "LICENSE" } diff --git a/rust/lance-io/src/object_store/storage_options.rs b/rust/lance-io/src/object_store/storage_options.rs index 9405f95d70c..f809df8d1d3 100644 --- a/rust/lance-io/src/object_store/storage_options.rs +++ b/rust/lance-io/src/object_store/storage_options.rs @@ -114,6 +114,7 @@ impl StorageOptionsProvider for LanceNamespaceStorageOptionsProvider { let request = DescribeTableRequest { id: Some(self.table_id.clone()), version: None, + with_table_uri: None, }; let response = self diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index fd5a63a0848..fdb4370f6ab 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -777,11 +777,20 @@ impl LanceNamespace for DirectoryNamespace { let arrow_schema: arrow_schema::Schema = lance_schema.into(); let json_schema = arrow_schema_to_json(&arrow_schema)?; Ok(DescribeTableResponse { + table: Some(table_name), + namespace: request.id.as_ref().map(|id| { + if id.len() > 1 { + id[..id.len() - 1].to_vec() + } else { + vec![] + } + }), version: Some(version as i64), - location: Some(table_uri), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), schema: Some(Box::new(json_schema)), - properties: None, storage_options: self.storage_options.clone(), + stats: None, }) } Err(err) => { @@ -793,11 +802,20 @@ impl LanceNamespace for DirectoryNamespace { .unwrap_or(false) { Ok(DescribeTableResponse { + table: Some(table_name), + namespace: request.id.as_ref().map(|id| { + if id.len() > 1 { + id[..id.len() - 1].to_vec() + } else { + vec![] + } + }), version: None, - location: Some(table_uri), + location: Some(table_uri.clone()), + 
table_uri: Some(table_uri), schema: None, - properties: None, storage_options: self.storage_options.clone(), + stats: None, }) } else { Err(Error::Namespace { @@ -886,21 +904,6 @@ impl LanceNamespace for DirectoryNamespace { }); } - // Validate location if provided - if let Some(location) = &request.location { - let location = location.trim_end_matches('/'); - if location != table_uri { - return Err(Error::Namespace { - source: format!( - "Cannot create table {} at location {}, must be at location {}", - table_name, location, table_uri - ) - .into(), - location: snafu::location!(), - }); - } - } - // Parse the Arrow IPC stream from request_data let cursor = Cursor::new(request_data.to_vec()); let stream_reader = StreamReader::try_new(cursor, None).map_err(|e| Error::Namespace { @@ -948,9 +951,9 @@ impl LanceNamespace for DirectoryNamespace { })?; Ok(CreateTableResponse { + transaction_id: None, version: Some(1), location: Some(table_uri), - properties: None, storage_options: self.storage_options.clone(), }) } @@ -1007,6 +1010,7 @@ impl LanceNamespace for DirectoryNamespace { })?; Ok(CreateEmptyTableResponse { + transaction_id: None, location: Some(table_uri), properties: None, storage_options: self.storage_options.clone(), @@ -1188,28 +1192,6 @@ mod tests { ); } - #[tokio::test] - async fn test_create_table_with_wrong_location() { - let (namespace, _temp_dir) = create_test_namespace().await; - - // Create test IPC data - let schema = create_test_schema(); - let ipc_data = create_test_ipc_data(&schema); - - let mut request = CreateTableRequest::new(); - request.id = Some(vec!["test_table".to_string()]); - request.location = Some("/wrong/path/table.lance".to_string()); - - let result = namespace - .create_table(request, bytes::Bytes::from(ipc_data)) - .await; - assert!(result.is_err()); - assert!(result - .unwrap_err() - .to_string() - .contains("must be at location")); - } - #[tokio::test] async fn test_list_tables() { let (namespace, _temp_dir) = 
create_test_namespace().await; @@ -2360,7 +2342,7 @@ mod tests { register_req.id = Some(vec!["registered_table".to_string()]); let response = namespace.register_table(register_req).await.unwrap(); - assert_eq!(response.location, "external_table.lance"); + assert_eq!(response.location, Some("external_table.lance".to_string())); // Verify table exists in namespace let mut exists_req = TableExistsRequest::new(); diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index ab6bb6fa78a..4791bbb9df5 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -1078,6 +1078,14 @@ impl LanceNamespace for ManifestNamespace { let object_id = Self::str_object_id(table_id); let table_info = self.query_manifest_for_table(&object_id).await?; + // Extract table name and namespace from table_id + let table_name = table_id.last().cloned().unwrap_or_default(); + let namespace_id: Vec = if table_id.len() > 1 { + table_id[..table_id.len() - 1].to_vec() + } else { + vec![] + }; + match table_info { Some(info) => { // Construct full URI from relative location @@ -1097,21 +1105,27 @@ impl LanceNamespace for ManifestNamespace { let json_schema = arrow_schema_to_json(&arrow_schema)?; Ok(DescribeTableResponse { + table: Some(table_name.clone()), + namespace: Some(namespace_id.clone()), version: Some(version as i64), - location: Some(table_uri), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), schema: Some(Box::new(json_schema)), - properties: None, storage_options: self.storage_options.clone(), + stats: None, }) } Err(_) => { // If dataset can't be opened (e.g., empty table), return minimal info Ok(DescribeTableResponse { + table: Some(table_name), + namespace: Some(namespace_id), version: None, - location: Some(table_uri), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), schema: None, - properties: None, storage_options: 
self.storage_options.clone(), + stats: None, }) } } @@ -1197,21 +1211,6 @@ impl LanceNamespace for ManifestNamespace { }); } - // Validate location if provided - if let Some(location) = &request.location { - let location = location.trim_end_matches('/'); - if location != table_uri { - return Err(Error::Namespace { - source: format!( - "Cannot create table {} at location {}, must be at location {}", - table_name, location, table_uri - ) - .into(), - location: location!(), - }); - } - } - // Write the data using Lance Dataset let cursor = Cursor::new(data.to_vec()); let stream_reader = StreamReader::try_new(cursor, None) @@ -1250,9 +1249,9 @@ impl LanceNamespace for ManifestNamespace { .await?; Ok(CreateTableResponse { + transaction_id: None, version: Some(1), location: Some(table_uri), - properties: None, storage_options: self.storage_options.clone(), }) } @@ -1440,6 +1439,7 @@ impl LanceNamespace for ManifestNamespace { .await?; Ok(CreateNamespaceResponse { + transaction_id: None, properties: request.properties, }) } @@ -1622,6 +1622,7 @@ impl LanceNamespace for ManifestNamespace { ); Ok(CreateEmptyTableResponse { + transaction_id: None, location: Some(table_uri), properties: None, storage_options: self.storage_options.clone(), @@ -1697,7 +1698,8 @@ impl LanceNamespace for ManifestNamespace { .await?; Ok(RegisterTableResponse { - location, + transaction_id: None, + location: Some(location), properties: None, }) } @@ -1739,6 +1741,7 @@ impl LanceNamespace for ManifestNamespace { }; Ok(DeregisterTableResponse { + transaction_id: None, id: request.id.clone(), location: Some(table_uri), properties: None, diff --git a/rust/lance-namespace-impls/src/rest.rs b/rust/lance-namespace-impls/src/rest.rs index 1f7ee341d26..3b5d0650659 100644 --- a/rust/lance-namespace-impls/src/rest.rs +++ b/rust/lance-namespace-impls/src/rest.rs @@ -9,22 +9,32 @@ use async_trait::async_trait; use bytes::Bytes; use lance_namespace::apis::{ - configuration::Configuration, namespace_api, 
table_api, transaction_api, + configuration::Configuration, namespace_api, table_api, tag_api, transaction_api, }; use lance_namespace::models::{ - AlterTransactionRequest, AlterTransactionResponse, CountTableRowsRequest, - CreateEmptyTableRequest, CreateEmptyTableResponse, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, - CreateTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, DeregisterTableRequest, - DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, - DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, DescribeTableRequest, - DescribeTableResponse, DescribeTransactionRequest, DescribeTransactionResponse, - DropNamespaceRequest, DropNamespaceResponse, DropTableRequest, DropTableResponse, - InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, - ListTableIndicesRequest, ListTableIndicesResponse, ListTablesRequest, ListTablesResponse, + AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, + AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, + AlterTransactionRequest, AlterTransactionResponse, AnalyzeTableQueryPlanRequest, + CountTableRowsRequest, CreateEmptyTableRequest, CreateEmptyTableResponse, + CreateNamespaceRequest, CreateNamespaceResponse, CreateTableIndexRequest, + CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, + CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, + DeleteFromTableRequest, DeleteFromTableResponse, DeleteTableTagRequest, DeleteTableTagResponse, + DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, + DescribeNamespaceResponse, DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, + DescribeTableRequest, DescribeTableResponse, DescribeTransactionRequest, + DescribeTransactionResponse, DropNamespaceRequest, 
DropNamespaceResponse, + DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, DropTableResponse, + ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, + GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, + InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, + ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, - QueryTableRequest, RegisterTableRequest, RegisterTableResponse, TableExistsRequest, - UpdateTableRequest, UpdateTableResponse, + QueryTableRequest, RegisterTableRequest, RegisterTableResponse, RenameTableRequest, + RenameTableResponse, RestoreTableRequest, RestoreTableResponse, TableExistsRequest, + UpdateTableRequest, UpdateTableResponse, UpdateTableSchemaMetadataRequest, + UpdateTableSchemaMetadataResponse, UpdateTableTagRequest, UpdateTableTagResponse, }; use lance_core::{box_error, Error, Result}; @@ -456,9 +466,15 @@ impl LanceNamespace for RestNamespace { async fn describe_table(&self, request: DescribeTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - table_api::describe_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) - .await - .map_err(convert_api_error) + table_api::describe_table( + &self.reqwest_config, + &id, + request.clone(), + Some(&self.delimiter), + request.with_table_uri, + ) + .await + .map_err(convert_api_error) } async fn register_table(&self, request: RegisterTableRequest) -> Result { @@ -480,7 +496,7 @@ impl LanceNamespace for RestNamespace { async fn drop_table(&self, request: DropTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - table_api::drop_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + table_api::drop_table(&self.reqwest_config, 
&id, Some(&self.delimiter)) .await .map_err(convert_api_error) } @@ -511,26 +527,12 @@ impl LanceNamespace for RestNamespace { ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - let properties_json = request - .properties - .as_ref() - .map(|props| serde_json::to_string(props).unwrap_or_else(|_| "{}".to_string())); - - use lance_namespace::models::create_table_request::Mode; - let mode = request.mode.as_ref().map(|m| match m { - Mode::Create => "create", - Mode::ExistOk => "exist_ok", - Mode::Overwrite => "overwrite", - }); - table_api::create_table( &self.reqwest_config, &id, request_data.to_vec(), Some(&self.delimiter), - mode, - request.location.as_deref(), - properties_json.as_deref(), + request.mode.as_deref(), ) .await .map_err(convert_api_error) @@ -554,18 +556,12 @@ impl LanceNamespace for RestNamespace { ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - use lance_namespace::models::insert_into_table_request::Mode; - let mode = request.mode.as_ref().map(|m| match m { - Mode::Append => "append", - Mode::Overwrite => "overwrite", - }); - table_api::insert_into_table( &self.reqwest_config, &id, request_data.to_vec(), Some(&self.delimiter), - mode, + request.mode.as_deref(), ) .await .map_err(convert_api_error) @@ -594,6 +590,8 @@ impl LanceNamespace for RestNamespace { request.when_not_matched_insert_all, request.when_not_matched_by_source_delete, request.when_not_matched_by_source_delete_filt.as_deref(), + request.timeout.as_deref(), + request.use_index, ) .await .map_err(convert_api_error) @@ -710,6 +708,254 @@ impl LanceNamespace for RestNamespace { .map_err(convert_api_error) } + async fn create_table_scalar_index( + &self, + request: CreateTableIndexRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::create_table_scalar_index( + &self.reqwest_config, + &id, + request, + Some(&self.delimiter), + ) + .await + .map_err(convert_api_error) + } + + async fn drop_table_index( + 
&self, + request: DropTableIndexRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + let index_name = request.index_name.as_deref().unwrap_or(""); + + table_api::drop_table_index(&self.reqwest_config, &id, index_name, Some(&self.delimiter)) + .await + .map_err(convert_api_error) + } + + async fn list_all_tables(&self, request: ListTablesRequest) -> Result { + table_api::list_all_tables( + &self.reqwest_config, + Some(&self.delimiter), + request.page_token.as_deref(), + request.limit, + ) + .await + .map_err(convert_api_error) + } + + async fn restore_table(&self, request: RestoreTableRequest) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::restore_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + .await + .map_err(convert_api_error) + } + + async fn rename_table(&self, request: RenameTableRequest) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::rename_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + .await + .map_err(convert_api_error) + } + + async fn list_table_versions( + &self, + request: ListTableVersionsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::list_table_versions( + &self.reqwest_config, + &id, + Some(&self.delimiter), + request.page_token.as_deref(), + request.limit, + ) + .await + .map_err(convert_api_error) + } + + async fn update_table_schema_metadata( + &self, + request: UpdateTableSchemaMetadataRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + let metadata = request.metadata.unwrap_or_default(); + + let result = table_api::update_table_schema_metadata( + &self.reqwest_config, + &id, + metadata, + Some(&self.delimiter), + ) + .await + .map_err(convert_api_error)?; + + Ok(UpdateTableSchemaMetadataResponse { + metadata: Some(result), + ..Default::default() + }) + } + + async fn get_table_stats( + &self, + request: 
GetTableStatsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::get_table_stats(&self.reqwest_config, &id, request, Some(&self.delimiter)) + .await + .map_err(convert_api_error) + } + + async fn explain_table_query_plan( + &self, + request: ExplainTableQueryPlanRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::explain_table_query_plan( + &self.reqwest_config, + &id, + request, + Some(&self.delimiter), + ) + .await + .map_err(convert_api_error) + } + + async fn analyze_table_query_plan( + &self, + request: AnalyzeTableQueryPlanRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::analyze_table_query_plan( + &self.reqwest_config, + &id, + request, + Some(&self.delimiter), + ) + .await + .map_err(convert_api_error) + } + + async fn alter_table_add_columns( + &self, + request: AlterTableAddColumnsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::alter_table_add_columns( + &self.reqwest_config, + &id, + request, + Some(&self.delimiter), + ) + .await + .map_err(convert_api_error) + } + + async fn alter_table_alter_columns( + &self, + request: AlterTableAlterColumnsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::alter_table_alter_columns( + &self.reqwest_config, + &id, + request, + Some(&self.delimiter), + ) + .await + .map_err(convert_api_error) + } + + async fn alter_table_drop_columns( + &self, + request: AlterTableDropColumnsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::alter_table_drop_columns( + &self.reqwest_config, + &id, + request, + Some(&self.delimiter), + ) + .await + .map_err(convert_api_error) + } + + async fn list_table_tags( + &self, + request: ListTableTagsRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + tag_api::list_table_tags( + 
&self.reqwest_config, + &id, + Some(&self.delimiter), + request.page_token.as_deref(), + request.limit, + ) + .await + .map_err(convert_api_error) + } + + async fn get_table_tag_version( + &self, + request: GetTableTagVersionRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + tag_api::get_table_tag_version(&self.reqwest_config, &id, request, Some(&self.delimiter)) + .await + .map_err(convert_api_error) + } + + async fn create_table_tag( + &self, + request: CreateTableTagRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + tag_api::create_table_tag(&self.reqwest_config, &id, request, Some(&self.delimiter)) + .await + .map_err(convert_api_error) + } + + async fn delete_table_tag( + &self, + request: DeleteTableTagRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + tag_api::delete_table_tag(&self.reqwest_config, &id, request, Some(&self.delimiter)) + .await + .map_err(convert_api_error) + } + + async fn update_table_tag( + &self, + request: UpdateTableTagRequest, + ) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + tag_api::update_table_tag(&self.reqwest_config, &id, request, Some(&self.delimiter)) + .await + .map_err(convert_api_error) + } + fn namespace_id(&self) -> String { format!( "RestNamespace {{ endpoint: {:?}, delimiter: {:?} }}", @@ -722,7 +968,6 @@ impl LanceNamespace for RestNamespace { mod tests { use super::*; use bytes::Bytes; - use lance_namespace::models::{create_table_request, insert_into_table_request}; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, ResponseTemplate}; @@ -1023,9 +1268,7 @@ mod tests { "namespace".to_string(), "table".to_string(), ]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; let data = Bytes::from("arrow data here"); @@ -1045,7 +1288,7 @@ mod tests { Mock::given(method("POST")) 
.and(path(path_str.as_str())) .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ - "version": 2 + "transaction_id": "txn-123" }))) .mount(&mock_server) .await; @@ -1062,7 +1305,7 @@ mod tests { "namespace".to_string(), "table".to_string(), ]), - mode: Some(insert_into_table_request::Mode::Append), + mode: Some("Append".to_string()), }; let data = Bytes::from("arrow data here"); @@ -1071,6 +1314,6 @@ mod tests { // Should succeed with mock server assert!(result.is_ok()); let response = result.unwrap(); - assert_eq!(response.version, Some(2)); + assert_eq!(response.transaction_id, Some("txn-123".to_string())); } } diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index 5e06f64570e..284b0d42fa9 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -67,14 +67,65 @@ impl RestAdapter { .route("/v1/namespace/:id/drop", post(drop_namespace)) .route("/v1/namespace/:id/exists", post(namespace_exists)) .route("/v1/namespace/:id/table/list", get(list_tables)) - // Table operations + // Table metadata operations .route("/v1/table/:id/register", post(register_table)) .route("/v1/table/:id/describe", post(describe_table)) .route("/v1/table/:id/exists", post(table_exists)) .route("/v1/table/:id/drop", post(drop_table)) .route("/v1/table/:id/deregister", post(deregister_table)) + .route("/v1/table/:id/rename", post(rename_table)) + .route("/v1/table/:id/restore", post(restore_table)) + .route("/v1/table/:id/version/list", get(list_table_versions)) + .route("/v1/table/:id/stats", get(get_table_stats)) + // Table data operations .route("/v1/table/:id/create", post(create_table)) .route("/v1/table/:id/create-empty", post(create_empty_table)) + .route("/v1/table/:id/insert", post(insert_into_table)) + .route("/v1/table/:id/merge_insert", post(merge_insert_into_table)) + .route("/v1/table/:id/update", post(update_table)) + 
.route("/v1/table/:id/delete", post(delete_from_table)) + .route("/v1/table/:id/query", post(query_table)) + .route("/v1/table/:id/count_rows", get(count_table_rows)) + // Index operations + .route("/v1/table/:id/create_index", post(create_table_index)) + .route( + "/v1/table/:id/create_scalar_index", + post(create_table_scalar_index), + ) + .route("/v1/table/:id/index/list", get(list_table_indices)) + .route( + "/v1/table/:id/index/:index_name/stats", + get(describe_table_index_stats), + ) + .route( + "/v1/table/:id/index/:index_name/drop", + post(drop_table_index), + ) + // Schema operations + .route("/v1/table/:id/add_columns", post(alter_table_add_columns)) + .route( + "/v1/table/:id/alter_columns", + post(alter_table_alter_columns), + ) + .route("/v1/table/:id/drop_columns", post(alter_table_drop_columns)) + .route( + "/v1/table/:id/schema_metadata/update", + post(update_table_schema_metadata), + ) + // Tag operations + .route("/v1/table/:id/tags/list", get(list_table_tags)) + .route("/v1/table/:id/tags/version", post(get_table_tag_version)) + .route("/v1/table/:id/tags/create", post(create_table_tag)) + .route("/v1/table/:id/tags/delete", post(delete_table_tag)) + .route("/v1/table/:id/tags/update", post(update_table_tag)) + // Query plan operations + .route("/v1/table/:id/explain_plan", post(explain_table_query_plan)) + .route("/v1/table/:id/analyze_plan", post(analyze_table_query_plan)) + // Transaction operations + .route("/v1/transaction/:id/describe", post(describe_transaction)) + .route("/v1/transaction/:id/alter", post(alter_transaction)) + // Global table operations + .route("/v1/table", get(list_all_tables)) .layer(TraceLayer::new_for_http()) .with_state(self.backend.clone()) } @@ -398,9 +449,10 @@ async fn drop_table( State(backend): State>, Path(id): Path, Query(params): Query, - Json(mut request): Json, ) -> Response { - request.id = Some(parse_id(&id, params.delimiter.as_deref())); + let request = DropTableRequest { + id: Some(parse_id(&id, 
params.delimiter.as_deref())), + }; match backend.drop_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -430,8 +482,6 @@ async fn deregister_table( struct CreateTableQuery { delimiter: Option, mode: Option, - location: Option, - properties: Option, } async fn create_table( @@ -440,25 +490,9 @@ async fn create_table( Query(params): Query, body: Bytes, ) -> Response { - use lance_namespace::models::create_table_request::Mode; - - let mode = params.mode.as_deref().and_then(|m| match m { - "create" => Some(Mode::Create), - "exist_ok" => Some(Mode::ExistOk), - "overwrite" => Some(Mode::Overwrite), - _ => None, - }); - - let properties = params - .properties - .as_ref() - .and_then(|p| serde_json::from_str(p).ok()); - let request = CreateTableRequest { id: Some(parse_id(&id, params.delimiter.as_deref())), - location: params.location, - mode, - properties, + mode: params.mode.clone(), }; match backend.create_table(request, body).await { @@ -481,6 +515,509 @@ async fn create_empty_table( } } +#[derive(Debug, Deserialize)] +struct InsertQuery { + delimiter: Option, + mode: Option, +} + +async fn insert_into_table( + State(backend): State>, + Path(id): Path, + Query(params): Query, + body: Bytes, +) -> Response { + let request = InsertIntoTableRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + mode: params.mode.clone(), + }; + + match backend.insert_into_table(request, body).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +#[derive(Debug, Deserialize)] +struct MergeInsertQuery { + delimiter: Option, + on: Option, + when_matched_update_all: Option, + when_matched_update_all_filt: Option, + when_not_matched_insert_all: Option, + when_not_matched_by_source_delete: Option, + when_not_matched_by_source_delete_filt: Option, + timeout: Option, + use_index: Option, +} + +async fn merge_insert_into_table( + State(backend): State>, + Path(id): Path, + 
Query(params): Query, + body: Bytes, +) -> Response { + let request = MergeInsertIntoTableRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + on: params.on, + when_matched_update_all: params.when_matched_update_all, + when_matched_update_all_filt: params.when_matched_update_all_filt, + when_not_matched_insert_all: params.when_not_matched_insert_all, + when_not_matched_by_source_delete: params.when_not_matched_by_source_delete, + when_not_matched_by_source_delete_filt: params.when_not_matched_by_source_delete_filt, + timeout: params.timeout, + use_index: params.use_index, + }; + + match backend.merge_insert_into_table(request, body).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn update_table( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.update_table(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn delete_from_table( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.delete_from_table(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn query_table( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.query_table(request).await { + Ok(bytes) => (StatusCode::OK, bytes).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn count_table_rows( + State(backend): State>, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = 
CountTableRowsRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + version: None, + predicate: None, + }; + + match backend.count_table_rows(request).await { + Ok(count) => (StatusCode::OK, Json(serde_json::json!({ "count": count }))).into_response(), + Err(e) => error_to_response(e), + } +} + +// ============================================================================ +// Table Management Operation Handlers +// ============================================================================ + +async fn rename_table( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.rename_table(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn restore_table( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.restore_table(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn list_table_versions( + State(backend): State>, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = ListTableVersionsRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + page_token: params.page_token, + limit: params.limit, + }; + + match backend.list_table_versions(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn get_table_stats( + State(backend): State>, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = GetTableStatsRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + }; + + match backend.get_table_stats(request).await { + Ok(response) => (StatusCode::OK, 
Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn list_all_tables( + State(backend): State>, + Query(params): Query, +) -> Response { + let request = ListTablesRequest { + id: None, + page_token: params.page_token, + limit: params.limit, + }; + + match backend.list_all_tables(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +// ============================================================================ +// Index Operation Handlers +// ============================================================================ + +async fn create_table_index( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.create_table_index(request).await { + Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn create_table_scalar_index( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.create_table_scalar_index(request).await { + Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn list_table_indices( + State(backend): State>, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = ListTableIndicesRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + version: None, + page_token: None, + limit: None, + }; + + match backend.list_table_indices(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +#[derive(Debug, Deserialize)] +struct IndexPathParams { + id: String, + index_name: String, +} + +async fn describe_table_index_stats( + State(backend): State>, 
+ Path(params): Path, + Query(query): Query, +) -> Response { + let request = DescribeTableIndexStatsRequest { + id: Some(parse_id(¶ms.id, query.delimiter.as_deref())), + version: None, + index_name: Some(params.index_name), + }; + + match backend.describe_table_index_stats(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn drop_table_index( + State(backend): State>, + Path(params): Path, + Query(query): Query, +) -> Response { + let request = DropTableIndexRequest { + id: Some(parse_id(¶ms.id, query.delimiter.as_deref())), + index_name: Some(params.index_name), + }; + + match backend.drop_table_index(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +// ============================================================================ +// Schema Operation Handlers +// ============================================================================ + +async fn alter_table_add_columns( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.alter_table_add_columns(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn alter_table_alter_columns( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.alter_table_alter_columns(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn alter_table_drop_columns( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); 
+ + match backend.alter_table_drop_columns(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn update_table_schema_metadata( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.update_table_schema_metadata(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +// ============================================================================ +// Tag Operation Handlers +// ============================================================================ + +async fn list_table_tags( + State(backend): State>, + Path(id): Path, + Query(params): Query, +) -> Response { + let request = ListTableTagsRequest { + id: Some(parse_id(&id, params.delimiter.as_deref())), + page_token: params.page_token, + limit: params.limit, + }; + + match backend.list_table_tags(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn get_table_tag_version( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.get_table_tag_version(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn create_table_tag( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.create_table_tag(request).await { + Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn delete_table_tag( + State(backend): State>, + 
Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.delete_table_tag(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn update_table_tag( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.update_table_tag(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +// ============================================================================ +// Query Plan Operation Handlers +// ============================================================================ + +async fn explain_table_query_plan( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.explain_table_query_plan(request).await { + Ok(plan) => (StatusCode::OK, plan).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn analyze_table_query_plan( + State(backend): State>, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + + match backend.analyze_table_query_plan(request).await { + Ok(plan) => (StatusCode::OK, plan).into_response(), + Err(e) => error_to_response(e), + } +} + +// ============================================================================ +// Transaction Operation Handlers +// ============================================================================ + +async fn describe_transaction( + State(backend): State>, + Path(id): Path, + Query(_params): Query, + Json(mut request): Json, +) -> Response { + // The path id is the transaction 
identifier + // The request.id in body is the table ID (namespace path) + // For the trait, we set request.id to include both table ID and transaction ID + // by appending the transaction ID to the table ID path + if let Some(ref mut table_id) = request.id { + table_id.push(id); + } else { + request.id = Some(vec![id]); + } + + match backend.describe_transaction(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + +async fn alter_transaction( + State(backend): State>, + Path(id): Path, + Query(_params): Query, + Json(mut request): Json, +) -> Response { + // The path id is the transaction identifier + // Append it to the table ID path in the request + if let Some(ref mut table_id) = request.id { + table_id.push(id); + } else { + request.id = Some(vec![id]); + } + + match backend.alter_transaction(request).await { + Ok(response) => (StatusCode::OK, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + // ============================================================================ // Helper Functions // ============================================================================ @@ -791,9 +1328,7 @@ mod tests { // Create table in child namespace let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; let result = fixture @@ -845,9 +1380,7 @@ mod tests { for i in 1..=3 { let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), format!("table{}", i)]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; fixture .namespace @@ -891,9 +1424,7 @@ mod tests { // Create table let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - 
location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; fixture .namespace @@ -963,9 +1494,7 @@ mod tests { // Create table let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; fixture .namespace @@ -1053,9 +1582,7 @@ mod tests { // Create table let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; fixture .namespace @@ -1361,9 +1888,7 @@ mod tests { "level3".to_string(), "deep_table".to_string(), ]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; let result = fixture @@ -1422,9 +1947,7 @@ mod tests { // Create table with same name in both namespaces let create_table_req = CreateTableRequest { id: Some(vec!["namespace1".to_string(), "shared_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; fixture .namespace @@ -1434,9 +1957,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["namespace2".to_string(), "shared_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; fixture .namespace @@ -1486,9 +2007,7 @@ mod tests { // Create table in namespace let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; fixture .namespace @@ -1640,9 +2159,7 @@ mod tests { 
"test_namespace".to_string(), "physical_table".to_string(), ]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; fixture .namespace @@ -1669,7 +2186,10 @@ mod tests { ); let response = result.unwrap(); - assert_eq!(response.location, "test_namespace$physical_table.lance"); + assert_eq!( + response.location, + Some("test_namespace$physical_table.lance".to_string()) + ); // Verify registered table exists let mut exists_req = TableExistsRequest::new(); @@ -1769,9 +2289,7 @@ mod tests { // Create a table let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; fixture .namespace @@ -1849,9 +2367,7 @@ mod tests { "test_namespace".to_string(), "original_table".to_string(), ]), - location: None, - mode: Some(create_table_request::Mode::Create), - properties: None, + mode: Some("Create".to_string()), }; let create_response = fixture .namespace @@ -1903,7 +2419,7 @@ mod tests { .expect("Failed to re-register table with new name"); // Should return the exact location we registered - assert_eq!(register_response.location, relative_location); + assert_eq!(register_response.location, Some(relative_location.clone())); // Verify new table exists let mut exists_req = TableExistsRequest::new(); diff --git a/rust/lance-namespace/src/namespace.rs b/rust/lance-namespace/src/namespace.rs index ac2d0c8e176..60c206530f4 100644 --- a/rust/lance-namespace/src/namespace.rs +++ b/rust/lance-namespace/src/namespace.rs @@ -9,19 +9,29 @@ use lance_core::{Error, Result}; use snafu::Location; use lance_namespace_reqwest_client::models::{ - AlterTransactionRequest, AlterTransactionResponse, CountTableRowsRequest, - CreateEmptyTableRequest, CreateEmptyTableResponse, CreateNamespaceRequest, - CreateNamespaceResponse, 
CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, - CreateTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, DeregisterTableRequest, - DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, - DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, DescribeTableRequest, - DescribeTableResponse, DescribeTransactionRequest, DescribeTransactionResponse, - DropNamespaceRequest, DropNamespaceResponse, DropTableRequest, DropTableResponse, - InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, - ListTableIndicesRequest, ListTableIndicesResponse, ListTablesRequest, ListTablesResponse, + AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, + AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, + AlterTransactionRequest, AlterTransactionResponse, AnalyzeTableQueryPlanRequest, + CountTableRowsRequest, CreateEmptyTableRequest, CreateEmptyTableResponse, + CreateNamespaceRequest, CreateNamespaceResponse, CreateTableIndexRequest, + CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, + CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, + DeleteFromTableRequest, DeleteFromTableResponse, DeleteTableTagRequest, DeleteTableTagResponse, + DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, + DescribeNamespaceResponse, DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, + DescribeTableRequest, DescribeTableResponse, DescribeTransactionRequest, + DescribeTransactionResponse, DropNamespaceRequest, DropNamespaceResponse, + DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, DropTableResponse, + ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, + GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, + InsertIntoTableResponse, ListNamespacesRequest, 
ListNamespacesResponse, + ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, + ListTableVersionsRequest, ListTableVersionsResponse, ListTablesRequest, ListTablesResponse, MergeInsertIntoTableRequest, MergeInsertIntoTableResponse, NamespaceExistsRequest, - QueryTableRequest, RegisterTableRequest, RegisterTableResponse, TableExistsRequest, - UpdateTableRequest, UpdateTableResponse, + QueryTableRequest, RegisterTableRequest, RegisterTableResponse, RenameTableRequest, + RenameTableResponse, RestoreTableRequest, RestoreTableResponse, TableExistsRequest, + UpdateTableRequest, UpdateTableResponse, UpdateTableSchemaMetadataRequest, + UpdateTableSchemaMetadataResponse, UpdateTableTagRequest, UpdateTableTagResponse, }; /// Base trait for Lance Namespace implementations. @@ -277,6 +287,195 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { }) } + /// Create a scalar index on a table. + async fn create_table_scalar_index( + &self, + _request: CreateTableIndexRequest, + ) -> Result { + Err(Error::NotSupported { + source: "create_table_scalar_index not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Drop a table index. + async fn drop_table_index( + &self, + _request: DropTableIndexRequest, + ) -> Result { + Err(Error::NotSupported { + source: "drop_table_index not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// List all tables across all namespaces. + async fn list_all_tables(&self, _request: ListTablesRequest) -> Result { + Err(Error::NotSupported { + source: "list_all_tables not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Restore a table to a specific version. 
+ async fn restore_table(&self, _request: RestoreTableRequest) -> Result { + Err(Error::NotSupported { + source: "restore_table not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Rename a table. + async fn rename_table(&self, _request: RenameTableRequest) -> Result { + Err(Error::NotSupported { + source: "rename_table not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// List all versions of a table. + async fn list_table_versions( + &self, + _request: ListTableVersionsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "list_table_versions not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Update table schema metadata. + async fn update_table_schema_metadata( + &self, + _request: UpdateTableSchemaMetadataRequest, + ) -> Result { + Err(Error::NotSupported { + source: "update_table_schema_metadata not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Get table statistics. + async fn get_table_stats( + &self, + _request: GetTableStatsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "get_table_stats not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Explain a table query plan. + async fn explain_table_query_plan( + &self, + _request: ExplainTableQueryPlanRequest, + ) -> Result { + Err(Error::NotSupported { + source: "explain_table_query_plan not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Analyze a table query plan. + async fn analyze_table_query_plan( + &self, + _request: AnalyzeTableQueryPlanRequest, + ) -> Result { + Err(Error::NotSupported { + source: "analyze_table_query_plan not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Add columns to a table. 
+ async fn alter_table_add_columns( + &self, + _request: AlterTableAddColumnsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "alter_table_add_columns not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Alter columns in a table. + async fn alter_table_alter_columns( + &self, + _request: AlterTableAlterColumnsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "alter_table_alter_columns not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Drop columns from a table. + async fn alter_table_drop_columns( + &self, + _request: AlterTableDropColumnsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "alter_table_drop_columns not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// List all tags for a table. + async fn list_table_tags( + &self, + _request: ListTableTagsRequest, + ) -> Result { + Err(Error::NotSupported { + source: "list_table_tags not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Get the version for a specific tag. + async fn get_table_tag_version( + &self, + _request: GetTableTagVersionRequest, + ) -> Result { + Err(Error::NotSupported { + source: "get_table_tag_version not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Create a tag for a table. + async fn create_table_tag( + &self, + _request: CreateTableTagRequest, + ) -> Result { + Err(Error::NotSupported { + source: "create_table_tag not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Delete a tag from a table. + async fn delete_table_tag( + &self, + _request: DeleteTableTagRequest, + ) -> Result { + Err(Error::NotSupported { + source: "delete_table_tag not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + + /// Update a tag for a table. 
+ async fn update_table_tag( + &self, + _request: UpdateTableTagRequest, + ) -> Result { + Err(Error::NotSupported { + source: "update_table_tag not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + /// Return a human-readable unique identifier for this namespace instance. /// /// This is used for equality comparison and hashing when the namespace is diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 722ba7c97e1..1079a72d600 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -876,6 +876,7 @@ impl Dataset { let request = DescribeTableRequest { id: Some(table_id.clone()), version: None, + with_table_uri: None, }; let response = namespace diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 16326630d23..332ba504cf9 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -137,6 +137,7 @@ impl DatasetBuilder { let request = DescribeTableRequest { id: Some(table_id.clone()), version: None, + with_table_uri: None, }; let response = namespace From efd9f6b45353505b1a265c6d457e11b1e5dac3d6 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 20 Jan 2026 21:07:26 -0800 Subject: [PATCH 5/8] feat: support credentials vending in directory namespace (#5566) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR introduces the credentials vending feature to the namespace impl, allowing us to vend credentials if we run directory namespace, or run it as backend for rest namespace. This would allow us to fully test the credentials vending code path end to end. The actual vending logic mainly consults the same feature implemented in Apache Polaris. The support covers aws, gcp and azure. 
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- Cargo.lock | 685 +++++++++-- Cargo.toml | 1 + java/lance-jni/Cargo.lock | 1058 +++++++++++++++-- java/lance-jni/Cargo.toml | 9 +- .../lance/namespace/DirectoryNamespace.java | 52 + python/Cargo.lock | 825 +++++++++++-- python/Cargo.toml | 6 +- python/python/lance/namespace.py | 43 + rust/lance-namespace-impls/Cargo.toml | 20 + rust/lance-namespace-impls/src/credentials.rs | 717 +++++++++++ .../src/credentials/aws.rs | 881 ++++++++++++++ .../src/credentials/azure.rs | 335 ++++++ .../src/credentials/gcp.rs | 637 ++++++++++ rust/lance-namespace-impls/src/dir.rs | 148 ++- rust/lance-namespace-impls/src/lib.rs | 59 + 15 files changed, 5159 insertions(+), 317 deletions(-) create mode 100644 rust/lance-namespace-impls/src/credentials.rs create mode 100644 rust/lance-namespace-impls/src/credentials/aws.rs create mode 100644 rust/lance-namespace-impls/src/credentials/azure.rs create mode 100644 rust/lance-namespace-impls/src/credentials/gcp.rs diff --git a/Cargo.lock b/Cargo.lock index e834924da59..88e4bfefa8c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "RustyXML" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" + [[package]] name = "addr2line" version = "0.25.1" @@ -430,6 +436,17 @@ dependencies = [ "serde_json", ] +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener 2.5.3", + "futures-core", +] + [[package]] name = "async-channel" version = "2.5.0" @@ -459,17 +476,53 @@ dependencies = [ "zstd-safe", ] +[[package]] +name = "async-io" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc" +dependencies = [ + "autocfg", + "cfg-if", + "concurrent-queue", + "futures-io", + "futures-lite 2.6.1", + "parking", + "polling", + "rustix 1.1.3", + "slab", + "windows-sys 0.61.2", +] + [[package]] name = "async-lock" version = "3.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311" dependencies = [ - "event-listener", + "event-listener 5.4.1", "event-listener-strategy", "pin-project-lite", ] +[[package]] +name = "async-process" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc50921ec0055cdd8a16de48773bfeec5c972598674347252c0399676be7da75" +dependencies = [ + "async-channel 2.5.0", + "async-io", + "async-lock", + "async-signal", + "async-task", + "blocking", + "cfg-if", + "event-listener 5.4.1", + "futures-lite 2.6.1", + "rustix 1.1.3", +] + [[package]] name = "async-recursion" version = "1.1.1" @@ -481,6 +534,30 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "async-signal" +version = "0.2.13" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "43c070bbf59cd3570b6b2dd54cd772527c7c3620fce8be898406dd3ed6adc64c" +dependencies = [ + "async-io", + "async-lock", + "atomic-waker", + "cfg-if", + "futures-core", + "futures-io", + "rustix 1.1.3", + "signal-hook-registry", + "slab", + "windows-sys 0.61.2", +] + +[[package]] +name = "async-task" +version = "4.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" + [[package]] name = "async-trait" version = "0.1.89" @@ -541,7 +618,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "hex", "http 1.4.0", "ring", @@ -566,9 +643,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.15.2" +version = "1.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88aab2464f1f25453baa7a07c84c5b7684e274054ba06817f382357f77a288" +checksum = "e84ce723ab67259cfeb9877c6a639ee9eb7a27b28123abd71db7f0d5d0cc9d86" dependencies = [ "aws-lc-sys", "zeroize", @@ -576,9 +653,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.35.0" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45afffdee1e7c9126814751f88dddc747f41d91da16c9551a0f1e8a11e788a1" +checksum = "43a442ece363113bd4bd4c8b18977a7798dd4d3c3383f34fb61936960e8f4ad8" dependencies = [ "cc", "cmake", @@ -588,9 +665,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.17" +version = "1.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d81b5b2898f6798ad58f484856768bca817e3cd9de0974c24ae0f1113fe88f1b" +checksum = "959dab27ce613e6c9658eb3621064d0e2027e5f2acb65bc526a43577facea557" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -602,7 +679,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http-body 0.4.6", 
"percent-encoding", @@ -613,21 +690,22 @@ dependencies = [ [[package]] name = "aws-sdk-dynamodb" -version = "1.101.0" +version = "1.102.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6f98cd9e5f2fc790aff1f393bc3c8680deea31c05d3c6f23b625cdc50b1b6b4" +checksum = "f5f7e6a53cf5ee8b7041c73106d9a93480b47f8b955466262b043aab0b5bf489" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -635,9 +713,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3" -version = "1.119.0" +version = "1.120.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d65fddc3844f902dfe1864acb8494db5f9342015ee3ab7890270d36fbd2e01c" +checksum = "06673901e961f20fa8d7da907da48f7ad6c1b383e3726c22bd418900f015abe1" dependencies = [ "aws-credential-types", "aws-runtime", @@ -647,19 +725,20 @@ dependencies = [ "aws-smithy-eventstream", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-smithy-xml", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "hex", "hmac", "http 0.2.12", "http 1.4.0", "http-body 0.4.6", - "lru", + "lru 0.16.3", "percent-encoding", "regex-lite", "sha2", @@ -669,21 +748,22 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.91.0" +version = "1.92.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee6402a36f27b52fe67661c6732d684b2635152b676aa2babbfb5204f99115d" +checksum = "b7d63bd2bdeeb49aa3f9b00c15e18583503b778b2e792fc06284d54e7d5b6566" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", 
"aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -691,21 +771,22 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.93.0" +version = "1.94.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a45a7f750bbd170ee3677671ad782d90b894548f4e4ae168302c57ec9de5cb3e" +checksum = "532d93574bf731f311bafb761366f9ece345a0416dbcc273d81d6d1a1205239b" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -713,22 +794,23 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.95.0" +version = "1.96.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55542378e419558e6b1f398ca70adb0b2088077e79ad9f14eb09441f2f7b2164" +checksum = "357e9a029c7524db6a0099cd77fbd5da165540339e7296cca603531bc783b56c" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-smithy-xml", "aws-types", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -775,9 +857,9 @@ dependencies = [ [[package]] name = "aws-smithy-checksums" -version = "0.63.12" +version = "0.63.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87294a084b43d649d967efe58aa1f9e0adc260e13a6938eb904c0ae9b45824ae" +checksum = "23374b9170cbbcc6f5df8dc5ebb9b6c5c28a3c8f599f0e8b8b10eb6f4a5c6e74" dependencies = [ "aws-smithy-http", "aws-smithy-types", @@ -867,9 +949,9 @@ dependencies = [ [[package]] name = "aws-smithy-observability" -version = "0.1.5" +version = "0.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f616c3f2260612fe44cede278bafa18e73e6479c4e393e2c4518cf2a9a228a" +checksum = "ef1fcbefc7ece1d70dcce29e490f269695dfca2d2bacdeaf9e5c3f799e4e6a42" dependencies = [ "aws-smithy-runtime-api", ] @@ -886,9 +968,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.9.5" +version = "1.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a392db6c583ea4a912538afb86b7be7c5d8887d91604f50eb55c262ee1b4a5f5" +checksum = "bb5b6167fcdf47399024e81ac08e795180c576a20e4d4ce67949f9a88ae37dc1" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -897,7 +979,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http 1.4.0", "http-body 0.4.6", @@ -910,9 +992,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.9.3" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab0d43d899f9e508300e587bf582ba54c27a452dd0a9ea294690669138ae14a2" +checksum = "efce7aaaf59ad53c5412f14fc19b2d5c6ab2c3ec688d272fd31f76ec12f44fb0" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -927,9 +1009,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.3.5" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "905cb13a9895626d49cf2ced759b062d913834c7482c38e49557eac4e6193f01" +checksum = "65f172bcb02424eb94425db8aed1b6d583b5104d4d5ddddf22402c661a320048" dependencies = [ "base64-simd", "bytes", @@ -1029,13 +1111,120 @@ dependencies = [ "tracing", ] +[[package]] +name = "azure_core" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b552ad43a45a746461ec3d3a51dfb6466b4759209414b439c165eb6a6b7729e" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "dyn-clone", + "futures", + "getrandom 0.2.17", + "hmac", + "http-types", 
+ "once_cell", + "paste", + "pin-project", + "quick-xml 0.31.0", + "rand 0.8.5", + "reqwest", + "rustc_version", + "serde", + "serde_json", + "sha2", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_identity" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ddd80344317c40c04b603807b63a5cefa532f1b43522e72f480a988141f744" +dependencies = [ + "async-lock", + "async-process", + "async-trait", + "azure_core", + "futures", + "oauth2", + "pin-project", + "serde", + "time", + "tracing", + "tz-rs", + "url", + "uuid", +] + +[[package]] +name = "azure_storage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f838159f4d29cb400a14d9d757578ba495ae64feb07a7516bf9e4415127126" +dependencies = [ + "RustyXML", + "async-lock", + "async-trait", + "azure_core", + "bytes", + "serde", + "serde_derive", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_storage_blobs" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97e83c3636ae86d9a6a7962b2112e3b19eb3903915c50ce06ff54ff0a2e6a7e4" +dependencies = [ + "RustyXML", + "azure_core", + "azure_storage", + "azure_svc_blobstorage", + "bytes", + "futures", + "serde", + "serde_derive", + "serde_json", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_svc_blobstorage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e6c6f20c5611b885ba94c7bae5e02849a267381aecb8aee577e8c35ff4064c6" +dependencies = [ + "azure_core", + "bytes", + "futures", + "log", + "once_cell", + "serde", + "serde_json", + "time", +] + [[package]] name = "backon" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" dependencies = [ - "fastrand", + "fastrand 2.3.0", "gloo-timers", "tokio", ] @@ -1217,6 
+1406,19 @@ dependencies = [ "generic-array", ] +[[package]] +name = "blocking" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83f8d02be6967315521be875afa792a316e28d57b5a2d401897e2a7921b7f21" +dependencies = [ + "async-channel 2.5.0", + "async-task", + "futures-io", + "futures-lite 2.6.1", + "piper", +] + [[package]] name = "bon" version = "3.8.2" @@ -1342,9 +1544,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.52" +version = "1.2.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd4932aefd12402b36c60956a4fe0035421f544799057659ff86f923657aada3" +checksum = "755d2fce177175ffca841e9a06afdb2c4ab0f593d53b4dee48147dfaade85932" dependencies = [ "find-msvc-tools", "jobserver", @@ -1381,9 +1583,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.42" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" dependencies = [ "iana-time-zone", "js-sys", @@ -1554,6 +1756,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "const_fn" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f8a2ca5ac02d09563609681103aada9e1777d54fc57a5acd7a41404f9c93b6e" + [[package]] name = "constant_time_eq" version = "0.4.2" @@ -1615,9 +1823,9 @@ dependencies = [ [[package]] name = "crc" -version = "3.4.0" +version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eb8a2a1cd12ab0d987a5d5e825195d372001a4094a0376319d5a0ad71c1ba0d" +checksum = "9710d3b3739c2e349eb44fe848ad0b7c8cb1e42bd87ee49371df2f7acaf3e675" dependencies = [ "crc-catalog", ] @@ -1630,15 +1838,14 @@ checksum = "19d374276b40fb8bbdee95aef7c7fa6b5316ec764510eb64b8dd0e2ed0d7e7f5" [[package]] 
name = "crc-fast" -version = "1.6.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ddc2d09feefeee8bd78101665bd8645637828fa9317f9f292496dbbd8c65ff3" +checksum = "2fd92aca2c6001b1bf5ba0ff84ee74ec8501b52bbef0cac80bf25a6c1d87a83d" dependencies = [ "crc", "digest", - "rand 0.9.2", - "regex", "rustversion", + "spin 0.10.0", ] [[package]] @@ -3040,6 +3247,12 @@ version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "event-listener" version = "5.4.1" @@ -3057,7 +3270,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" dependencies = [ - "event-listener", + "event-listener 5.4.1", "pin-project-lite", ] @@ -3073,6 +3286,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -3091,21 +3313,20 @@ dependencies = [ [[package]] name = "filetime" -version = "0.2.26" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" dependencies = [ "cfg-if", "libc", "libredox", - "windows-sys 0.60.2", ] [[package]] name = 
"find-msvc-tools" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" +checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db" [[package]] name = "findshlibs" @@ -3291,6 +3512,34 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name = "futures-lite" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" +dependencies = [ + "fastrand 2.3.0", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + [[package]] name = "futures-macro" version = "0.3.31" @@ -3485,6 +3734,17 @@ dependencies = [ "libm", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -3494,7 +3754,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -3536,6 +3796,26 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "google-cloud-auth" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5572275b7f06b6fde8eec61a23d87c83aae362bee586bbeb8773b3f98658ae81" +dependencies = [ 
+ "async-trait", + "base64 0.22.1", + "derive_builder 0.20.2", + "http 1.4.0", + "reqwest", + "rustls 0.23.36", + "rustls-pemfile", + "serde", + "serde_json", + "thiserror 2.0.18", + "time", + "tokio", +] + [[package]] name = "group" version = "0.12.1" @@ -3684,7 +3964,7 @@ dependencies = [ "reqwest", "serde", "serde_json", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "ureq", "windows-sys 0.60.2", @@ -3769,6 +4049,26 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel 1.9.0", + "base64 0.13.1", + "futures-lite 1.13.0", + "infer", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.10.1" @@ -4153,6 +4453,12 @@ dependencies = [ "web-time", ] +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "inferno" version = "0.11.21" @@ -4181,6 +4487,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -4347,9 +4662,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" dependencies = [ "once_cell", "wasm-bindgen", @@ -4741,7 +5056,7 @@ dependencies 
= [ "arrow-ord", "arrow-schema", "arrow-select", - "async-channel", + "async-channel 2.5.0", "async-recursion", "async-trait", "bitpacking", @@ -4892,9 +5207,18 @@ dependencies = [ "arrow-ipc", "arrow-schema", "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-sts", "axum", + "azure_core", + "azure_identity", + "azure_storage", + "azure_storage_blobs", "bytes", + "chrono", "futures", + "google-cloud-auth", "lance", "lance-core", "lance-index", @@ -4909,6 +5233,7 @@ dependencies = [ "serde_json", "snafu", "tempfile", + "time", "tokio", "tower", "tower-http 0.5.2", @@ -4918,9 +5243,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b748e89a3a0e5d9fb1b51e4382f783f8aa6b620d755012d4856180968014e619" +checksum = "00a21b43fe2a373896727b97927adedd2683d2907683f294f62cf8815fbf6a01" dependencies = [ "reqwest", "serde", @@ -5221,7 +5546,7 @@ dependencies = [ "reqwest", "serde", "tar", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "yada", ] @@ -5344,6 +5669,15 @@ dependencies = [ "hashbrown 0.15.5", ] +[[package]] +name = "lru" +version = "0.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +dependencies = [ + "hashbrown 0.16.1", +] + [[package]] name = "lru-slab" version = "0.1.2" @@ -5512,7 +5846,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "windows-sys 0.61.2", ] @@ -5559,7 +5893,7 @@ dependencies = [ "crossbeam-epoch", "crossbeam-utils", "equivalent", - "event-listener", + "event-listener 5.4.1", "futures-util", "parking_lot", "portable-atomic", @@ -5811,12 +6145,40 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "num_threads" 
+version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + [[package]] name = "number_prefix" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" +[[package]] +name = "oauth2" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c38841cdd844847e3e7c8d29cef9dcfed8877f8f56f9071f77843ecf3baf937f" +dependencies = [ + "base64 0.13.1", + "chrono", + "getrandom 0.2.17", + "http 0.2.12", + "rand 0.8.5", + "serde", + "serde_json", + "serde_path_to_error", + "sha2", + "thiserror 1.0.69", + "url", +] + [[package]] name = "object" version = "0.32.2" @@ -5837,9 +6199,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" dependencies = [ "async-trait", "base64 0.22.1", @@ -5864,7 +6226,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "url", @@ -6294,7 +6656,7 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" dependencies = [ - "fastrand", + "fastrand 2.3.0", "phf_shared 0.13.1", ] @@ -6348,6 +6710,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piper" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" +dependencies = [ + "atomic-waker", + "fastrand 2.3.0", + "futures-io", +] + [[package]] name = "pkcs1" version = "0.7.5" @@ -6430,6 +6803,20 @@ dependencies = [ "plotters-backend", ] +[[package]] +name = "polling" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" +dependencies = [ + "cfg-if", + "concurrent-queue", + "hermit-abi", + "pin-project-lite", + "rustix 1.1.3", + "windows-sys 0.61.2", +] + [[package]] name = "portable-atomic" version = "1.13.0" @@ -6685,6 +7072,16 @@ dependencies = [ "memchr", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quick-xml" version = "0.37.5" @@ -6719,7 +7116,7 @@ dependencies = [ "rustc-hash", "rustls 0.23.36", "socket2 0.6.1", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "web-time", @@ -6740,7 +7137,7 @@ dependencies = [ "rustls 0.23.36", "rustls-pki-types", "slab", - "thiserror 2.0.17", + "thiserror 2.0.18", "tinyvec", "tracing", "web-time", @@ -6781,6 +7178,19 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -6802,6 +7212,16 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -6822,6 +7242,15 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -6860,6 +7289,15 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xorshift" version = "0.4.0" @@ -6991,7 +7429,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.17", "libredox", - "thiserror 2.0.17", + "thiserror 2.0.18", ] [[package]] @@ -7266,9 +7704,9 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.26" +version = "0.1.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56f7d92ca342cea22a06f2121d944b4fd82af56988c270852495420f961d4ace" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" [[package]] name = "rustc-hash" @@ -7334,7 +7772,7 @@ dependencies = [ "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.8", + "rustls-webpki 0.103.9", "subtle", "zeroize", ] @@ -7362,9 +7800,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" +checksum = 
"be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ "web-time", "zeroize", @@ -7382,9 +7820,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.8" +version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ "aws-lc-rs", "ring", @@ -7631,6 +8069,17 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror 1.0.69", +] + [[package]] name = "serde_repr" version = "0.1.20" @@ -7775,7 +8224,7 @@ checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" dependencies = [ "num-bigint", "num-traits", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", ] @@ -8184,7 +8633,7 @@ dependencies = [ "itertools 0.14.0", "levenshtein_automata", "log", - "lru", + "lru 0.12.5", "lz4_flex", "measure_time", "memmap2", @@ -8206,7 +8655,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "uuid", "winapi", @@ -8329,7 +8778,7 @@ version = "3.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" dependencies = [ - "fastrand", + "fastrand 2.3.0", "getrandom 0.3.4", "once_cell", "rustix 1.1.3", @@ -8375,11 +8824,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = 
"4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.17", + "thiserror-impl 2.0.18", ] [[package]] @@ -8395,9 +8844,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -8441,7 +8890,10 @@ checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd" dependencies = [ "deranged", "itoa", + "js-sys", + "libc", "num-conv", + "num_threads", "powerfmt", "serde_core", "time-core", @@ -8851,7 +9303,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.114", - "thiserror 2.0.17", + "thiserror 2.0.18", "unicode-ident", ] @@ -8872,6 +9324,15 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "tz-rs" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4" +dependencies = [ + "const_fn", +] + [[package]] name = "unarray" version = "0.1.4" @@ -8980,6 +9441,7 @@ dependencies = [ "idna", "percent-encoding", "serde", + "serde_derive", ] [[package]] @@ -9057,6 +9519,12 @@ dependencies = [ "libc", ] +[[package]] +name = "waker-fn" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" + [[package]] name = "walkdir" version = "2.5.0" @@ -9076,6 +9544,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -9084,18 +9558,18 @@ 
checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" dependencies = [ "cfg-if", "once_cell", @@ -9106,11 +9580,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.56" +version = "0.4.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" dependencies = [ "cfg-if", + "futures-util", "js-sys", "once_cell", "wasm-bindgen", @@ -9119,9 +9594,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -9129,9 +9604,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" dependencies = [ "bumpalo", "proc-macro2", @@ -9142,9 
+9617,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" dependencies = [ "unicode-ident", ] @@ -9164,9 +9639,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" dependencies = [ "js-sys", "wasm-bindgen", @@ -9566,9 +10041,9 @@ dependencies = [ [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" [[package]] name = "wkb" @@ -9764,9 +10239,9 @@ checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" [[package]] name = "zmij" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd8f3f50b848df28f887acb68e41201b5aea6bc8a8dacc00fb40635ff9a72fea" +checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65" [[package]] name = "zstd" diff --git a/Cargo.toml b/Cargo.toml index 6cc4aa503a0..6ad07641cd5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -87,6 +87,7 @@ aws-config = "1.2.0" aws-credential-types = "1.2.0" aws-sdk-dynamodb = "1.38.0" aws-sdk-s3 = "1.38.0" +aws-sdk-sts = "1.38.0" half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 935b414e845..15bf118d4f5 100644 --- a/java/lance-jni/Cargo.lock 
+++ b/java/lance-jni/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "RustyXML" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" + [[package]] name = "adler2" version = "2.0.1" @@ -384,6 +390,17 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener 2.5.3", + "futures-core", +] + [[package]] name = "async-channel" version = "2.5.0" @@ -413,17 +430,53 @@ dependencies = [ "zstd-safe", ] +[[package]] +name = "async-io" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc" +dependencies = [ + "autocfg", + "cfg-if", + "concurrent-queue", + "futures-io", + "futures-lite 2.6.1", + "parking", + "polling", + "rustix 1.1.3", + "slab", + "windows-sys 0.61.2", +] + [[package]] name = "async-lock" version = "3.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311" dependencies = [ - "event-listener", + "event-listener 5.4.1", "event-listener-strategy", "pin-project-lite", ] +[[package]] +name = "async-process" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc50921ec0055cdd8a16de48773bfeec5c972598674347252c0399676be7da75" +dependencies = [ + "async-channel 2.5.0", + "async-io", + "async-lock", + "async-signal", + "async-task", + "blocking", + "cfg-if", + "event-listener 5.4.1", + "futures-lite 2.6.1", + "rustix 1.1.3", +] + [[package]] name = "async-recursion" version = "1.1.1" @@ -435,6 +488,30 @@ dependencies = [ "syn 
2.0.114", ] +[[package]] +name = "async-signal" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43c070bbf59cd3570b6b2dd54cd772527c7c3620fce8be898406dd3ed6adc64c" +dependencies = [ + "async-io", + "async-lock", + "atomic-waker", + "cfg-if", + "futures-core", + "futures-io", + "rustix 1.1.3", + "signal-hook-registry", + "slab", + "windows-sys 0.61.2", +] + +[[package]] +name = "async-task" +version = "4.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" + [[package]] name = "async-trait" version = "0.1.89" @@ -495,7 +572,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "hex", "http 1.4.0", "ring", @@ -520,9 +597,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.15.2" +version = "1.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88aab2464f1f25453baa7a07c84c5b7684e274054ba06817f382357f77a288" +checksum = "e84ce723ab67259cfeb9877c6a639ee9eb7a27b28123abd71db7f0d5d0cc9d86" dependencies = [ "aws-lc-sys", "zeroize", @@ -530,9 +607,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.35.0" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45afffdee1e7c9126814751f88dddc747f41d91da16c9551a0f1e8a11e788a1" +checksum = "43a442ece363113bd4bd4c8b18977a7798dd4d3c3383f34fb61936960e8f4ad8" dependencies = [ "cc", "cmake", @@ -542,9 +619,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.17" +version = "1.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d81b5b2898f6798ad58f484856768bca817e3cd9de0974c24ae0f1113fe88f1b" +checksum = "959dab27ce613e6c9658eb3621064d0e2027e5f2acb65bc526a43577facea557" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -555,7 +632,7 @@ dependencies = [ "aws-smithy-types", "aws-types", 
"bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http-body 0.4.6", "percent-encoding", @@ -566,21 +643,22 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.91.0" +version = "1.92.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee6402a36f27b52fe67661c6732d684b2635152b676aa2babbfb5204f99115d" +checksum = "b7d63bd2bdeeb49aa3f9b00c15e18583503b778b2e792fc06284d54e7d5b6566" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -588,21 +666,22 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.93.0" +version = "1.94.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a45a7f750bbd170ee3677671ad782d90b894548f4e4ae168302c57ec9de5cb3e" +checksum = "532d93574bf731f311bafb761366f9ece345a0416dbcc273d81d6d1a1205239b" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -610,22 +689,23 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.95.0" +version = "1.96.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55542378e419558e6b1f398ca70adb0b2088077e79ad9f14eb09441f2f7b2164" +checksum = "357e9a029c7524db6a0099cd77fbd5da165540339e7296cca603531bc783b56c" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-smithy-xml", "aws-types", - "fastrand", 
+ "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -694,17 +774,23 @@ dependencies = [ "aws-smithy-async", "aws-smithy-runtime-api", "aws-smithy-types", - "h2", + "h2 0.3.27", + "h2 0.4.13", + "http 0.2.12", "http 1.4.0", - "hyper", - "hyper-rustls", + "http-body 0.4.6", + "hyper 0.14.32", + "hyper 1.8.1", + "hyper-rustls 0.24.2", + "hyper-rustls 0.27.7", "hyper-util", "pin-project-lite", - "rustls", + "rustls 0.21.12", + "rustls 0.23.36", "rustls-native-certs", "rustls-pki-types", "tokio", - "tokio-rustls", + "tokio-rustls 0.26.4", "tower", "tracing", ] @@ -720,9 +806,9 @@ dependencies = [ [[package]] name = "aws-smithy-observability" -version = "0.1.5" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f616c3f2260612fe44cede278bafa18e73e6479c4e393e2c4518cf2a9a228a" +checksum = "ef1fcbefc7ece1d70dcce29e490f269695dfca2d2bacdeaf9e5c3f799e4e6a42" dependencies = [ "aws-smithy-runtime-api", ] @@ -739,9 +825,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.9.5" +version = "1.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a392db6c583ea4a912538afb86b7be7c5d8887d91604f50eb55c262ee1b4a5f5" +checksum = "bb5b6167fcdf47399024e81ac08e795180c576a20e4d4ce67949f9a88ae37dc1" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -750,7 +836,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http 1.4.0", "http-body 0.4.6", @@ -763,9 +849,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.9.3" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab0d43d899f9e508300e587bf582ba54c27a452dd0a9ea294690669138ae14a2" +checksum = "efce7aaaf59ad53c5412f14fc19b2d5c6ab2c3ec688d272fd31f76ec12f44fb0" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -780,13 +866,14 @@ dependencies = [ [[package]] name = 
"aws-smithy-types" -version = "1.3.5" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "905cb13a9895626d49cf2ced759b062d913834c7482c38e49557eac4e6193f01" +checksum = "65f172bcb02424eb94425db8aed1b6d583b5104d4d5ddddf22402c661a320048" dependencies = [ "base64-simd", "bytes", "bytes-utils", + "futures-core", "http 0.2.12", "http 1.4.0", "http-body 0.4.6", @@ -799,6 +886,8 @@ dependencies = [ "ryu", "serde", "time", + "tokio", + "tokio-util", ] [[package]] @@ -837,7 +926,7 @@ dependencies = [ "http 1.4.0", "http-body 1.0.1", "http-body-util", - "hyper", + "hyper 1.8.1", "hyper-util", "itoa", "matchit", @@ -879,17 +968,130 @@ dependencies = [ "tracing", ] +[[package]] +name = "azure_core" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b552ad43a45a746461ec3d3a51dfb6466b4759209414b439c165eb6a6b7729e" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "dyn-clone", + "futures", + "getrandom 0.2.17", + "hmac", + "http-types", + "once_cell", + "paste", + "pin-project", + "quick-xml 0.31.0", + "rand 0.8.5", + "reqwest", + "rustc_version", + "serde", + "serde_json", + "sha2", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_identity" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ddd80344317c40c04b603807b63a5cefa532f1b43522e72f480a988141f744" +dependencies = [ + "async-lock", + "async-process", + "async-trait", + "azure_core", + "futures", + "oauth2", + "pin-project", + "serde", + "time", + "tracing", + "tz-rs", + "url", + "uuid", +] + +[[package]] +name = "azure_storage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f838159f4d29cb400a14d9d757578ba495ae64feb07a7516bf9e4415127126" +dependencies = [ + "RustyXML", + "async-lock", + "async-trait", + "azure_core", + "bytes", + "serde", + "serde_derive", + "time", + "tracing", + "url", + 
"uuid", +] + +[[package]] +name = "azure_storage_blobs" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97e83c3636ae86d9a6a7962b2112e3b19eb3903915c50ce06ff54ff0a2e6a7e4" +dependencies = [ + "RustyXML", + "azure_core", + "azure_storage", + "azure_svc_blobstorage", + "bytes", + "futures", + "serde", + "serde_derive", + "serde_json", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_svc_blobstorage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e6c6f20c5611b885ba94c7bae5e02849a267381aecb8aee577e8c35ff4064c6" +dependencies = [ + "azure_core", + "bytes", + "futures", + "log", + "once_cell", + "serde", + "serde_json", + "time", +] + [[package]] name = "backon" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" dependencies = [ - "fastrand", + "fastrand 2.3.0", "gloo-timers", "tokio", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -999,6 +1201,19 @@ dependencies = [ "generic-array", ] +[[package]] +name = "blocking" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83f8d02be6967315521be875afa792a316e28d57b5a2d401897e2a7921b7f21" +dependencies = [ + "async-channel 2.5.0", + "async-task", + "futures-io", + "futures-lite 2.6.1", + "piper", +] + [[package]] name = "bon" version = "3.8.2" @@ -1015,7 +1230,7 @@ version = "3.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "89ec27229c38ed0eb3c0feee3d2c1d6a4379ae44f418a29a658890e062d8f365" dependencies = [ - "darling", + "darling 0.23.0", "ident_case", "prettyplease", "proc-macro2", @@ -1118,9 +1333,9 @@ dependencies 
= [ [[package]] name = "cc" -version = "1.2.52" +version = "1.2.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd4932aefd12402b36c60956a4fe0035421f544799057659ff86f923657aada3" +checksum = "755d2fce177175ffca841e9a06afdb2c4ab0f593d53b4dee48147dfaade85932" dependencies = [ "find-msvc-tools", "jobserver", @@ -1154,9 +1369,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.42" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" dependencies = [ "iana-time-zone", "js-sys", @@ -1257,12 +1472,28 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "const_fn" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f8a2ca5ac02d09563609681103aada9e1777d54fc57a5acd7a41404f9c93b6e" + [[package]] name = "constant_time_eq" version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation" version = "0.10.1" @@ -1386,14 +1617,38 @@ dependencies = [ "memchr", ] +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core 0.20.11", + "darling_macro 0.20.11", +] + [[package]] name = "darling" version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" dependencies = [ - "darling_core", - "darling_macro", + "darling_core 0.23.0", + "darling_macro 0.23.0", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.114", ] [[package]] @@ -1409,13 +1664,24 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core 0.20.11", + "quote", + "syn 2.0.114", +] + [[package]] name = "darling_macro" version = "0.23.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" dependencies = [ - "darling_core", + "darling_core 0.23.0", "quote", "syn 2.0.114", ] @@ -2138,6 +2404,37 @@ dependencies = [ "serde_core", ] +[[package]] +name = "derive_builder" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947" +dependencies = [ + "derive_builder_macro", +] + +[[package]] +name = "derive_builder_core" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8" +dependencies = [ + "darling 0.20.11", + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "derive_builder_macro" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c" +dependencies = [ + "derive_builder_core", + "syn 2.0.114", +] + 
[[package]] name = "digest" version = "0.10.7" @@ -2279,6 +2576,12 @@ version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "event-listener" version = "5.4.1" @@ -2296,7 +2599,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" dependencies = [ - "event-listener", + "event-listener 5.4.1", "pin-project-lite", ] @@ -2312,6 +2615,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -2320,9 +2632,9 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "find-msvc-tools" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" +checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db" [[package]] name = "fixedbitset" @@ -2375,6 +2687,21 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = "form_urlencoded" version = "1.2.2" @@ -2471,6 +2798,34 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name = "futures-lite" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" +dependencies = [ + "fastrand 2.3.0", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + [[package]] name = "futures-macro" version = "0.3.31" @@ -2659,6 +3014,17 @@ dependencies = [ "libm", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -2668,7 +3034,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -2704,6 +3070,45 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "google-cloud-auth" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5572275b7f06b6fde8eec61a23d87c83aae362bee586bbeb8773b3f98658ae81" +dependencies = [ + "async-trait", + "base64 0.22.1", + "derive_builder", + "http 1.4.0", + "reqwest", + "rustls 0.23.36", + "rustls-pemfile", + "serde", + "serde_json", + "thiserror 2.0.18", + "time", + "tokio", +] + +[[package]] +name = "h2" +version = "0.3.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0beca50380b1fc32983fc1cb4587bfa4bb9e78fc259aad4a0032d2080309222d" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http 0.2.12", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "h2" version = "0.4.13" @@ -2883,6 +3288,26 @@ dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel 1.9.0", + "base64 0.13.1", + "futures-lite 1.13.0", + "infer", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.10.1" @@ -2901,6 +3326,30 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" +[[package]] +name = "hyper" +version = "0.14.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41dfc780fdec9373c01bae43289ea34c972e40ee3c9f6b3c8801a35f35586ce7" +dependencies = [ + "bytes", + "futures-channel", + "futures-core", + "futures-util", + "h2 0.3.27", + "http 0.2.12", + "http-body 0.4.6", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "socket2 0.5.10", + "tokio", + "tower-service", + "tracing", + "want", +] + [[package]] name = "hyper" version = "1.8.1" @@ -2911,7 +3360,7 @@ dependencies = [ "bytes", "futures-channel", 
"futures-core", - "h2", + "h2 0.4.13", "http 1.4.0", "http-body 1.0.1", "httparse", @@ -2924,22 +3373,53 @@ dependencies = [ "want", ] +[[package]] +name = "hyper-rustls" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" +dependencies = [ + "futures-util", + "http 0.2.12", + "hyper 0.14.32", + "log", + "rustls 0.21.12", + "tokio", + "tokio-rustls 0.24.1", +] + [[package]] name = "hyper-rustls" version = "0.27.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" dependencies = [ - "http 1.4.0", - "hyper", + "http 1.4.0", + "hyper 1.8.1", + "hyper-util", + "rustls 0.23.36", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls 0.26.4", + "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.8.1", "hyper-util", - "rustls", - "rustls-native-certs", - "rustls-pki-types", + "native-tls", "tokio", - "tokio-rustls", + "tokio-native-tls", "tower-service", - "webpki-roots", ] [[package]] @@ -2955,15 +3435,17 @@ dependencies = [ "futures-util", "http 1.4.0", "http-body 1.0.1", - "hyper", + "hyper 1.8.1", "ipnet", "libc", "percent-encoding", "pin-project-lite", - "socket2", + "socket2 0.6.1", + "system-configuration", "tokio", "tower-service", "tracing", + "windows-registry", ] [[package]] @@ -3160,6 +3642,12 @@ dependencies = [ "hashbrown 0.16.1", ] +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "inout" version = "0.1.4" @@ -3170,6 +3658,15 @@ 
dependencies = [ "generic-array", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -3306,9 +3803,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" dependencies = [ "once_cell", "wasm-bindgen", @@ -3614,7 +4111,7 @@ dependencies = [ "arrow-ord", "arrow-schema", "arrow-select", - "async-channel", + "async-channel 2.5.0", "async-recursion", "async-trait", "bitpacking", @@ -3774,9 +4271,18 @@ dependencies = [ "arrow-ipc", "arrow-schema", "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-sts", "axum", + "azure_core", + "azure_identity", + "azure_storage", + "azure_storage_blobs", "bytes", + "chrono", "futures", + "google-cloud-auth", "lance", "lance-core", "lance-index", @@ -3789,6 +4295,7 @@ dependencies = [ "serde", "serde_json", "snafu", + "time", "tokio", "tower", "tower-http 0.5.2", @@ -3797,9 +4304,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b748e89a3a0e5d9fb1b51e4382f783f8aa6b620d755012d4856180968014e619" +checksum = "00a21b43fe2a373896727b97927adedd2683d2907683f294f62cf8815fbf6a01" dependencies = [ "reqwest", "serde", @@ -4146,7 +4653,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "windows-sys 0.61.2", ] @@ -4167,7 +4674,7 @@ 
dependencies = [ "crossbeam-epoch", "crossbeam-utils", "equivalent", - "event-listener", + "event-listener 5.4.1", "futures-util", "parking_lot", "portable-atomic", @@ -4188,6 +4695,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe 0.1.6", + "openssl-sys", + "schannel", + "security-framework 2.11.1", + "security-framework-sys", + "tempfile", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -4359,6 +4883,34 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + +[[package]] +name = "oauth2" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c38841cdd844847e3e7c8d29cef9dcfed8877f8f56f9071f77843ecf3baf937f" +dependencies = [ + "base64 0.13.1", + "chrono", + "getrandom 0.2.17", + "http 0.2.12", + "rand 0.8.5", + "serde", + "serde_json", + "serde_path_to_error", + "sha2", + "thiserror 1.0.69", + "url", +] + [[package]] name = "object" version = "0.32.2" @@ -4370,9 +4922,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" dependencies = [ "async-trait", "base64 0.22.1", @@ -4384,7 +4936,7 @@ dependencies = [ "http-body-util", "httparse", "humantime", - "hyper", + "hyper 
1.8.1", "itertools 0.14.0", "md-5", "parking_lot", @@ -4397,7 +4949,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "url", @@ -4470,12 +5022,56 @@ dependencies = [ "uuid", ] +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags", + "cfg-if", + "foreign-types", + "libc", + "once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + [[package]] name = "openssl-probe" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f50d9b3dabb09ecd771ad0aa242ca6894994c130308ca3d7684634df8037391" +[[package]] +name = "openssl-sys" +version = "0.9.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -4759,6 +5355,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piper" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" +dependencies = [ + "atomic-waker", + "fastrand 
2.3.0", + "futures-io", +] + [[package]] name = "pkcs1" version = "0.7.5" @@ -4803,6 +5410,20 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] +name = "polling" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" +dependencies = [ + "cfg-if", + "concurrent-queue", + "hermit-abi", + "pin-project-lite", + "rustix 1.1.3", + "windows-sys 0.61.2", +] + [[package]] name = "portable-atomic" version = "1.13.0" @@ -4932,6 +5553,16 @@ dependencies = [ "cc", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quick-xml" version = "0.37.5" @@ -4964,9 +5595,9 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls", - "socket2", - "thiserror 2.0.17", + "rustls 0.23.36", + "socket2 0.6.1", + "thiserror 2.0.18", "tokio", "tracing", "web-time", @@ -4984,10 +5615,10 @@ dependencies = [ "rand 0.9.2", "ring", "rustc-hash", - "rustls", + "rustls 0.23.36", "rustls-pki-types", "slab", - "thiserror 2.0.17", + "thiserror 2.0.18", "tinyvec", "tracing", "web-time", @@ -5002,7 +5633,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2", + "socket2 0.6.1", "tracing", "windows-sys 0.60.2", ] @@ -5028,6 +5659,19 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + 
"rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -5049,6 +5693,16 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -5069,6 +5723,15 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -5107,6 +5770,15 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xoshiro" version = "0.7.0" @@ -5198,7 +5870,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.17", "libredox", - "thiserror 2.0.17", + "thiserror 2.0.18", ] [[package]] @@ -5289,21 +5961,23 @@ dependencies = [ "encoding_rs", "futures-core", "futures-util", - "h2", + "h2 0.4.13", "http 1.4.0", "http-body 1.0.1", "http-body-util", - "hyper", - "hyper-rustls", + "hyper 1.8.1", + "hyper-rustls 0.27.7", + "hyper-tls", "hyper-util", "js-sys", "log", "mime", "mime_guess", + "native-tls", "percent-encoding", "pin-project-lite", "quinn", - "rustls", + "rustls 0.23.36", "rustls-native-certs", "rustls-pki-types", "serde", @@ -5311,7 +5985,8 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", - "tokio-rustls", + "tokio-native-tls", + "tokio-rustls 0.26.4", "tokio-util", 
"tower", "tower-http 0.6.8", @@ -5447,6 +6122,18 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "rustls" +version = "0.21.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" +dependencies = [ + "log", + "ring", + "rustls-webpki 0.101.7", + "sct", +] + [[package]] name = "rustls" version = "0.23.36" @@ -5454,10 +6141,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ "aws-lc-rs", + "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki", + "rustls-webpki 0.103.9", "subtle", "zeroize", ] @@ -5468,10 +6156,10 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" dependencies = [ - "openssl-probe", + "openssl-probe 0.2.0", "rustls-pki-types", "schannel", - "security-framework", + "security-framework 3.5.1", ] [[package]] @@ -5485,9 +6173,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ "web-time", "zeroize", @@ -5495,9 +6183,19 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.8" +version = "0.101.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" +checksum = 
"d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ "aws-lc-rs", "ring", @@ -5591,6 +6289,29 @@ dependencies = [ "sha2", ] +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + [[package]] name = "security-framework" version = "3.5.1" @@ -5598,7 +6319,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ "bitflags", - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -5695,6 +6416,17 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror 1.0.69", +] + [[package]] name = "serde_repr" version = "0.1.20" @@ -5829,7 +6561,7 @@ checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" dependencies = [ "num-bigint", "num-traits", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", ] @@ -5887,6 +6619,16 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b6b67fb9a61334225b5b790716f609cd58395f895b3fe8b328786812a40bc3b" +[[package]] +name = "socket2" +version = "0.5.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678" +dependencies = [ + "libc", + "windows-sys 0.52.0", +] + [[package]] name = "socket2" version = "0.6.1" @@ -6079,6 +6821,27 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -6131,7 +6894,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "uuid", "winapi", @@ -6243,7 +7006,7 @@ version = "3.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" dependencies = [ - "fastrand", + "fastrand 2.3.0", "getrandom 0.3.4", "once_cell", "rustix 1.1.3", @@ -6261,11 +7024,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.17", + "thiserror-impl 2.0.18", ] [[package]] @@ -6281,9 +7044,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = 
"ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -6327,7 +7090,10 @@ checksum = "f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd" dependencies = [ "deranged", "itoa", + "js-sys", + "libc", "num-conv", + "num_threads", "powerfmt", "serde_core", "time-core", @@ -6396,7 +7162,7 @@ dependencies = [ "parking_lot", "pin-project-lite", "signal-hook-registry", - "socket2", + "socket2 0.6.1", "tokio-macros", "windows-sys 0.61.2", ] @@ -6412,13 +7178,33 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" +dependencies = [ + "rustls 0.21.12", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" dependencies = [ - "rustls", + "rustls 0.23.36", "tokio", ] @@ -6653,7 +7439,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.114", - "thiserror 2.0.17", + "thiserror 2.0.18", "unicode-ident", ] @@ -6674,6 +7460,15 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "tz-rs" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4" +dependencies = [ + "const_fn", +] + [[package]] name = "unicase" version = "2.9.0" @@ -6720,6 +7515,7 @@ dependencies = [ "idna", "percent-encoding", "serde", + "serde_derive", ] [[package]] @@ -6764,6 +7560,12 @@ version = "0.1.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -6776,6 +7578,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "waker-fn" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" + [[package]] name = "walkdir" version = "2.5.0" @@ -6795,6 +7603,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -6803,18 +7617,18 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" dependencies = [ "cfg-if", "once_cell", @@ -6825,11 +7639,12 @@ dependencies = [ [[package]] name = 
"wasm-bindgen-futures" -version = "0.4.56" +version = "0.4.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" dependencies = [ "cfg-if", + "futures-util", "js-sys", "once_cell", "wasm-bindgen", @@ -6838,9 +7653,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -6848,9 +7663,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" dependencies = [ "bumpalo", "proc-macro2", @@ -6861,9 +7676,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" dependencies = [ "unicode-ident", ] @@ -6883,9 +7698,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" dependencies = [ "js-sys", "wasm-bindgen", @@ -6982,6 +7797,17 @@ version = "0.2.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.4.1" @@ -7242,9 +8068,9 @@ dependencies = [ [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" [[package]] name = "wkb" @@ -7418,9 +8244,9 @@ checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" [[package]] name = "zmij" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd8f3f50b848df28f887acb68e41201b5aea6bc8a8dacc00fb40635ff9a72fea" +checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65" [[package]] name = "zstd" diff --git a/java/lance-jni/Cargo.toml b/java/lance-jni/Cargo.toml index 132333a1ec1..8d426cb3e84 100644 --- a/java/lance-jni/Cargo.toml +++ b/java/lance-jni/Cargo.toml @@ -12,6 +12,13 @@ description = "JNI bindings for Lance Columnar format" [lib] crate-type = ["cdylib"] +[features] +default = [] +# Credential vending features for DirectoryNamespace +credential-vendor-aws = ["lance-namespace-impls/credential-vendor-aws"] +credential-vendor-gcp = ["lance-namespace-impls/credential-vendor-gcp"] +credential-vendor-azure = ["lance-namespace-impls/credential-vendor-azure"] + [dependencies] lance = { path = "../../rust/lance", features = ["substrait"] } lance-datafusion = { path = "../../rust/lance-datafusion" } @@ -20,7 +27,7 @@ lance-linalg = 
{ path = "../../rust/lance-linalg" } lance-index = { path = "../../rust/lance-index" } lance-io = { path = "../../rust/lance-io" } lance-namespace = { path = "../../rust/lance-namespace" } -lance-namespace-impls = { path = "../../rust/lance-namespace-impls", features = ["rest", "rest-adapter"] } +lance-namespace-impls = { path = "../../rust/lance-namespace-impls", features = ["rest", "rest-adapter", "credential-vendor-aws", "credential-vendor-gcp", "credential-vendor-azure"] } lance-core = { path = "../../rust/lance-core" } lance-file = { path = "../../rust/lance-file" } arrow = { version = "56.1", features = ["ffi"] } diff --git a/java/src/main/java/org/lance/namespace/DirectoryNamespace.java b/java/src/main/java/org/lance/namespace/DirectoryNamespace.java index 19de6d0a4bf..2d13db69694 100644 --- a/java/src/main/java/org/lance/namespace/DirectoryNamespace.java +++ b/java/src/main/java/org/lance/namespace/DirectoryNamespace.java @@ -51,6 +51,43 @@ * for S3, storage.account_name=myaccount for Azure) * * + *

Credential vending properties (requires credential-vendor-* features to be enabled): + * + *

When credential vendor properties are configured, describeTable() will return vended temporary + * credentials. The vendor type is auto-selected based on the table location URI: s3:// for AWS, + * gs:// for GCP, az:// for Azure. + * + *

    + *
  • Common properties: + *
      + *
    • credential_vendor.enabled (required): Set to "true" to enable credential vending + *
    • credential_vendor.permission (optional): read, write, or admin (default: read) + *
    + *
  • AWS-specific properties (for s3:// locations): + *
      + *
    • credential_vendor.aws_role_arn (required): IAM role ARN to assume + *
    • credential_vendor.aws_external_id (optional): External ID for assume role + *
    • credential_vendor.aws_region (optional): AWS region + *
    • credential_vendor.aws_role_session_name (optional): Role session name + *
    • credential_vendor.aws_duration_millis (optional): Duration in ms (default: 3600000, + * range: 15min-12hrs) + *
    + *
  • GCP-specific properties (for gs:// locations): + *
      + *
    • credential_vendor.gcp_service_account (optional): Service account to impersonate + *
    • Note: GCP uses Application Default Credentials (ADC). To use a service account key + * file, set the GOOGLE_APPLICATION_CREDENTIALS environment variable before starting. + *
    • Note: GCP token duration cannot be configured; it's determined by the STS endpoint + *
    + *
  • Azure-specific properties (for az:// locations): + *
      + *
    • credential_vendor.azure_account_name (required): Azure storage account name + *
    • credential_vendor.azure_tenant_id (optional): Azure tenant ID + *
    • credential_vendor.azure_duration_millis (optional): Duration in ms (default: 3600000, + * up to 7 days) + *
    + *
+ * *

Example usage (local filesystem): * *

{@code
@@ -81,6 +118,21 @@
  * // Use namespace...
  * namespace.close();
  * }
+ * + *

Example usage (AWS S3 with credential vending): + * + *

{@code
+ * Map properties = new HashMap<>();
+ * properties.put("root", "s3://my-bucket/lance-data");
+ * properties.put("credential_vendor.enabled", "true");
+ * properties.put("credential_vendor.aws_role_arn", "arn:aws:iam::123456789012:role/MyRole");
+ * properties.put("credential_vendor.aws_duration_millis", "3600000");  // 1 hour
+ *
+ * DirectoryNamespace namespace = new DirectoryNamespace();
+ * namespace.initialize(properties, allocator);
+ * // describeTable() will now return vended credentials (AWS vendor auto-selected from s3:// URI)
+ * namespace.close();
+ * }
*/ public class DirectoryNamespace implements LanceNamespace, Closeable { static { diff --git a/python/Cargo.lock b/python/Cargo.lock index 99a6979b8ed..4e8ef92fde8 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. version = 3 +[[package]] +name = "RustyXML" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b5ace29ee3216de37c0546865ad08edef58b0f9e76838ed8959a84a990e58c5" + [[package]] name = "abi_stable" version = "0.11.3" @@ -463,6 +469,17 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener 2.5.3", + "futures-core", +] + [[package]] name = "async-channel" version = "2.5.0" @@ -501,17 +518,53 @@ dependencies = [ "abi_stable", ] +[[package]] +name = "async-io" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "456b8a8feb6f42d237746d4b3e9a178494627745c3c56c6ea55d92ba50d026fc" +dependencies = [ + "autocfg", + "cfg-if", + "concurrent-queue", + "futures-io", + "futures-lite 2.6.1", + "parking", + "polling", + "rustix 1.1.3", + "slab", + "windows-sys 0.61.2", +] + [[package]] name = "async-lock" version = "3.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311" dependencies = [ - "event-listener", + "event-listener 5.4.1", "event-listener-strategy", "pin-project-lite", ] +[[package]] +name = "async-process" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc50921ec0055cdd8a16de48773bfeec5c972598674347252c0399676be7da75" +dependencies = [ + "async-channel 2.5.0", + "async-io", + "async-lock", + "async-signal", + "async-task", + 
"blocking", + "cfg-if", + "event-listener 5.4.1", + "futures-lite 2.6.1", + "rustix 1.1.3", +] + [[package]] name = "async-recursion" version = "1.1.1" @@ -523,6 +576,30 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "async-signal" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43c070bbf59cd3570b6b2dd54cd772527c7c3620fce8be898406dd3ed6adc64c" +dependencies = [ + "async-io", + "async-lock", + "atomic-waker", + "cfg-if", + "futures-core", + "futures-io", + "rustix 1.1.3", + "signal-hook-registry", + "slab", + "windows-sys 0.61.2", +] + +[[package]] +name = "async-task" +version = "4.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" + [[package]] name = "async-trait" version = "0.1.89" @@ -583,7 +660,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "hex", "http 1.4.0", "ring", @@ -608,9 +685,9 @@ dependencies = [ [[package]] name = "aws-lc-rs" -version = "1.15.2" +version = "1.15.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6a88aab2464f1f25453baa7a07c84c5b7684e274054ba06817f382357f77a288" +checksum = "e84ce723ab67259cfeb9877c6a639ee9eb7a27b28123abd71db7f0d5d0cc9d86" dependencies = [ "aws-lc-sys", "zeroize", @@ -618,9 +695,9 @@ dependencies = [ [[package]] name = "aws-lc-sys" -version = "0.35.0" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b45afffdee1e7c9126814751f88dddc747f41d91da16c9551a0f1e8a11e788a1" +checksum = "43a442ece363113bd4bd4c8b18977a7798dd4d3c3383f34fb61936960e8f4ad8" dependencies = [ "cc", "cmake", @@ -630,9 +707,9 @@ dependencies = [ [[package]] name = "aws-runtime" -version = "1.5.17" +version = "1.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d81b5b2898f6798ad58f484856768bca817e3cd9de0974c24ae0f1113fe88f1b" +checksum = 
"959dab27ce613e6c9658eb3621064d0e2027e5f2acb65bc526a43577facea557" dependencies = [ "aws-credential-types", "aws-sigv4", @@ -643,7 +720,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http-body 0.4.6", "percent-encoding", @@ -654,21 +731,22 @@ dependencies = [ [[package]] name = "aws-sdk-dynamodb" -version = "1.101.0" +version = "1.102.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6f98cd9e5f2fc790aff1f393bc3c8680deea31c05d3c6f23b625cdc50b1b6b4" +checksum = "f5f7e6a53cf5ee8b7041c73106d9a93480b47f8b955466262b043aab0b5bf489" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -676,21 +754,22 @@ dependencies = [ [[package]] name = "aws-sdk-sso" -version = "1.91.0" +version = "1.92.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ee6402a36f27b52fe67661c6732d684b2635152b676aa2babbfb5204f99115d" +checksum = "b7d63bd2bdeeb49aa3f9b00c15e18583503b778b2e792fc06284d54e7d5b6566" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -698,21 +777,22 @@ dependencies = [ [[package]] name = "aws-sdk-ssooidc" -version = "1.93.0" +version = "1.94.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a45a7f750bbd170ee3677671ad782d90b894548f4e4ae168302c57ec9de5cb3e" +checksum = "532d93574bf731f311bafb761366f9ece345a0416dbcc273d81d6d1a1205239b" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", 
"aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -720,22 +800,23 @@ dependencies = [ [[package]] name = "aws-sdk-sts" -version = "1.95.0" +version = "1.96.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55542378e419558e6b1f398ca70adb0b2088077e79ad9f14eb09441f2f7b2164" +checksum = "357e9a029c7524db6a0099cd77fbd5da165540339e7296cca603531bc783b56c" dependencies = [ "aws-credential-types", "aws-runtime", "aws-smithy-async", "aws-smithy-http", "aws-smithy-json", + "aws-smithy-observability", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", "aws-smithy-xml", "aws-types", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "regex-lite", "tracing", @@ -836,9 +917,9 @@ dependencies = [ [[package]] name = "aws-smithy-observability" -version = "0.1.5" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "17f616c3f2260612fe44cede278bafa18e73e6479c4e393e2c4518cf2a9a228a" +checksum = "ef1fcbefc7ece1d70dcce29e490f269695dfca2d2bacdeaf9e5c3f799e4e6a42" dependencies = [ "aws-smithy-runtime-api", ] @@ -855,9 +936,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime" -version = "1.9.5" +version = "1.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a392db6c583ea4a912538afb86b7be7c5d8887d91604f50eb55c262ee1b4a5f5" +checksum = "bb5b6167fcdf47399024e81ac08e795180c576a20e4d4ce67949f9a88ae37dc1" dependencies = [ "aws-smithy-async", "aws-smithy-http", @@ -866,7 +947,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand", + "fastrand 2.3.0", "http 0.2.12", "http 1.4.0", "http-body 0.4.6", @@ -879,9 +960,9 @@ dependencies = [ [[package]] name = "aws-smithy-runtime-api" -version = "1.9.3" +version = "1.10.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab0d43d899f9e508300e587bf582ba54c27a452dd0a9ea294690669138ae14a2" +checksum = "efce7aaaf59ad53c5412f14fc19b2d5c6ab2c3ec688d272fd31f76ec12f44fb0" dependencies = [ "aws-smithy-async", "aws-smithy-types", @@ -896,9 +977,9 @@ dependencies = [ [[package]] name = "aws-smithy-types" -version = "1.3.5" +version = "1.3.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "905cb13a9895626d49cf2ced759b062d913834c7482c38e49557eac4e6193f01" +checksum = "65f172bcb02424eb94425db8aed1b6d583b5104d4d5ddddf22402c661a320048" dependencies = [ "base64-simd", "bytes", @@ -998,17 +1079,130 @@ dependencies = [ "tracing", ] +[[package]] +name = "azure_core" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b552ad43a45a746461ec3d3a51dfb6466b4759209414b439c165eb6a6b7729e" +dependencies = [ + "async-trait", + "base64 0.22.1", + "bytes", + "dyn-clone", + "futures", + "getrandom 0.2.17", + "hmac", + "http-types", + "once_cell", + "paste", + "pin-project", + "quick-xml 0.31.0", + "rand 0.8.5", + "reqwest", + "rustc_version", + "serde", + "serde_json", + "sha2", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_identity" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ddd80344317c40c04b603807b63a5cefa532f1b43522e72f480a988141f744" +dependencies = [ + "async-lock", + "async-process", + "async-trait", + "azure_core", + "futures", + "oauth2", + "pin-project", + "serde", + "time", + "tracing", + "tz-rs", + "url", + "uuid", +] + +[[package]] +name = "azure_storage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59f838159f4d29cb400a14d9d757578ba495ae64feb07a7516bf9e4415127126" +dependencies = [ + "RustyXML", + "async-lock", + "async-trait", + "azure_core", + "bytes", + "serde", + "serde_derive", + "time", + "tracing", + "url", + 
"uuid", +] + +[[package]] +name = "azure_storage_blobs" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97e83c3636ae86d9a6a7962b2112e3b19eb3903915c50ce06ff54ff0a2e6a7e4" +dependencies = [ + "RustyXML", + "azure_core", + "azure_storage", + "azure_svc_blobstorage", + "bytes", + "futures", + "serde", + "serde_derive", + "serde_json", + "time", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "azure_svc_blobstorage" +version = "0.21.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e6c6f20c5611b885ba94c7bae5e02849a267381aecb8aee577e8c35ff4064c6" +dependencies = [ + "azure_core", + "bytes", + "futures", + "log", + "once_cell", + "serde", + "serde_json", + "time", +] + [[package]] name = "backon" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" dependencies = [ - "fastrand", + "fastrand 2.3.0", "gloo-timers", "tokio", ] +[[package]] +name = "base64" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" + [[package]] name = "base64" version = "0.21.7" @@ -1144,6 +1338,19 @@ dependencies = [ "generic-array", ] +[[package]] +name = "blocking" +version = "1.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e83f8d02be6967315521be875afa792a316e28d57b5a2d401897e2a7921b7f21" +dependencies = [ + "async-channel 2.5.0", + "async-task", + "futures-io", + "futures-lite 2.6.1", + "piper", +] + [[package]] name = "bon" version = "3.8.2" @@ -1263,9 +1470,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.52" +version = "1.2.53" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd4932aefd12402b36c60956a4fe0035421f544799057659ff86f923657aada3" +checksum = 
"755d2fce177175ffca841e9a06afdb2c4ab0f593d53b4dee48147dfaade85932" dependencies = [ "find-msvc-tools", "jobserver", @@ -1302,9 +1509,9 @@ checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" [[package]] name = "chrono" -version = "0.4.42" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "145052bdd345b87320e369255277e3fb5152762ad123a901ef5c262dd38fe8d2" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" dependencies = [ "iana-time-zone", "js-sys", @@ -1395,6 +1602,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "const_fn" +version = "0.4.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f8a2ca5ac02d09563609681103aada9e1777d54fc57a5acd7a41404f9c93b6e" + [[package]] name = "const_panic" version = "0.2.15" @@ -1410,6 +1623,16 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation" version = "0.10.1" @@ -2644,6 +2867,12 @@ version = "1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "event-listener" version = "5.4.1" @@ -2661,7 +2890,7 @@ version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" dependencies 
= [ - "event-listener", + "event-listener 5.4.1", "pin-project-lite", ] @@ -2677,6 +2906,15 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" +[[package]] +name = "fastrand" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51093e27b0797c359783294ca4f0a911c270184cb10f85783b118614a1501be" +dependencies = [ + "instant", +] + [[package]] name = "fastrand" version = "2.3.0" @@ -2685,21 +2923,20 @@ checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" [[package]] name = "filetime" -version = "0.2.26" +version = "0.2.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc0505cd1b6fa6580283f6bdf70a73fcf4aba1184038c90902b92b3dd0df63ed" +checksum = "f98844151eee8917efc50bd9e8318cb963ae8b297431495d3f758616ea5c57db" dependencies = [ "cfg-if", "libc", "libredox", - "windows-sys 0.60.2", ] [[package]] name = "find-msvc-tools" -version = "0.1.7" +version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f449e6c6c08c865631d4890cfacf252b3d396c9bcc83adb6623cdb02a8336c41" +checksum = "8591b0bcc8a98a64310a2fae1bb3e9b8564dd10e381e6e28010fde8e8e8568db" [[package]] name = "fixedbitset" @@ -2752,6 +2989,21 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" +[[package]] +name = "foreign-types" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6f339eb8adc052cd2ca78910fda869aefa38d22d5cb648e6485e4d3fc06f3b1" +dependencies = [ + "foreign-types-shared", +] + +[[package]] +name = "foreign-types-shared" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "00b0228411908ca8685dba7fc2cdd70ec9990a6e753e89b6ac91a84c40fbaf4b" + [[package]] name = 
"form_urlencoded" version = "1.2.2" @@ -2848,6 +3100,34 @@ version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" +[[package]] +name = "futures-lite" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49a9d51ce47660b1e808d3c990b4709f2f415d928835a17dfd16991515c46bce" +dependencies = [ + "fastrand 1.9.0", + "futures-core", + "futures-io", + "memchr", + "parking", + "pin-project-lite", + "waker-fn", +] + +[[package]] +name = "futures-lite" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f78e10609fe0e0b3f4157ffab1876319b5b0db102a2c60dc4626306dc46b44ad" +dependencies = [ + "fastrand 2.3.0", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + [[package]] name = "futures-macro" version = "0.3.31" @@ -3045,6 +3325,17 @@ dependencies = [ "libm", ] +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -3054,7 +3345,7 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "wasm-bindgen", ] @@ -3090,6 +3381,26 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "google-cloud-auth" +version = "0.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5572275b7f06b6fde8eec61a23d87c83aae362bee586bbeb8773b3f98658ae81" +dependencies = [ + "async-trait", + "base64 0.22.1", + "derive_builder", + "http 1.4.0", + "reqwest", + "rustls 0.23.36", + "rustls-pemfile", + "serde", + "serde_json", + "thiserror 2.0.18", + "time", + "tokio", +] + [[package]] name = "h2" version = "0.3.27" @@ -3288,6 +3599,26 @@ 
dependencies = [ "pin-project-lite", ] +[[package]] +name = "http-types" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e9b187a72d63adbfba487f48095306ac823049cb504ee195541e91c7775f5ad" +dependencies = [ + "anyhow", + "async-channel 1.9.0", + "base64 0.13.1", + "futures-lite 1.13.0", + "infer", + "pin-project-lite", + "rand 0.7.3", + "serde", + "serde_json", + "serde_qs", + "serde_urlencoded", + "url", +] + [[package]] name = "httparse" version = "1.10.1" @@ -3386,6 +3717,22 @@ dependencies = [ "webpki-roots", ] +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper 1.8.1", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", +] + [[package]] name = "hyper-util" version = "0.1.19" @@ -3405,9 +3752,11 @@ dependencies = [ "percent-encoding", "pin-project-lite", "socket2 0.6.1", + "system-configuration", "tokio", "tower-service", "tracing", + "windows-registry", ] [[package]] @@ -3650,6 +3999,12 @@ dependencies = [ "rustversion", ] +[[package]] +name = "infer" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64e9829a50b42bb782c1df523f78d332fe371b10c661e78b7a3c34b0198e9fac" + [[package]] name = "inout" version = "0.1.4" @@ -3660,6 +4015,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -3797,9 +4161,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"464a3709c7f55f1f721e5389aa6ea4e3bc6aba669353300af094b29ffbdde1d8" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" dependencies = [ "once_cell", "wasm-bindgen", @@ -4115,7 +4479,7 @@ dependencies = [ "arrow-ord", "arrow-schema", "arrow-select", - "async-channel", + "async-channel 2.5.0", "async-recursion", "async-trait", "bitpacking", @@ -4245,9 +4609,18 @@ dependencies = [ "arrow-ipc", "arrow-schema", "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-sts", "axum", + "azure_core", + "azure_identity", + "azure_storage", + "azure_storage_blobs", "bytes", + "chrono", "futures", + "google-cloud-auth", "lance", "lance-core", "lance-index", @@ -4260,6 +4633,7 @@ dependencies = [ "serde", "serde_json", "snafu", + "time", "tokio", "tower", "tower-http 0.5.2", @@ -4268,9 +4642,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.3.1" +version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b748e89a3a0e5d9fb1b51e4382f783f8aa6b620d755012d4856180968014e619" +checksum = "00a21b43fe2a373896727b97927adedd2683d2907683f294f62cf8815fbf6a01" dependencies = [ "reqwest", "serde", @@ -4520,7 +4894,7 @@ dependencies = [ "reqwest", "serde", "tar", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "yada", ] @@ -4804,7 +5178,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" dependencies = [ "libc", - "wasi", + "wasi 0.11.1+wasi-snapshot-preview1", "windows-sys 0.61.2", ] @@ -4825,7 +5199,7 @@ dependencies = [ "crossbeam-epoch", "crossbeam-utils", "equivalent", - "event-listener", + "event-listener 5.4.1", "futures-util", "parking_lot", "portable-atomic", @@ -4846,6 +5220,23 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" +[[package]] +name = "native-tls" 
+version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87de3442987e9dbec73158d5c715e7ad9072fda936bb03d19d7fa10e00520f0e" +dependencies = [ + "libc", + "log", + "openssl", + "openssl-probe 0.1.6", + "openssl-sys", + "schannel", + "security-framework 2.11.1", + "security-framework-sys", + "tempfile", +] + [[package]] name = "ndarray" version = "0.16.1" @@ -5017,6 +5408,34 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "num_threads" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c7398b9c8b70908f6371f47ed36737907c87c52af34c268fed0bf0ceb92ead9" +dependencies = [ + "libc", +] + +[[package]] +name = "oauth2" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c38841cdd844847e3e7c8d29cef9dcfed8877f8f56f9071f77843ecf3baf937f" +dependencies = [ + "base64 0.13.1", + "chrono", + "getrandom 0.2.17", + "http 0.2.12", + "rand 0.8.5", + "serde", + "serde_json", + "serde_path_to_error", + "sha2", + "thiserror 1.0.69", + "url", +] + [[package]] name = "object" version = "0.32.2" @@ -5028,9 +5447,9 @@ dependencies = [ [[package]] name = "object_store" -version = "0.12.4" +version = "0.12.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c1be0c6c22ec0817cdc77d3842f721a17fd30ab6965001415b5402a74e6b740" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" dependencies = [ "async-trait", "base64 0.22.1", @@ -5055,7 +5474,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "url", @@ -5128,12 +5547,56 @@ dependencies = [ "uuid", ] +[[package]] +name = "openssl" +version = "0.10.75" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08838db121398ad17ab8531ce9de97b244589089e290a384c900cb9ff7434328" +dependencies = [ + "bitflags 2.10.0", + "cfg-if", + "foreign-types", + "libc", + 
"once_cell", + "openssl-macros", + "openssl-sys", +] + +[[package]] +name = "openssl-macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.114", +] + +[[package]] +name = "openssl-probe" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" + [[package]] name = "openssl-probe" version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f50d9b3dabb09ecd771ad0aa242ca6894994c130308ca3d7684634df8037391" +[[package]] +name = "openssl-sys" +version = "0.9.111" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82cab2d520aa75e3c58898289429321eb788c3106963d0dc886ec7a5f4adc321" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "option-ext" version = "0.2.0" @@ -5402,7 +5865,7 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" dependencies = [ - "fastrand", + "fastrand 2.3.0", "phf_shared 0.13.1", ] @@ -5456,6 +5919,17 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "piper" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c8c490f422ef9a4efd2cb5b42b76c8613d7e7dfc1caf667b8a3350a5acc066" +dependencies = [ + "atomic-waker", + "fastrand 2.3.0", + "futures-io", +] + [[package]] name = "pkcs1" version = "0.7.5" @@ -5500,6 +5974,20 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" +[[package]] 
+name = "polling" +version = "3.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d0e4f59085d47d8241c88ead0f274e8a0cb551f3625263c05eb8dd897c34218" +dependencies = [ + "cfg-if", + "concurrent-queue", + "hermit-abi", + "pin-project-lite", + "rustix 1.1.3", + "windows-sys 0.61.2", +] + [[package]] name = "portable-atomic" version = "1.13.0" @@ -5777,6 +6265,16 @@ dependencies = [ "serde", ] +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "quick-xml" version = "0.37.5" @@ -5811,7 +6309,7 @@ dependencies = [ "rustc-hash", "rustls 0.23.36", "socket2 0.6.1", - "thiserror 2.0.17", + "thiserror 2.0.18", "tokio", "tracing", "web-time", @@ -5832,7 +6330,7 @@ dependencies = [ "rustls 0.23.36", "rustls-pki-types", "slab", - "thiserror 2.0.17", + "thiserror 2.0.18", "tinyvec", "tracing", "web-time", @@ -5873,6 +6371,19 @@ version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", +] + [[package]] name = "rand" version = "0.8.5" @@ -5894,6 +6405,16 @@ dependencies = [ "rand_core 0.9.5", ] +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + [[package]] name = "rand_chacha" version = "0.3.1" @@ -5914,6 +6435,15 @@ dependencies = [ 
"rand_core 0.9.5", ] +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + [[package]] name = "rand_core" version = "0.6.4" @@ -5952,6 +6482,15 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + [[package]] name = "rand_xoshiro" version = "0.7.0" @@ -6052,7 +6591,7 @@ checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" dependencies = [ "getrandom 0.2.17", "libredox", - "thiserror 2.0.17", + "thiserror 2.0.18", ] [[package]] @@ -6158,11 +6697,13 @@ dependencies = [ "http-body-util", "hyper 1.8.1", "hyper-rustls 0.27.7", + "hyper-tls", "hyper-util", "js-sys", "log", "mime", "mime_guess", + "native-tls", "percent-encoding", "pin-project-lite", "quinn", @@ -6174,6 +6715,7 @@ dependencies = [ "serde_urlencoded", "sync_wrapper", "tokio", + "tokio-native-tls", "tokio-rustls 0.26.4", "tokio-util", "tower", @@ -6335,10 +6877,11 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" dependencies = [ "aws-lc-rs", + "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.103.8", + "rustls-webpki 0.103.9", "subtle", "zeroize", ] @@ -6349,10 +6892,10 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" dependencies = [ - "openssl-probe", + "openssl-probe 0.2.0", "rustls-pki-types", "schannel", - "security-framework", + "security-framework 3.5.1", ] [[package]] @@ -6366,9 +6909,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" 
-version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "21e6f2ab2928ca4291b86736a8bd920a277a399bba1589409d72154ff87c1282" +checksum = "be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" dependencies = [ "web-time", "zeroize", @@ -6386,9 +6929,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.8" +version = "0.103.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2ffdfa2f5286e2247234e03f680868ac2815974dc39e00ea15adc445d0aafe52" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" dependencies = [ "aws-lc-rs", "ring", @@ -6492,6 +7035,19 @@ dependencies = [ "untrusted", ] +[[package]] +name = "security-framework" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "897b2245f0b511c87893af39b033e5ca9cce68824c4d7e7630b5a1d339658d02" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.9.4", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + [[package]] name = "security-framework" version = "3.5.1" @@ -6499,7 +7055,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3297343eaf830f66ede390ea39da1d462b6b0c1b000f420d0a83f898bbbe6ef" dependencies = [ "bitflags 2.10.0", - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -6596,6 +7152,17 @@ dependencies = [ "serde_core", ] +[[package]] +name = "serde_qs" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7715380eec75f029a4ef7de39a9200e0a63823176b759d055b613f5a87df6a6" +dependencies = [ + "percent-encoding", + "serde", + "thiserror 1.0.69", +] + [[package]] name = "serde_repr" version = "0.1.20" @@ -6730,7 +7297,7 @@ checksum = "297f631f50729c8c99b84667867963997ec0b50f32b2a7dbcab828ef0541e8bb" dependencies = [ "num-bigint", "num-traits", - "thiserror 2.0.17", + 
"thiserror 2.0.18", "time", ] @@ -7011,6 +7578,27 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "system-configuration" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c879d448e9d986b661742763247d3693ed13609438cf3d006f51f5368a5ba6b" +dependencies = [ + "bitflags 2.10.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "tagptr" version = "0.2.0" @@ -7063,7 +7651,7 @@ dependencies = [ "tantivy-stacker", "tantivy-tokenizer-api", "tempfile", - "thiserror 2.0.17", + "thiserror 2.0.18", "time", "uuid", "winapi", @@ -7192,7 +7780,7 @@ version = "3.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "655da9c7eb6305c55742045d5a8d2037996d61d8de95806335c7c86ce0f82e9c" dependencies = [ - "fastrand", + "fastrand 2.3.0", "getrandom 0.3.4", "once_cell", "rustix 1.1.3", @@ -7210,11 +7798,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f63587ca0f12b72a0600bcba1d40081f830876000bb46dd2337a3051618f4fc8" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" dependencies = [ - "thiserror-impl 2.0.17", + "thiserror-impl 2.0.18", ] [[package]] @@ -7230,9 +7818,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.17" +version = "2.0.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ff15c8ecd7de3849db632e14d18d2571fa09dfc5ed93479bc4485c7a517c913" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" dependencies = [ "proc-macro2", "quote", @@ -7276,7 +7864,10 @@ checksum = 
"f9e442fc33d7fdb45aa9bfeb312c095964abdf596f7567261062b2a7107aaabd" dependencies = [ "deranged", "itoa", + "js-sys", + "libc", "num-conv", + "num_threads", "powerfmt", "serde_core", "time-core", @@ -7361,6 +7952,16 @@ dependencies = [ "syn 2.0.114", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -7650,7 +8251,7 @@ dependencies = [ "serde", "serde_json", "syn 2.0.114", - "thiserror 2.0.17", + "thiserror 2.0.18", "unicode-ident", ] @@ -7671,6 +8272,15 @@ dependencies = [ "typify-impl", ] +[[package]] +name = "tz-rs" +version = "0.6.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33851b15c848fad2cf4b105c6bb66eb9512b6f6c44a4b13f57c53c73c707e2b4" +dependencies = [ + "const_fn", +] + [[package]] name = "unicase" version = "2.9.0" @@ -7744,6 +8354,7 @@ dependencies = [ "idna", "percent-encoding", "serde", + "serde_derive", ] [[package]] @@ -7788,6 +8399,12 @@ version = "0.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" +[[package]] +name = "vcpkg" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "accd4ea62f7bb7a82fe23066fb0957d48ef677f6eeb8215f372f52e48bb32426" + [[package]] name = "version_check" version = "0.9.5" @@ -7806,6 +8423,12 @@ version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" +[[package]] +name = "waker-fn" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "317211a0dc0ceedd78fb2ca9a44aed3d7b9b26f81870d485c07122b4350673b7" + [[package]] name = "walkdir" version = 
"2.5.0" @@ -7825,6 +8448,12 @@ dependencies = [ "try-lock", ] +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -7833,18 +8462,18 @@ checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" [[package]] name = "wasip2" -version = "1.0.1+wasi-0.2.4" +version = "1.0.2+wasi-0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0562428422c63773dad2c345a1882263bbf4d65cf3f42e90921f787ef5ad58e7" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" dependencies = [ "wit-bindgen", ] [[package]] name = "wasm-bindgen" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d759f433fa64a2d763d1340820e46e111a7a5ab75f993d1852d70b03dbb80fd" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" dependencies = [ "cfg-if", "once_cell", @@ -7855,11 +8484,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.56" +version = "0.4.58" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "836d9622d604feee9e5de25ac10e3ea5f2d65b41eac0d9ce72eb5deae707ce7c" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" dependencies = [ "cfg-if", + "futures-util", "js-sys", "once_cell", "wasm-bindgen", @@ -7868,9 +8498,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48cb0d2638f8baedbc542ed444afc0644a29166f1595371af4fecf8ce1e7eeb3" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -7878,9 +8508,9 @@ dependencies = 
[ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cefb59d5cd5f92d9dcf80e4683949f15ca4b511f4ac0a6e14d4e1ac60c6ecd40" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" dependencies = [ "bumpalo", "proc-macro2", @@ -7891,9 +8521,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.106" +version = "0.2.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cbc538057e648b67f72a982e708d485b2efa771e1ac05fec311f9f63e5800db4" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" dependencies = [ "unicode-ident", ] @@ -7913,9 +8543,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.83" +version = "0.3.85" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9b32828d774c412041098d182a8b38b16ea816958e07cf40eec2bc080ae137ac" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" dependencies = [ "js-sys", "wasm-bindgen", @@ -8012,6 +8642,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.4.1" @@ -8206,9 +8847,9 @@ dependencies = [ [[package]] name = "wit-bindgen" -version = "0.46.0" +version = "0.51.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f17a85883d4e6d00e8a97c586de764dabcc06133f7f1d55dce5cdc070ad7fe59" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" [[package]] name = "wkb" @@ 
-8398,9 +9039,9 @@ checksum = "40990edd51aae2c2b6907af74ffb635029d5788228222c4bb811e9351c0caad3" [[package]] name = "zmij" -version = "1.0.14" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd8f3f50b848df28f887acb68e41201b5aea6bc8a8dacc00fb40635ff9a72fea" +checksum = "dfcd145825aace48cff44a8844de64bf75feec3080e0aa5cdbde72961ae51a65" [[package]] name = "zstd" diff --git a/python/Cargo.toml b/python/Cargo.toml index 7f6c1caa70f..eb00dfbc05f 100644 --- a/python/Cargo.toml +++ b/python/Cargo.toml @@ -73,11 +73,15 @@ url = "2.5.0" bytes = "1.4" [features] -default = ["rest", "rest-adapter"] +default = ["rest", "rest-adapter", "credential-vendor-aws", "credential-vendor-gcp", "credential-vendor-azure"] datagen = ["lance-datagen"] fp16kernels = ["lance/fp16kernels"] rest = ["lance-namespace-impls/rest"] rest-adapter = ["lance-namespace-impls/rest-adapter"] +# Credential vending features for DirectoryNamespace +credential-vendor-aws = ["lance-namespace-impls/credential-vendor-aws"] +credential-vendor-gcp = ["lance-namespace-impls/credential-vendor-gcp"] +credential-vendor-azure = ["lance-namespace-impls/credential-vendor-azure"] [profile.ci] debug = "line-tables-only" diff --git a/python/python/lance/namespace.py b/python/python/lance/namespace.py index 426c7176d74..3d22cafefc9 100644 --- a/python/python/lance/namespace.py +++ b/python/python/lance/namespace.py @@ -86,6 +86,40 @@ class DirectoryNamespace(LanceNamespace): (e.g., storage.region="us-west-2" becomes region="us-west-2" in storage options) + Credential vendor properties (vendor is auto-selected based on table location): + When credential vendor properties are configured, describe_table() will + return vended temporary credentials. The vendor type is auto-selected + based on table location URI: s3:// for AWS, gs:// for GCP, az:// for + Azure. Requires the corresponding credential-vendor-* feature. 
+ + Common properties: + - credential_vendor.enabled (required): Set to "true" to enable + - credential_vendor.permission (optional): read, write, or admin + + AWS-specific properties (for s3:// locations): + - credential_vendor.aws_role_arn (required): IAM role ARN to assume + - credential_vendor.aws_external_id (optional): External ID + - credential_vendor.aws_region (optional): AWS region + - credential_vendor.aws_role_session_name (optional): Session name + - credential_vendor.aws_duration_millis (optional): Duration in ms + (default: 3600000, range: 15min-12hrs) + + GCP-specific properties (for gs:// locations): + - credential_vendor.gcp_service_account (optional): Service account + to impersonate using IAM Credentials API + + Note: GCP uses Application Default Credentials (ADC). To use a service + account key file, set the GOOGLE_APPLICATION_CREDENTIALS environment + variable before starting. GCP token duration cannot be configured; + it's determined by the STS endpoint (typically 1 hour). + + Azure-specific properties (for az:// locations): + - credential_vendor.azure_account_name (required): Azure storage + account name + - credential_vendor.azure_tenant_id (optional): Azure tenant ID + - credential_vendor.azure_duration_millis (optional): Duration in ms + (default: 3600000, up to 7 days) + Examples -------- >>> import lance.namespace @@ -95,6 +129,15 @@ class DirectoryNamespace(LanceNamespace): >>> # Using the connect() factory function from lance_namespace >>> import lance_namespace >>> ns = lance_namespace.connect("dir", {"root": "memory://test"}) + >>> + >>> # With AWS credential vending (requires credential-vendor-aws feature) + >>> # Use **dict to pass property names with dots + >>> ns = lance.namespace.DirectoryNamespace(**{ + ... "root": "s3://my-bucket/data", + ... "credential_vendor.enabled": "true", + ... "credential_vendor.aws_role_arn": "arn:aws:iam::123456789012:role/MyRole", + ... "credential_vendor.aws_duration_millis": "3600000", + ... 
}) """ def __init__(self, session=None, **properties): diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index 9ce32692ffc..cb0ff52d1e0 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -21,6 +21,10 @@ dir-aws = ["lance-io/aws", "lance/aws"] dir-azure = ["lance-io/azure", "lance/azure"] dir-oss = ["lance-io/oss", "lance/oss"] dir-huggingface = ["lance-io/huggingface", "lance/huggingface"] +# Credential vending features +credential-vendor-aws = ["dep:aws-sdk-sts", "dep:aws-config", "dep:aws-credential-types"] +credential-vendor-gcp = ["dep:google-cloud-auth", "dep:reqwest", "dep:serde"] +credential-vendor-azure = ["dep:azure_core", "dep:azure_identity", "dep:azure_storage", "dep:azure_storage_blobs", "dep:time"] [dependencies] lance-namespace.workspace = true @@ -60,6 +64,22 @@ serde_json = { workspace = true } futures.workspace = true log.workspace = true rand.workspace = true +chrono.workspace = true + +# AWS credential vending dependencies (optional, enabled by "dir-aws" feature) +aws-sdk-sts = { version = "1.38.0", optional = true } +aws-config = { workspace = true, optional = true } +aws-credential-types = { workspace = true, optional = true } + +# GCP credential vending dependencies (optional, enabled by "dir-gcp" feature) +google-cloud-auth = { version = "0.18", optional = true } + +# Azure credential vending dependencies (optional, enabled by "dir-azure" feature) +azure_core = { version = "0.21", optional = true } +azure_identity = { version = "0.21", optional = true } +azure_storage = { version = "0.21", optional = true } +azure_storage_blobs = { version = "0.21", optional = true } +time = { version = "0.3", optional = true } [dev-dependencies] tokio = { workspace = true, features = ["full"] } diff --git a/rust/lance-namespace-impls/src/credentials.rs b/rust/lance-namespace-impls/src/credentials.rs new file mode 100644 index 00000000000..6be4f1e38a4 --- /dev/null +++ 
b/rust/lance-namespace-impls/src/credentials.rs @@ -0,0 +1,717 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Credential vending for cloud storage access. +//! +//! This module provides credential vending functionality that generates +//! temporary, scoped credentials for accessing cloud storage. Similar to +//! Apache Polaris's credential vending, it supports: +//! +//! - **AWS**: STS AssumeRole with scoped IAM policies (requires `credential-vendor-aws` feature) +//! - **GCP**: OAuth2 tokens with access boundaries (requires `credential-vendor-gcp` feature) +//! - **Azure**: SAS tokens with user delegation keys (requires `credential-vendor-azure` feature) +//! +//! The appropriate vendor is automatically selected based on the table location URI scheme: +//! - `s3://` for AWS +//! - `gs://` for GCP +//! - `az://` for Azure +//! +//! ## Configuration via Properties +//! +//! Credential vendors are configured via properties with the `credential_vendor.` prefix. +//! +//! ### Properties format: +//! +//! ```text +//! # Required to enable credential vending +//! credential_vendor.enabled = "true" +//! +//! # Common properties (apply to all providers) +//! credential_vendor.permission = "read" # read, write, or admin (default: read) +//! +//! # AWS-specific properties (for s3:// locations) +//! credential_vendor.aws_role_arn = "arn:aws:iam::123456789012:role/MyRole" # required for AWS +//! credential_vendor.aws_external_id = "my-external-id" +//! credential_vendor.aws_region = "us-west-2" +//! credential_vendor.aws_role_session_name = "my-session" +//! credential_vendor.aws_duration_millis = "3600000" # 1 hour (default, range: 15min-12hrs) +//! +//! # GCP-specific properties (for gs:// locations) +//! # Note: GCP token duration cannot be configured; it's determined by the STS endpoint +//! # To use a service account key file, set GOOGLE_APPLICATION_CREDENTIALS env var before starting +//! 
credential_vendor.gcp_service_account = "my-sa@project.iam.gserviceaccount.com" +//! +//! # Azure-specific properties (for az:// locations) +//! credential_vendor.azure_account_name = "mystorageaccount" # required for Azure +//! credential_vendor.azure_tenant_id = "my-tenant-id" +//! credential_vendor.azure_duration_millis = "3600000" # 1 hour (default, up to 7 days) +//! ``` +//! +//! ### Example using ConnectBuilder: +//! +//! ```ignore +//! ConnectBuilder::new("dir") +//! .property("root", "s3://bucket/path") +//! .property("credential_vendor.enabled", "true") +//! .property("credential_vendor.aws_role_arn", "arn:aws:iam::123456789012:role/MyRole") +//! .property("credential_vendor.permission", "read") +//! .connect() +//! .await?; +//! ``` + +#[cfg(feature = "credential-vendor-aws")] +pub mod aws; + +#[cfg(feature = "credential-vendor-azure")] +pub mod azure; + +#[cfg(feature = "credential-vendor-gcp")] +pub mod gcp; + +use std::collections::HashMap; +use std::str::FromStr; + +use async_trait::async_trait; +use lance_core::Result; +use lance_io::object_store::uri_to_url; + +/// Default credential duration: 1 hour (3600000 milliseconds) +pub const DEFAULT_CREDENTIAL_DURATION_MILLIS: u64 = 3600 * 1000; + +/// Redact a credential string for logging, showing first and last few characters. +/// +/// This is useful for debugging while avoiding exposure of sensitive data. +/// Format: `AKIAIOSF***MPLE` (first 8 + "***" + last 4) +/// +/// Shows 8 characters at the start (useful since AWS keys always start with AKIA/ASIA) +/// and 4 characters at the end. For short strings, shows only the first few with "***". +/// +/// # Security Note +/// +/// This function should only be used for identifiers and tokens, never for secrets +/// like `aws_secret_access_key` which should never be logged even in redacted form. 
+pub fn redact_credential(credential: &str) -> String { + const SHOW_START: usize = 8; + const SHOW_END: usize = 4; + const MIN_LENGTH_FOR_BOTH_ENDS: usize = SHOW_START + SHOW_END + 4; // Need at least 16 chars + + if credential.is_empty() { + return "[empty]".to_string(); + } + + if credential.len() < MIN_LENGTH_FOR_BOTH_ENDS { + // For short credentials, just show beginning + let show = credential.len().min(SHOW_START); + format!("{}***", &credential[..show]) + } else { + // Show first 8 and last 4 characters + format!( + "{}***{}", + &credential[..SHOW_START], + &credential[credential.len() - SHOW_END..] + ) + } +} + +/// Permission level for vended credentials. +/// +/// This determines what access the vended credentials will have: +/// - `Read`: Read-only access to all table content +/// - `Write`: Full read and write access (no delete) +/// - `Admin`: Full read, write, and delete access +/// +/// Permission enforcement by cloud provider: +/// - **AWS**: Permissions are enforced via scoped IAM policies attached to the AssumeRole request +/// - **Azure**: Permissions are enforced via SAS token permissions +/// - **GCP**: Permissions are enforced via Credential Access Boundaries (CAB) that downscope +/// the OAuth2 token to specific GCS IAM roles +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum VendedPermission { + /// Read-only access to all table content (metadata, indices, data files) + #[default] + Read, + /// Full read and write access (no delete) + /// This is intended ONLY for testing purposes to generate a write-only permission set. + /// Technically, any user with write permission could "delete" the file by + /// overwriting the file with empty content. + /// So this cannot really prevent malicious use cases. 
+ Write, + /// Full read, write, and delete access + Admin, +} + +impl VendedPermission { + /// Returns true if this permission allows writing + pub fn can_write(&self) -> bool { + matches!(self, Self::Write | Self::Admin) + } + + /// Returns true if this permission allows deleting + pub fn can_delete(&self) -> bool { + matches!(self, Self::Admin) + } +} + +impl FromStr for VendedPermission { + type Err = String; + + fn from_str(s: &str) -> std::result::Result { + match s.to_lowercase().as_str() { + "read" => Ok(Self::Read), + "write" => Ok(Self::Write), + "admin" => Ok(Self::Admin), + _ => Err(format!( + "Invalid permission '{}'. Must be one of: read, write, admin", + s + )), + } + } +} + +impl std::fmt::Display for VendedPermission { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Read => write!(f, "read"), + Self::Write => write!(f, "write"), + Self::Admin => write!(f, "admin"), + } + } +} + +/// Property key prefix for credential vendor properties. +/// Properties with this prefix are stripped when using `from_properties`. +pub const PROPERTY_PREFIX: &str = "credential_vendor."; + +/// Common property key to explicitly enable credential vending (short form). +pub const ENABLED: &str = "enabled"; + +/// Common property key for permission level (short form). +pub const PERMISSION: &str = "permission"; + +/// AWS-specific property keys (short form, without prefix) +#[cfg(feature = "credential-vendor-aws")] +pub mod aws_props { + pub const ROLE_ARN: &str = "aws_role_arn"; + pub const EXTERNAL_ID: &str = "aws_external_id"; + pub const REGION: &str = "aws_region"; + pub const ROLE_SESSION_NAME: &str = "aws_role_session_name"; + /// AWS credential duration in milliseconds. + /// Default: 3600000 (1 hour). Range: 900000 (15 min) to 43200000 (12 hours). 
+ pub const DURATION_MILLIS: &str = "aws_duration_millis"; +} + +/// GCP-specific property keys (short form, without prefix) +#[cfg(feature = "credential-vendor-gcp")] +pub mod gcp_props { + pub const SERVICE_ACCOUNT: &str = "gcp_service_account"; +} + +/// Azure-specific property keys (short form, without prefix) +#[cfg(feature = "credential-vendor-azure")] +pub mod azure_props { + pub const TENANT_ID: &str = "azure_tenant_id"; + /// Azure storage account name. Required for credential vending. + pub const ACCOUNT_NAME: &str = "azure_account_name"; + /// Azure credential duration in milliseconds. + /// Default: 3600000 (1 hour). Azure SAS tokens can be valid up to 7 days. + pub const DURATION_MILLIS: &str = "azure_duration_millis"; +} + +/// Vended credentials with expiration information. +#[derive(Clone)] +pub struct VendedCredentials { + /// Storage options map containing credential keys. + /// - For AWS: `aws_access_key_id`, `aws_secret_access_key`, `aws_session_token` + /// - For GCP: `google_storage_token` + /// - For Azure: `azure_storage_sas_token`, `azure_storage_account_name` + pub storage_options: HashMap, + + /// Expiration time in milliseconds since Unix epoch. + pub expires_at_millis: u64, +} + +impl std::fmt::Debug for VendedCredentials { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("VendedCredentials") + .field( + "storage_options", + &format!("[{} keys redacted]", self.storage_options.len()), + ) + .field("expires_at_millis", &self.expires_at_millis) + .finish() + } +} + +impl VendedCredentials { + /// Create new vended credentials. + pub fn new(storage_options: HashMap, expires_at_millis: u64) -> Self { + Self { + storage_options, + expires_at_millis, + } + } + + /// Check if the credentials have expired. 
+ pub fn is_expired(&self) -> bool { + let now_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .expect("time went backwards") + .as_millis() as u64; + now_millis >= self.expires_at_millis + } +} + +/// Trait for credential vendors that generate temporary credentials. +/// +/// Each cloud provider has its own configuration passed via the vendor +/// implementation. The permission level is configured at vendor creation time +/// via [`VendedPermission`]. +#[async_trait] +pub trait CredentialVendor: Send + Sync + std::fmt::Debug { + /// Vend credentials for accessing the specified table location. + /// + /// The permission level (read/write/admin) is determined by the vendor's + /// configuration, not per-request. + /// + /// # Arguments + /// + /// * `table_location` - The table URI to vend credentials for + /// + /// # Returns + /// + /// Returns vended credentials with expiration information. + async fn vend_credentials(&self, table_location: &str) -> Result; + + /// Returns the cloud provider name (e.g., "aws", "gcp", "azure"). + fn provider_name(&self) -> &'static str; + + /// Returns the permission level configured for this vendor. + fn permission(&self) -> VendedPermission; +} + +/// Detect the cloud provider from a URI scheme. +/// +/// Supported schemes for credential vending: +/// - AWS S3: `s3://` +/// - GCP GCS: `gs://` +/// - Azure Blob: `az://` +/// +/// Returns "aws", "gcp", "azure", or "unknown". +pub fn detect_provider_from_uri(uri: &str) -> &'static str { + let Ok(url) = uri_to_url(uri) else { + return "unknown"; + }; + + match url.scheme() { + "s3" => "aws", + "gs" => "gcp", + "az" => "azure", + _ => "unknown", + } +} + +/// Check if credential vending is enabled. +/// +/// Returns true only if the `enabled` property is set to "true". +/// This expects properties with short names (prefix already stripped). 
+pub fn has_credential_vendor_config(properties: &HashMap) -> bool { + properties + .get(ENABLED) + .map(|v| v.eq_ignore_ascii_case("true")) + .unwrap_or(false) +} + +/// Create a credential vendor for the specified table location based on its URI scheme. +/// +/// This function automatically detects the cloud provider from the table location +/// and creates the appropriate credential vendor using the provided properties. +/// +/// # Arguments +/// +/// * `table_location` - The table URI to create a vendor for (e.g., "s3://bucket/path") +/// * `properties` - Configuration properties for credential vendors +/// +/// # Returns +/// +/// Returns `Some(vendor)` if the provider is detected and configured, `None` if: +/// - The provider cannot be detected from the URI (e.g., local file path) +/// - The required feature is not enabled for the detected provider +/// +/// # Errors +/// +/// Returns an error if the provider is detected but required configuration is missing: +/// - AWS: `credential_vendor.aws_role_arn` is required +/// - Azure: `credential_vendor.azure_account_name` is required +#[allow(unused_variables)] +pub async fn create_credential_vendor_for_location( + table_location: &str, + properties: &HashMap, +) -> Result>> { + let provider = detect_provider_from_uri(table_location); + + match provider { + #[cfg(feature = "credential-vendor-aws")] + "aws" => create_aws_vendor(properties).await, + + #[cfg(feature = "credential-vendor-gcp")] + "gcp" => create_gcp_vendor(properties).await, + + #[cfg(feature = "credential-vendor-azure")] + "azure" => create_azure_vendor(properties), + + _ => Ok(None), + } +} + +/// Parse permission from properties, defaulting to Read +fn parse_permission(properties: &HashMap) -> VendedPermission { + properties + .get(PERMISSION) + .and_then(|s| s.parse().ok()) + .unwrap_or_default() +} + +/// Parse duration from properties using a vendor-specific key, defaulting to DEFAULT_CREDENTIAL_DURATION_MILLIS +fn 
parse_duration_millis(properties: &HashMap, key: &str) -> u64 { + properties + .get(key) + .and_then(|s| s.parse::().ok()) + .unwrap_or(DEFAULT_CREDENTIAL_DURATION_MILLIS) +} + +#[cfg(feature = "credential-vendor-aws")] +async fn create_aws_vendor( + properties: &HashMap, +) -> Result>> { + use aws::{AwsCredentialVendor, AwsCredentialVendorConfig}; + use lance_core::Error; + + // AWS requires role_arn to be configured + let role_arn = properties + .get(aws_props::ROLE_ARN) + .ok_or_else(|| Error::InvalidInput { + source: "AWS credential vending requires 'credential_vendor.aws_role_arn' to be set" + .into(), + location: snafu::location!(), + })?; + + let duration_millis = parse_duration_millis(properties, aws_props::DURATION_MILLIS); + + let permission = parse_permission(properties); + + let mut config = AwsCredentialVendorConfig::new(role_arn) + .with_duration_millis(duration_millis) + .with_permission(permission); + + if let Some(external_id) = properties.get(aws_props::EXTERNAL_ID) { + config = config.with_external_id(external_id); + } + if let Some(region) = properties.get(aws_props::REGION) { + config = config.with_region(region); + } + if let Some(session_name) = properties.get(aws_props::ROLE_SESSION_NAME) { + config = config.with_role_session_name(session_name); + } + + let vendor = AwsCredentialVendor::new(config).await?; + Ok(Some(Box::new(vendor))) +} + +#[cfg(feature = "credential-vendor-gcp")] +async fn create_gcp_vendor( + properties: &HashMap, +) -> Result>> { + use gcp::{GcpCredentialVendor, GcpCredentialVendorConfig}; + + let permission = parse_permission(properties); + + let mut config = GcpCredentialVendorConfig::new().with_permission(permission); + + if let Some(sa) = properties.get(gcp_props::SERVICE_ACCOUNT) { + config = config.with_service_account(sa); + } + + let vendor = GcpCredentialVendor::new(config).await?; + Ok(Some(Box::new(vendor))) +} + +#[cfg(feature = "credential-vendor-azure")] +fn create_azure_vendor( + properties: &HashMap, +) 
-> Result>> { + use azure::{AzureCredentialVendor, AzureCredentialVendorConfig}; + use lance_core::Error; + + // Azure requires account_name to be configured + let account_name = + properties + .get(azure_props::ACCOUNT_NAME) + .ok_or_else(|| { + Error::InvalidInput { + source: + "Azure credential vending requires 'credential_vendor.azure_account_name' to be set" + .into(), + location: snafu::location!(), + } + })?; + + let duration_millis = parse_duration_millis(properties, azure_props::DURATION_MILLIS); + let permission = parse_permission(properties); + + let mut config = AzureCredentialVendorConfig::new() + .with_account_name(account_name) + .with_duration_millis(duration_millis) + .with_permission(permission); + + if let Some(tenant_id) = properties.get(azure_props::TENANT_ID) { + config = config.with_tenant_id(tenant_id); + } + + let vendor = AzureCredentialVendor::new(config); + Ok(Some(Box::new(vendor))) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_detect_provider_from_uri() { + // AWS (supported scheme: s3://) + assert_eq!(detect_provider_from_uri("s3://bucket/path"), "aws"); + assert_eq!(detect_provider_from_uri("S3://bucket/path"), "aws"); + + // GCP (supported scheme: gs://) + assert_eq!(detect_provider_from_uri("gs://bucket/path"), "gcp"); + assert_eq!(detect_provider_from_uri("GS://bucket/path"), "gcp"); + + // Azure (supported scheme: az://) + assert_eq!(detect_provider_from_uri("az://container/path"), "azure"); + + // Unknown (unsupported schemes) + assert_eq!(detect_provider_from_uri("/local/path"), "unknown"); + assert_eq!(detect_provider_from_uri("file:///local/path"), "unknown"); + assert_eq!(detect_provider_from_uri("memory://test"), "unknown"); + // Hadoop-style schemes not supported by lance-io + assert_eq!(detect_provider_from_uri("s3a://bucket/path"), "unknown"); + assert_eq!( + detect_provider_from_uri("abfss://container@account.dfs.core.windows.net/path"), + "unknown" + ); + assert_eq!( + 
detect_provider_from_uri("wasbs://container@account.blob.core.windows.net/path"), + "unknown" + ); + } + + #[test] + fn test_vended_permission_from_str() { + // Valid values (case-insensitive) + assert_eq!( + "read".parse::().unwrap(), + VendedPermission::Read + ); + assert_eq!( + "READ".parse::().unwrap(), + VendedPermission::Read + ); + assert_eq!( + "write".parse::().unwrap(), + VendedPermission::Write + ); + assert_eq!( + "WRITE".parse::().unwrap(), + VendedPermission::Write + ); + assert_eq!( + "admin".parse::().unwrap(), + VendedPermission::Admin + ); + assert_eq!( + "Admin".parse::().unwrap(), + VendedPermission::Admin + ); + + // Invalid values should return error + let err = "invalid".parse::().unwrap_err(); + assert!(err.contains("Invalid permission")); + assert!(err.contains("invalid")); + + let err = "".parse::().unwrap_err(); + assert!(err.contains("Invalid permission")); + + let err = "readwrite".parse::().unwrap_err(); + assert!(err.contains("Invalid permission")); + } + + #[test] + fn test_vended_permission_display() { + assert_eq!(VendedPermission::Read.to_string(), "read"); + assert_eq!(VendedPermission::Write.to_string(), "write"); + assert_eq!(VendedPermission::Admin.to_string(), "admin"); + } + + #[test] + fn test_parse_permission_with_invalid_values() { + // Invalid permission should default to Read + let mut props = HashMap::new(); + props.insert(PERMISSION.to_string(), "invalid".to_string()); + assert_eq!(parse_permission(&props), VendedPermission::Read); + + // Empty permission should default to Read + props.insert(PERMISSION.to_string(), "".to_string()); + assert_eq!(parse_permission(&props), VendedPermission::Read); + + // Missing permission should default to Read + let empty_props: HashMap = HashMap::new(); + assert_eq!(parse_permission(&empty_props), VendedPermission::Read); + } + + #[test] + fn test_parse_duration_millis_with_invalid_values() { + const TEST_KEY: &str = "test_duration_millis"; + + // Invalid duration should default to 
DEFAULT_CREDENTIAL_DURATION_MILLIS + let mut props = HashMap::new(); + props.insert(TEST_KEY.to_string(), "not_a_number".to_string()); + assert_eq!( + parse_duration_millis(&props, TEST_KEY), + DEFAULT_CREDENTIAL_DURATION_MILLIS + ); + + // Negative number (parsed as u64 fails) + props.insert(TEST_KEY.to_string(), "-1000".to_string()); + assert_eq!( + parse_duration_millis(&props, TEST_KEY), + DEFAULT_CREDENTIAL_DURATION_MILLIS + ); + + // Empty string should default + props.insert(TEST_KEY.to_string(), "".to_string()); + assert_eq!( + parse_duration_millis(&props, TEST_KEY), + DEFAULT_CREDENTIAL_DURATION_MILLIS + ); + + // Missing duration should default + let empty_props: HashMap = HashMap::new(); + assert_eq!( + parse_duration_millis(&empty_props, TEST_KEY), + DEFAULT_CREDENTIAL_DURATION_MILLIS + ); + + // Valid duration should work + props.insert(TEST_KEY.to_string(), "7200000".to_string()); + assert_eq!(parse_duration_millis(&props, TEST_KEY), 7200000); + } + + #[test] + fn test_has_credential_vendor_config() { + // enabled = true + let mut props = HashMap::new(); + props.insert(ENABLED.to_string(), "true".to_string()); + assert!(has_credential_vendor_config(&props)); + + // enabled = TRUE (case-insensitive) + props.insert(ENABLED.to_string(), "TRUE".to_string()); + assert!(has_credential_vendor_config(&props)); + + // enabled = false + props.insert(ENABLED.to_string(), "false".to_string()); + assert!(!has_credential_vendor_config(&props)); + + // enabled = invalid value + props.insert(ENABLED.to_string(), "yes".to_string()); + assert!(!has_credential_vendor_config(&props)); + + // enabled missing + let empty_props: HashMap = HashMap::new(); + assert!(!has_credential_vendor_config(&empty_props)); + } + + #[test] + fn test_vended_credentials_debug_redacts_secrets() { + let mut storage_options = HashMap::new(); + storage_options.insert( + "aws_access_key_id".to_string(), + "AKIAIOSFODNN7EXAMPLE".to_string(), + ); + storage_options.insert( + 
"aws_secret_access_key".to_string(), + "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_string(), + ); + storage_options.insert( + "aws_session_token".to_string(), + "FwoGZXIvYXdzE...".to_string(), + ); + + let creds = VendedCredentials::new(storage_options, 1234567890); + let debug_output = format!("{:?}", creds); + + // Should NOT contain actual secrets + assert!(!debug_output.contains("AKIAIOSFODNN7EXAMPLE")); + assert!(!debug_output.contains("wJalrXUtnFEMI")); + assert!(!debug_output.contains("FwoGZXIvYXdzE")); + + // Should contain redacted message + assert!(debug_output.contains("redacted")); + assert!(debug_output.contains("3 keys")); + + // Should contain expiration time + assert!(debug_output.contains("1234567890")); + } + + #[test] + fn test_vended_credentials_is_expired() { + // Create credentials that expired in the past + let past_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as u64 + - 1000; // 1 second ago + + let expired_creds = VendedCredentials::new(HashMap::new(), past_millis); + assert!(expired_creds.is_expired()); + + // Create credentials that expire in the future + let future_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as u64 + + 3600000; // 1 hour from now + + let valid_creds = VendedCredentials::new(HashMap::new(), future_millis); + assert!(!valid_creds.is_expired()); + } + + #[test] + fn test_redact_credential() { + // Long credential: shows first 8 and last 4 + assert_eq!(redact_credential("AKIAIOSFODNN7EXAMPLE"), "AKIAIOSF***MPLE"); + + // Exactly 16 chars: shows first 8 and last 4 + assert_eq!(redact_credential("1234567890123456"), "12345678***3456"); + + // Short credential (< 16 chars): shows only first few + assert_eq!(redact_credential("short1234567"), "short123***"); + assert_eq!(redact_credential("short123"), "short123***"); + assert_eq!(redact_credential("tiny"), "tiny***"); + assert_eq!(redact_credential("ab"), 
"ab***"); + assert_eq!(redact_credential("a"), "a***"); + + // Empty string + assert_eq!(redact_credential(""), "[empty]"); + + // Real-world examples + // AWS access key ID (20 chars) - shows AKIA + 4 more chars which helps identify the key + assert_eq!(redact_credential("AKIAIOSFODNN7EXAMPLE"), "AKIAIOSF***MPLE"); + + // GCP token (typically very long) + let long_token = "ya29.a0AfH6SMBx1234567890abcdefghijklmnopqrstuvwxyz"; + assert_eq!(redact_credential(long_token), "ya29.a0A***wxyz"); + + // Azure SAS token + let sas_token = "sv=2021-06-08&ss=b&srt=sco&sp=rwdlacuiytfx&se=2024-12-31"; + assert_eq!(redact_credential(sas_token), "sv=2021-***2-31"); + } +} diff --git a/rust/lance-namespace-impls/src/credentials/aws.rs b/rust/lance-namespace-impls/src/credentials/aws.rs new file mode 100644 index 00000000000..96e0e8a2a80 --- /dev/null +++ b/rust/lance-namespace-impls/src/credentials/aws.rs @@ -0,0 +1,881 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! AWS credential vending using STS AssumeRole. +//! +//! This module provides credential vending for AWS S3 storage by assuming +//! an IAM role using AWS STS (Security Token Service). + +use std::collections::HashMap; + +use async_trait::async_trait; +use aws_config::BehaviorVersion; +use aws_sdk_sts::Client as StsClient; +use lance_core::{Error, Result}; +use lance_io::object_store::uri_to_url; +use log::{debug, info}; + +use super::{ + redact_credential, CredentialVendor, VendedCredentials, VendedPermission, + DEFAULT_CREDENTIAL_DURATION_MILLIS, +}; + +/// Configuration for AWS credential vending. +#[derive(Debug, Clone)] +pub struct AwsCredentialVendorConfig { + /// The IAM role ARN to assume. + pub role_arn: String, + + /// Optional external ID for the assume role request. + pub external_id: Option, + + /// Duration for vended credentials in milliseconds. + /// Default: 3600000 (1 hour). + /// AWS STS allows 900-43200 seconds (15 min - 12 hours). 
+ /// Values outside this range will be clamped. + pub duration_millis: u64, + + /// Optional role session name. Defaults to "lance-credential-vending". + pub role_session_name: Option, + + /// Optional AWS region for the STS client. + pub region: Option, + + /// Permission level for vended credentials. + /// Default: Read (full read access) + pub permission: VendedPermission, +} + +impl AwsCredentialVendorConfig { + /// Create a new config with the specified role ARN. + pub fn new(role_arn: impl Into) -> Self { + Self { + role_arn: role_arn.into(), + external_id: None, + duration_millis: DEFAULT_CREDENTIAL_DURATION_MILLIS, + role_session_name: None, + region: None, + permission: VendedPermission::default(), + } + } + + /// Set the external ID for the assume role request. + pub fn with_external_id(mut self, external_id: impl Into) -> Self { + self.external_id = Some(external_id.into()); + self + } + + /// Set the credential duration in milliseconds. + pub fn with_duration_millis(mut self, millis: u64) -> Self { + self.duration_millis = millis; + self + } + + /// Set the role session name. + pub fn with_role_session_name(mut self, name: impl Into) -> Self { + self.role_session_name = Some(name.into()); + self + } + + /// Set the AWS region for the STS client. + pub fn with_region(mut self, region: impl Into) -> Self { + self.region = Some(region.into()); + self + } + + /// Set the permission level for vended credentials. + pub fn with_permission(mut self, permission: VendedPermission) -> Self { + self.permission = permission; + self + } +} + +/// AWS credential vendor that uses STS AssumeRole. +#[derive(Debug)] +pub struct AwsCredentialVendor { + config: AwsCredentialVendorConfig, + sts_client: StsClient, +} + +impl AwsCredentialVendor { + /// Create a new AWS credential vendor with the specified configuration. 
+    pub async fn new(config: AwsCredentialVendorConfig) -> Result<Self> {
+        let mut aws_config_loader = aws_config::defaults(BehaviorVersion::latest());
+
+        if let Some(ref region) = config.region {
+            aws_config_loader = aws_config_loader.region(aws_config::Region::new(region.clone()));
+        }
+
+        let aws_config = aws_config_loader.load().await;
+        let sts_client = StsClient::new(&aws_config);
+
+        Ok(Self { config, sts_client })
+    }
+
+    /// Create a new AWS credential vendor with an existing STS client.
+    pub fn with_sts_client(config: AwsCredentialVendorConfig, sts_client: StsClient) -> Self {
+        Self { config, sts_client }
+    }
+
+    /// Parse an S3 URI to extract bucket and prefix.
+    fn parse_s3_uri(uri: &str) -> Result<(String, String)> {
+        let url = uri_to_url(uri)?;
+
+        let bucket = url
+            .host_str()
+            .ok_or_else(|| Error::InvalidInput {
+                source: format!("S3 URI '{}' missing bucket", uri).into(),
+                location: snafu::location!(),
+            })?
+            .to_string();
+
+        let prefix = url.path().trim_start_matches('/').to_string();
+
+        Ok((bucket, prefix))
+    }
+
+    /// Build a scoped IAM policy for the specified location and permission level.
+    ///
+    /// Permission levels:
+    /// - `Read`: Full read access to all content (metadata, indices, data files)
+    /// - `Write`: Full read and write access (no delete)
+    /// - `Admin`: Full read, write, and delete access
+    fn build_policy(bucket: &str, prefix: &str, permission: VendedPermission) -> String {
+        let prefix_trimmed = prefix.trim_end_matches('/');
+        let base_path = if prefix.is_empty() {
+            format!("arn:aws:s3:::{}/*", bucket)
+        } else {
+            format!("arn:aws:s3:::{}/{}/*", bucket, prefix_trimmed)
+        };
+        let bucket_arn = format!("arn:aws:s3:::{}", bucket);
+
+        let mut statements = vec![];
+
+        // List bucket permission (always needed)
+        statements.push(serde_json::json!({
+            "Effect": "Allow",
+            "Action": "s3:ListBucket",
+            "Resource": bucket_arn,
+            "Condition": {
+                "StringLike": {
+                    "s3:prefix": if prefix.is_empty() {
+                        "*".to_string()
+                    } else {
+                        format!("{}/*", prefix_trimmed)
+                    }
+                }
+            }
+        }));
+
+        // Get bucket location (always needed)
+        statements.push(serde_json::json!({
+            "Effect": "Allow",
+            "Action": "s3:GetBucketLocation",
+            "Resource": bucket_arn
+        }));
+
+        // Read access (all permission levels have full read)
+        statements.push(serde_json::json!({
+            "Effect": "Allow",
+            "Action": ["s3:GetObject", "s3:GetObjectVersion"],
+            "Resource": base_path
+        }));
+
+        // Write access (Write and Admin)
+        if permission.can_write() {
+            statements.push(serde_json::json!({
+                "Effect": "Allow",
+                "Action": "s3:PutObject",
+                "Resource": base_path
+            }));
+        }
+
+        // Delete access (Admin only)
+        if permission.can_delete() {
+            statements.push(serde_json::json!({
+                "Effect": "Allow",
+                "Action": "s3:DeleteObject",
+                "Resource": base_path
+            }));
+        }
+
+        let policy = serde_json::json!({
+            "Version": "2012-10-17",
+            "Statement": statements
+        });
+
+        policy.to_string()
+    }
+}
+
+#[async_trait]
+impl CredentialVendor for AwsCredentialVendor {
+    async fn vend_credentials(&self, table_location: &str) -> Result<VendedCredentials> {
+        debug!(
+            "AWS credential vending: location={}, permission={}",
+
table_location, self.config.permission + ); + + let (bucket, prefix) = Self::parse_s3_uri(table_location)?; + let policy = Self::build_policy(&bucket, &prefix, self.config.permission); + + let role_session_name = self + .config + .role_session_name + .clone() + .unwrap_or_else(|| "lance-credential-vending".to_string()); + + // Cap session name to 64 chars (AWS limit) + let role_session_name = if role_session_name.len() > 64 { + role_session_name[..64].to_string() + } else { + role_session_name + }; + + // Convert millis to seconds for AWS API (rounding up to ensure at least the requested duration) + // AWS STS allows 900-43200 seconds (15 min - 12 hours), clamp to valid range + let duration_secs = self.config.duration_millis.div_ceil(1000).clamp(900, 43200) as i32; + + let mut request = self + .sts_client + .assume_role() + .role_arn(&self.config.role_arn) + .role_session_name(&role_session_name) + .policy(&policy) + .duration_seconds(duration_secs); + + if let Some(ref external_id) = self.config.external_id { + request = request.external_id(external_id); + } + + let response = request.send().await.map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to assume role '{}': {}", + self.config.role_arn, e + ))), + location: snafu::location!(), + })?; + + let credentials = response.credentials().ok_or_else(|| Error::IO { + source: Box::new(std::io::Error::other( + "AssumeRole response missing credentials", + )), + location: snafu::location!(), + })?; + + let access_key_id = credentials.access_key_id().to_string(); + let secret_access_key = credentials.secret_access_key().to_string(); + let session_token = credentials.session_token().to_string(); + + let expiration = credentials.expiration(); + let expires_at_millis = + (expiration.secs() as u64) * 1000 + (expiration.subsec_nanos() / 1_000_000) as u64; + + info!( + "AWS credentials vended: bucket={}, prefix={}, permission={}, expires_at={}, access_key_id={}", + bucket, prefix, 
self.config.permission, expires_at_millis, redact_credential(&access_key_id) + ); + + let mut storage_options = HashMap::new(); + storage_options.insert("aws_access_key_id".to_string(), access_key_id); + storage_options.insert("aws_secret_access_key".to_string(), secret_access_key); + storage_options.insert("aws_session_token".to_string(), session_token); + storage_options.insert( + "expires_at_millis".to_string(), + expires_at_millis.to_string(), + ); + + // Include region if configured + if let Some(ref region) = self.config.region { + storage_options.insert("aws_region".to_string(), region.clone()); + } + + Ok(VendedCredentials::new(storage_options, expires_at_millis)) + } + + fn provider_name(&self) -> &'static str { + "aws" + } + + fn permission(&self) -> VendedPermission { + self.config.permission + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_s3_uri() { + let (bucket, prefix) = AwsCredentialVendor::parse_s3_uri("s3://my-bucket/path/to/table") + .expect("should parse"); + assert_eq!(bucket, "my-bucket"); + assert_eq!(prefix, "path/to/table"); + + let (bucket, prefix) = + AwsCredentialVendor::parse_s3_uri("s3://my-bucket/").expect("should parse"); + assert_eq!(bucket, "my-bucket"); + assert_eq!(prefix, ""); + + let (bucket, prefix) = + AwsCredentialVendor::parse_s3_uri("s3://my-bucket").expect("should parse"); + assert_eq!(bucket, "my-bucket"); + assert_eq!(prefix, ""); + } + + #[test] + fn test_build_policy_read() { + let policy = + AwsCredentialVendor::build_policy("my-bucket", "path/to/table", VendedPermission::Read); + let parsed: serde_json::Value = serde_json::from_str(&policy).expect("valid json"); + + let statements = parsed["Statement"].as_array().expect("statements array"); + assert_eq!(statements.len(), 3); // ListBucket, GetBucketLocation, GetObject + + // Verify no write actions + for stmt in statements { + let actions = stmt["Action"].clone(); + let action_list: Vec = if actions.is_array() { + actions + 
.as_array() + .unwrap() + .iter() + .map(|a| a.as_str().unwrap().to_string()) + .collect() + } else { + vec![actions.as_str().unwrap().to_string()] + }; + assert!(!action_list.contains(&"s3:PutObject".to_string())); + assert!(!action_list.contains(&"s3:DeleteObject".to_string())); + } + } + + #[test] + fn test_build_policy_write() { + let policy = AwsCredentialVendor::build_policy( + "my-bucket", + "path/to/table", + VendedPermission::Write, + ); + let parsed: serde_json::Value = serde_json::from_str(&policy).expect("valid json"); + + let statements = parsed["Statement"].as_array().expect("statements array"); + // ListBucket, GetBucketLocation, GetObject, PutObject + assert_eq!(statements.len(), 4); + + // Verify PutObject is present + let write_stmt = statements + .iter() + .find(|s| { + let action = &s["Action"]; + action.as_str() == Some("s3:PutObject") + }) + .expect("should have PutObject statement"); + assert!(write_stmt["Effect"].as_str() == Some("Allow")); + + // Verify DeleteObject is NOT present (Write doesn't have delete) + let delete_stmt = statements.iter().find(|s| { + let action = &s["Action"]; + action.as_str() == Some("s3:DeleteObject") + }); + assert!(delete_stmt.is_none(), "Write should not have DeleteObject"); + + // Verify no Deny statements + let deny_stmt = statements + .iter() + .find(|s| s["Effect"].as_str() == Some("Deny")); + assert!(deny_stmt.is_none(), "Write should not have Deny statements"); + } + + #[test] + fn test_build_policy_admin() { + let policy = AwsCredentialVendor::build_policy( + "my-bucket", + "path/to/table", + VendedPermission::Admin, + ); + let parsed: serde_json::Value = serde_json::from_str(&policy).expect("valid json"); + + let statements = parsed["Statement"].as_array().expect("statements array"); + // ListBucket, GetBucketLocation, GetObject, PutObject, DeleteObject + assert_eq!(statements.len(), 5); + + // Verify read actions + let read_stmt = statements + .iter() + .find(|s| { + let actions = s["Action"].clone(); 
+ if actions.is_array() { + actions + .as_array() + .unwrap() + .iter() + .any(|a| a.as_str().unwrap() == "s3:GetObject") + } else { + false + } + }) + .expect("should have read statement"); + assert!(read_stmt["Effect"].as_str() == Some("Allow")); + + // Verify PutObject + let write_stmt = statements + .iter() + .find(|s| s["Action"].as_str() == Some("s3:PutObject")) + .expect("should have PutObject statement"); + assert!(write_stmt["Effect"].as_str() == Some("Allow")); + + // Verify DeleteObject (Admin only) + let delete_stmt = statements + .iter() + .find(|s| s["Action"].as_str() == Some("s3:DeleteObject")) + .expect("should have DeleteObject statement"); + assert!(delete_stmt["Effect"].as_str() == Some("Allow")); + + // Verify no Deny statements + let deny_stmt = statements + .iter() + .find(|s| s["Effect"].as_str() == Some("Deny")); + assert!(deny_stmt.is_none(), "Admin should not have Deny statements"); + } + + #[test] + fn test_config_builder() { + let config = AwsCredentialVendorConfig::new("arn:aws:iam::123456789012:role/MyRole") + .with_external_id("my-external-id") + .with_duration_millis(7200000) + .with_role_session_name("my-session") + .with_region("us-west-2"); + + assert_eq!(config.role_arn, "arn:aws:iam::123456789012:role/MyRole"); + assert_eq!(config.external_id, Some("my-external-id".to_string())); + assert_eq!(config.duration_millis, 7200000); + assert_eq!(config.role_session_name, Some("my-session".to_string())); + assert_eq!(config.region, Some("us-west-2".to_string())); + } + + // ============================================================================ + // Integration Tests + // ============================================================================ + + /// Integration tests for AWS credential vending. 
+ /// + /// These tests require: + /// - Valid AWS credentials (via environment, IAM role, or credential file) + /// - The `LANCE_TEST_AWS_ROLE_ARN` environment variable set to a role ARN that + /// can be assumed by the current credentials + /// - Access to the S3 bucket `jack-lancedb-devland-us-east-1` + /// + /// Run with: `cargo test --features credential-vendor-aws -- --ignored` + #[cfg(test)] + mod integration { + use super::*; + use crate::DirectoryNamespaceBuilder; + use arrow::array::{Int32Array, StringArray}; + use arrow::datatypes::{DataType, Field, Schema}; + use arrow::ipc::writer::StreamWriter; + use arrow::record_batch::RecordBatch; + use bytes::Bytes; + use lance_namespace::models::*; + use lance_namespace::LanceNamespace; + use std::sync::Arc; + + const TEST_BUCKET: &str = "jack-lancedb-devland-us-east-1"; + + /// Helper to create Arrow IPC data for testing + fn create_test_arrow_data() -> Bytes { + let schema = Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ]); + + let batch = RecordBatch::try_new( + Arc::new(schema), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 3])), + Arc::new(StringArray::from(vec!["alice", "bob", "charlie"])), + ], + ) + .unwrap(); + + let mut buffer = Vec::new(); + { + let mut writer = StreamWriter::try_new(&mut buffer, &batch.schema()).unwrap(); + writer.write(&batch).unwrap(); + writer.finish().unwrap(); + } + + Bytes::from(buffer) + } + + /// Generate a unique test path for each test run to avoid conflicts + fn unique_test_path() -> String { + let timestamp = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis(); + format!("lance-test/credential-vending-{}", timestamp) + } + + /// Get the role ARN from environment variable + fn get_test_role_arn() -> Option { + std::env::var("LANCE_TEST_AWS_ROLE_ARN").ok() + } + + #[tokio::test] + #[ignore = "requires AWS credentials and LANCE_TEST_AWS_ROLE_ARN env var"] + async fn 
test_aws_credential_vending_basic() { + let role_arn = get_test_role_arn() + .expect("LANCE_TEST_AWS_ROLE_ARN must be set for integration tests"); + + let test_path = unique_test_path(); + let table_location = format!("s3://{}/{}/test_table", TEST_BUCKET, test_path); + + // Test Read permission + let read_config = AwsCredentialVendorConfig::new(&role_arn) + .with_duration_millis(900_000) // 15 minutes (minimum) + .with_region("us-east-1") + .with_permission(VendedPermission::Read); + + let read_vendor = AwsCredentialVendor::new(read_config) + .await + .expect("should create read vendor"); + + let read_creds = read_vendor + .vend_credentials(&table_location) + .await + .expect("should vend read credentials"); + + assert!( + read_creds.storage_options.contains_key("aws_access_key_id"), + "should have access key id" + ); + assert!( + read_creds + .storage_options + .contains_key("aws_secret_access_key"), + "should have secret access key" + ); + assert!( + read_creds.storage_options.contains_key("aws_session_token"), + "should have session token" + ); + assert!( + !read_creds.is_expired(), + "credentials should not be expired" + ); + assert_eq!( + read_vendor.permission(), + VendedPermission::Read, + "permission should be Read" + ); + + // Test Admin permission + let admin_config = AwsCredentialVendorConfig::new(&role_arn) + .with_duration_millis(900_000) + .with_region("us-east-1") + .with_permission(VendedPermission::Admin); + + let admin_vendor = AwsCredentialVendor::new(admin_config) + .await + .expect("should create admin vendor"); + + let admin_creds = admin_vendor + .vend_credentials(&table_location) + .await + .expect("should vend admin credentials"); + + assert!( + admin_creds + .storage_options + .contains_key("aws_access_key_id"), + "should have access key id" + ); + assert!( + !admin_creds.is_expired(), + "credentials should not be expired" + ); + assert_eq!( + admin_vendor.permission(), + VendedPermission::Admin, + "permission should be Admin" + ); + } + + 
#[tokio::test] + #[ignore = "requires AWS credentials and LANCE_TEST_AWS_ROLE_ARN env var"] + async fn test_directory_namespace_with_aws_credential_vending() { + let role_arn = get_test_role_arn() + .expect("LANCE_TEST_AWS_ROLE_ARN must be set for integration tests"); + + let test_path = unique_test_path(); + let root = format!("s3://{}/{}", TEST_BUCKET, test_path); + + // Build DirectoryNamespace with credential vending using short property names + let namespace = DirectoryNamespaceBuilder::new(&root) + .manifest_enabled(true) + .credential_vendor_property("enabled", "true") + .credential_vendor_property("aws_role_arn", &role_arn) + .credential_vendor_property("aws_duration_millis", "900000") // 15 minutes + .credential_vendor_property("aws_region", "us-east-1") + .credential_vendor_property("permission", "admin") + .build() + .await + .expect("should build namespace"); + + // Create a child namespace + let create_ns_req = CreateNamespaceRequest { + id: Some(vec!["test_ns".to_string()]), + properties: None, + mode: None, + }; + namespace + .create_namespace(create_ns_req) + .await + .expect("should create namespace"); + + // Create a table with data + let table_data = create_test_arrow_data(); + let create_table_req = CreateTableRequest { + id: Some(vec!["test_ns".to_string(), "test_table".to_string()]), + mode: Some("Create".to_string()), + }; + let create_response = namespace + .create_table(create_table_req, table_data) + .await + .expect("should create table"); + + assert!( + create_response.location.is_some(), + "should have location in response" + ); + assert_eq!(create_response.version, Some(1), "should be version 1"); + + // Describe the table (this should use vended credentials) + let describe_req = DescribeTableRequest { + id: Some(vec!["test_ns".to_string(), "test_table".to_string()]), + ..Default::default() + }; + let describe_response = namespace + .describe_table(describe_req) + .await + .expect("should describe table"); + + 
assert!(describe_response.location.is_some(), "should have location"); + assert!( + describe_response.storage_options.is_some(), + "should have storage_options with vended credentials" + ); + + let storage_options = describe_response.storage_options.unwrap(); + assert!( + storage_options.contains_key("aws_access_key_id"), + "should have vended aws_access_key_id" + ); + assert!( + storage_options.contains_key("aws_secret_access_key"), + "should have vended aws_secret_access_key" + ); + assert!( + storage_options.contains_key("aws_session_token"), + "should have vended aws_session_token" + ); + assert!( + storage_options.contains_key("expires_at_millis"), + "should have expires_at_millis" + ); + + // Verify expiration is in the future + let expires_at: u64 = storage_options + .get("expires_at_millis") + .unwrap() + .parse() + .expect("should parse expires_at_millis"); + let now_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + assert!( + expires_at > now_millis, + "expiration should be in the future" + ); + + // List tables to verify the table was created + let list_req = ListTablesRequest { + id: Some(vec!["test_ns".to_string()]), + page_token: None, + limit: None, + }; + let list_response = namespace + .list_tables(list_req) + .await + .expect("should list tables"); + assert!( + list_response.tables.contains(&"test_table".to_string()), + "should contain test_table" + ); + + // Clean up: drop the table + let drop_req = DropTableRequest { + id: Some(vec!["test_ns".to_string(), "test_table".to_string()]), + }; + namespace + .drop_table(drop_req) + .await + .expect("should drop table"); + + // Clean up: drop the namespace + let mut drop_ns_req = DropNamespaceRequest::new(); + drop_ns_req.id = Some(vec!["test_ns".to_string()]); + namespace + .drop_namespace(drop_ns_req) + .await + .expect("should drop namespace"); + } + + #[tokio::test] + #[ignore = "requires AWS credentials and LANCE_TEST_AWS_ROLE_ARN 
env var"] + async fn test_credential_refresh_on_expiration() { + let role_arn = get_test_role_arn() + .expect("LANCE_TEST_AWS_ROLE_ARN must be set for integration tests"); + + let test_path = unique_test_path(); + let table_location = format!("s3://{}/{}/refresh_test", TEST_BUCKET, test_path); + + // Create vendor with minimum duration and Admin permission + let config = AwsCredentialVendorConfig::new(&role_arn) + .with_duration_millis(900_000) // 15 minutes + .with_region("us-east-1") + .with_permission(VendedPermission::Admin); + + let vendor = AwsCredentialVendor::new(config) + .await + .expect("should create vendor"); + + // Vend credentials multiple times to verify consistent behavior + let creds1 = vendor + .vend_credentials(&table_location) + .await + .expect("should vend credentials first time"); + + let creds2 = vendor + .vend_credentials(&table_location) + .await + .expect("should vend credentials second time"); + + // Both should be valid (not expired) + assert!(!creds1.is_expired(), "first credentials should be valid"); + assert!(!creds2.is_expired(), "second credentials should be valid"); + + // Both should have access keys (they may be different due to new STS calls) + assert!( + creds1.storage_options.contains_key("aws_access_key_id"), + "first creds should have access key" + ); + assert!( + creds2.storage_options.contains_key("aws_access_key_id"), + "second creds should have access key" + ); + } + + #[tokio::test] + #[ignore = "requires AWS credentials and LANCE_TEST_AWS_ROLE_ARN env var"] + async fn test_scoped_policy_permissions() { + let role_arn = get_test_role_arn() + .expect("LANCE_TEST_AWS_ROLE_ARN must be set for integration tests"); + + let test_path = unique_test_path(); + + // Create two different table locations + let table1_location = format!("s3://{}/{}/table1", TEST_BUCKET, test_path); + let table2_location = format!("s3://{}/{}/table2", TEST_BUCKET, test_path); + + let config = AwsCredentialVendorConfig::new(&role_arn) + 
.with_duration_millis(900_000) + .with_region("us-east-1") + .with_permission(VendedPermission::Admin); + + let vendor = AwsCredentialVendor::new(config) + .await + .expect("should create vendor"); + + // Vend credentials for table1 + let creds1 = vendor + .vend_credentials(&table1_location) + .await + .expect("should vend credentials for table1"); + + // Vend credentials for table2 + let creds2 = vendor + .vend_credentials(&table2_location) + .await + .expect("should vend credentials for table2"); + + // Both should be valid + assert!(!creds1.is_expired(), "table1 credentials should be valid"); + assert!(!creds2.is_expired(), "table2 credentials should be valid"); + + // The credentials are scoped to their respective paths via IAM policy + // (the policy restricts access to specific S3 paths) + } + + #[tokio::test] + #[ignore = "requires AWS credentials and LANCE_TEST_AWS_ROLE_ARN env var"] + async fn test_from_properties_builder() { + let role_arn = get_test_role_arn() + .expect("LANCE_TEST_AWS_ROLE_ARN must be set for integration tests"); + + let test_path = unique_test_path(); + let root = format!("s3://{}/{}", TEST_BUCKET, test_path); + + // Build namespace using from_properties (simulating config from external source) + // Properties use the "credential_vendor." 
prefix which gets stripped + let mut properties = HashMap::new(); + properties.insert("root".to_string(), root.clone()); + properties.insert("manifest_enabled".to_string(), "true".to_string()); + properties.insert("credential_vendor.enabled".to_string(), "true".to_string()); + properties.insert( + "credential_vendor.aws_role_arn".to_string(), + role_arn.clone(), + ); + properties.insert( + "credential_vendor.aws_duration_millis".to_string(), + "900000".to_string(), + ); + properties.insert( + "credential_vendor.aws_region".to_string(), + "us-east-1".to_string(), + ); + properties.insert( + "credential_vendor.permission".to_string(), + "admin".to_string(), + ); + + let namespace = DirectoryNamespaceBuilder::from_properties(properties, None) + .expect("should parse properties") + .build() + .await + .expect("should build namespace"); + + // Verify namespace works + let create_ns_req = CreateNamespaceRequest { + id: Some(vec!["props_test".to_string()]), + properties: None, + mode: None, + }; + namespace + .create_namespace(create_ns_req) + .await + .expect("should create namespace"); + + // Clean up + let mut drop_ns_req = DropNamespaceRequest::new(); + drop_ns_req.id = Some(vec!["props_test".to_string()]); + namespace + .drop_namespace(drop_ns_req) + .await + .expect("should drop namespace"); + } + } +} diff --git a/rust/lance-namespace-impls/src/credentials/azure.rs b/rust/lance-namespace-impls/src/credentials/azure.rs new file mode 100644 index 00000000000..1d4e4ded081 --- /dev/null +++ b/rust/lance-namespace-impls/src/credentials/azure.rs @@ -0,0 +1,335 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Azure credential vending using SAS tokens. +//! +//! This module provides credential vending for Azure Blob Storage by generating +//! SAS (Shared Access Signature) tokens with user delegation keys. 
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use async_trait::async_trait;
+use azure_core::auth::TokenCredential;
+use azure_identity::DefaultAzureCredential;
+use azure_storage::prelude::*;
+use azure_storage_blobs::prelude::*;
+use lance_core::{Error, Result};
+use lance_io::object_store::uri_to_url;
+use log::{debug, info, warn};
+
+use super::{
+    redact_credential, CredentialVendor, VendedCredentials, VendedPermission,
+    DEFAULT_CREDENTIAL_DURATION_MILLIS,
+};
+
+/// Configuration for Azure credential vending.
+#[derive(Debug, Clone)]
+pub struct AzureCredentialVendorConfig {
+    /// Optional tenant ID for authentication.
+    pub tenant_id: Option<String>,
+
+    /// Storage account name. Required for credential vending.
+    pub account_name: Option<String>,
+
+    /// Duration for vended credentials in milliseconds.
+    /// Default: 3600000 (1 hour). Azure allows up to 7 days for SAS tokens.
+    pub duration_millis: u64,
+
+    /// Permission level for vended credentials.
+    /// Default: Read (full read access)
+    pub permission: VendedPermission,
+}
+
+impl Default for AzureCredentialVendorConfig {
+    fn default() -> Self {
+        Self {
+            tenant_id: None,
+            account_name: None,
+            duration_millis: DEFAULT_CREDENTIAL_DURATION_MILLIS,
+            permission: VendedPermission::default(),
+        }
+    }
+}
+
+impl AzureCredentialVendorConfig {
+    /// Create a new default config.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Set the tenant ID.
+    pub fn with_tenant_id(mut self, tenant_id: impl Into<String>) -> Self {
+        self.tenant_id = Some(tenant_id.into());
+        self
+    }
+
+    /// Set the storage account name.
+    pub fn with_account_name(mut self, account_name: impl Into<String>) -> Self {
+        self.account_name = Some(account_name.into());
+        self
+    }
+
+    /// Set the credential duration in milliseconds.
+    pub fn with_duration_millis(mut self, millis: u64) -> Self {
+        self.duration_millis = millis;
+        self
+    }
+
+    /// Set the permission level for vended credentials.
+ pub fn with_permission(mut self, permission: VendedPermission) -> Self { + self.permission = permission; + self + } +} + +/// Azure credential vendor that generates SAS tokens. +#[derive(Debug)] +pub struct AzureCredentialVendor { + config: AzureCredentialVendorConfig, +} + +impl AzureCredentialVendor { + /// Create a new Azure credential vendor with the specified configuration. + pub fn new(config: AzureCredentialVendorConfig) -> Self { + Self { config } + } + + /// Build SAS permissions based on the VendedPermission level. + /// + /// - Read: read + list + /// - Write: read + list + write + add + create + /// - Admin: read + list + write + add + create + delete + #[allow(clippy::field_reassign_with_default)] + fn build_sas_permissions(permission: VendedPermission) -> BlobSasPermissions { + let mut p = BlobSasPermissions::default(); + + // All permission levels have read access + p.read = true; + p.list = true; + + // Write and Admin have write access + if permission.can_write() { + p.write = true; + p.add = true; + p.create = true; + } + + // Admin has delete access + if permission.can_delete() { + p.delete = true; + } + + p + } + + /// Generate a SAS token for the specified container. 
+    async fn generate_sas_token(&self, account: &str, container: &str) -> Result<(String, u64)> {
+        let credential =
+            DefaultAzureCredential::create(azure_identity::TokenCredentialOptions::default())
+                .map_err(|e| Error::IO {
+                    source: Box::new(std::io::Error::other(format!(
+                        "Failed to create Azure credentials: {}",
+                        e
+                    ))),
+                    location: snafu::location!(),
+                })?;
+
+        let credential: Arc<dyn TokenCredential> = Arc::new(credential);
+
+        let blob_service_client = BlobServiceClient::new(account, credential.clone());
+
+        // Calculate times using time crate (which Azure SDK uses)
+        let now = time::OffsetDateTime::now_utc();
+        let duration_millis = self.config.duration_millis as i64;
+        let end_time = now + time::Duration::milliseconds(duration_millis);
+
+        // Azure limits user delegation key to 7 days
+        let max_key_end = now + time::Duration::days(7) - time::Duration::seconds(60);
+        let key_end_time = if end_time > max_key_end {
+            max_key_end
+        } else {
+            end_time
+        };
+
+        // Get user delegation key (note: typo in the library method name)
+        let user_delegation_key = blob_service_client
+            .get_user_deligation_key(now, key_end_time)
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to get user delegation key for account '{}': {}",
+                    account, e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        let permissions = Self::build_sas_permissions(self.config.permission);
+
+        // Generate SAS token for the container
+        let container_client = blob_service_client.container_client(container);
+
+        let sas_token = container_client
+            .user_delegation_shared_access_signature(
+                permissions,
+                &user_delegation_key.user_deligation_key,
+            )
+            .await
+            .map_err(|e| Error::IO {
+                source: Box::new(std::io::Error::other(format!(
+                    "Failed to generate SAS token for container '{}': {}",
+                    container, e
+                ))),
+                location: snafu::location!(),
+            })?;
+
+        let expires_at_millis =
+            (end_time.unix_timestamp() * 1000 + end_time.millisecond() as i64) as u64;
+
+        let token =
sas_token.token().map_err(|e| Error::IO {
+            source: Box::new(std::io::Error::other(format!(
+                "Failed to get SAS token: {}",
+                e
+            ))),
+            location: snafu::location!(),
+        })?;
+
+        Ok((token, expires_at_millis))
+    }
+}
+
+#[async_trait]
+impl CredentialVendor for AzureCredentialVendor {
+    async fn vend_credentials(&self, table_location: &str) -> Result<VendedCredentials> {
+        debug!(
+            "Azure credential vending: location={}, permission={}",
+            table_location, self.config.permission
+        );
+
+        let url = uri_to_url(table_location)?;
+
+        let container = url.host_str().ok_or_else(|| Error::InvalidInput {
+            source: format!("Azure URI '{}' missing container", table_location).into(),
+            location: snafu::location!(),
+        })?;
+
+        // Check if path extends beyond container level
+        let path = url.path().trim_start_matches('/');
+        if !path.is_empty() {
+            warn!(
+                "Azure SAS tokens are scoped to container level only. \
+                Credentials for '{}' will have access to entire container '{}', not just path '{}'",
+                table_location, container, path
+            );
+        }
+
+        let account =
+            self.config
+                .account_name
+                .as_ref()
+                .ok_or_else(|| Error::InvalidInput {
+                    source: "Azure credential vending requires 'credential_vendor.azure_account_name' to be set in configuration".into(),
+                    location: snafu::location!(),
+                })?;
+
+        let (sas_token, expires_at_millis) = self.generate_sas_token(account, container).await?;
+
+        let mut storage_options = HashMap::new();
+        // Use the standard key that object_store/lance-io expects
+        storage_options.insert("azure_storage_sas_token".to_string(), sas_token.clone());
+        storage_options.insert("azure_storage_account_name".to_string(), account.clone());
+        storage_options.insert(
+            "expires_at_millis".to_string(),
+            expires_at_millis.to_string(),
+        );
+
+        info!(
+            "Azure credentials vended: account={}, container={}, permission={}, expires_at={}, sas_token={}",
+            account, container, self.config.permission, expires_at_millis, redact_credential(&sas_token)
+        );
+
+
Ok(VendedCredentials::new(storage_options, expires_at_millis)) + } + + fn provider_name(&self) -> &'static str { + "azure" + } + + fn permission(&self) -> VendedPermission { + self.config.permission + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_config_builder() { + let config = AzureCredentialVendorConfig::new() + .with_tenant_id("my-tenant-id") + .with_account_name("myaccount") + .with_duration_millis(7200000); + + assert_eq!(config.tenant_id, Some("my-tenant-id".to_string())); + assert_eq!(config.account_name, Some("myaccount".to_string())); + assert_eq!(config.duration_millis, 7200000); + } + + #[test] + fn test_build_sas_permissions_read() { + let permissions = AzureCredentialVendor::build_sas_permissions(VendedPermission::Read); + + assert!(permissions.read, "Read permission should have read=true"); + assert!(permissions.list, "Read permission should have list=true"); + assert!( + !permissions.write, + "Read permission should have write=false" + ); + assert!(!permissions.add, "Read permission should have add=false"); + assert!( + !permissions.create, + "Read permission should have create=false" + ); + assert!( + !permissions.delete, + "Read permission should have delete=false" + ); + } + + #[test] + fn test_build_sas_permissions_write() { + let permissions = AzureCredentialVendor::build_sas_permissions(VendedPermission::Write); + + assert!(permissions.read, "Write permission should have read=true"); + assert!(permissions.list, "Write permission should have list=true"); + assert!(permissions.write, "Write permission should have write=true"); + assert!(permissions.add, "Write permission should have add=true"); + assert!( + permissions.create, + "Write permission should have create=true" + ); + assert!( + !permissions.delete, + "Write permission should have delete=false" + ); + } + + #[test] + fn test_build_sas_permissions_admin() { + let permissions = AzureCredentialVendor::build_sas_permissions(VendedPermission::Admin); + + 
assert!(permissions.read, "Admin permission should have read=true"); + assert!(permissions.list, "Admin permission should have list=true"); + assert!(permissions.write, "Admin permission should have write=true"); + assert!(permissions.add, "Admin permission should have add=true"); + assert!( + permissions.create, + "Admin permission should have create=true" + ); + assert!( + permissions.delete, + "Admin permission should have delete=true" + ); + } +} diff --git a/rust/lance-namespace-impls/src/credentials/gcp.rs b/rust/lance-namespace-impls/src/credentials/gcp.rs new file mode 100644 index 00000000000..ce4bac40fa1 --- /dev/null +++ b/rust/lance-namespace-impls/src/credentials/gcp.rs @@ -0,0 +1,637 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! GCP credential vending using downscoped OAuth2 tokens. +//! +//! This module provides credential vending for GCP Cloud Storage by obtaining +//! OAuth2 access tokens and downscoping them using Credential Access Boundaries (CAB). +//! +//! ## Authentication +//! +//! This module uses [Application Default Credentials (ADC)][adc] for authentication. +//! ADC automatically finds credentials based on the environment: +//! +//! 1. **`GOOGLE_APPLICATION_CREDENTIALS` environment variable**: Set this to the path +//! of a service account key file (JSON format) before starting the application. +//! 2. **Well-known file locations**: `~/.config/gcloud/application_default_credentials.json` +//! on Linux/macOS, or the equivalent on Windows. +//! 3. **Metadata server**: When running on GCP (Compute Engine, Cloud Run, GKE, etc.), +//! credentials are automatically obtained from the metadata server. +//! +//! For production deployments on GCP, using the metadata server (option 3) is recommended +//! as it doesn't require managing key files. +//! +//! [adc]: https://cloud.google.com/docs/authentication/application-default-credentials +//! +//! ## Service Account Impersonation +//! +//! 
For multi-tenant scenarios, you can configure `service_account` to impersonate a +//! different service account. The base credentials (from ADC) must have the +//! `roles/iam.serviceAccountTokenCreator` role on the target service account. +//! +//! ## Permission Scoping +//! +//! Permissions are enforced using GCP's Credential Access Boundaries: +//! - **Read**: `roles/storage.legacyObjectReader` + `roles/storage.objectViewer` (read and list) +//! - **Write**: Read permissions + `roles/storage.legacyBucketWriter` + `roles/storage.objectCreator` +//! - **Admin**: Write permissions + `roles/storage.objectAdmin` (includes delete) +//! +//! The downscoped token is restricted to the specific bucket and path prefix. +//! +//! Note: Legacy roles are used because modern roles like `storage.objectCreator` lack +//! `storage.buckets.get` which many client libraries require. + +use std::collections::HashMap; + +use async_trait::async_trait; +use google_cloud_auth::credentials; +use lance_core::{Error, Result}; +use lance_io::object_store::uri_to_url; +use log::{debug, info}; +use reqwest::Client; +use serde::{Deserialize, Serialize}; + +use super::{redact_credential, CredentialVendor, VendedCredentials, VendedPermission}; + +/// GCP STS token exchange endpoint for downscoping credentials. +const STS_TOKEN_EXCHANGE_URL: &str = "https://sts.googleapis.com/v1/token"; + +/// Configuration for GCP credential vending. +#[derive(Debug, Clone, Default)] +pub struct GcpCredentialVendorConfig { + /// Optional service account to impersonate. + /// + /// When set, the vendor will impersonate this service account using the + /// IAM Credentials API's generateAccessToken endpoint before downscoping. + /// This is useful for multi-tenant scenarios where you want to issue tokens + /// on behalf of different service accounts. + /// + /// The base credentials (from ADC) must have the `roles/iam.serviceAccountTokenCreator` + /// role on this service account. 
+ /// + /// Format: `my-sa@project.iam.gserviceaccount.com`
+ pub service_account: Option<String>,
+
+ /// Permission level for vended credentials.
+ /// Default: Read
+ /// Permissions are enforced via Credential Access Boundaries (CAB).
+ ///
+ /// Note: GCP token duration cannot be configured; the token lifetime
+ /// is determined by the STS endpoint (typically 1 hour).
+ pub permission: VendedPermission,
+}
+
+impl GcpCredentialVendorConfig {
+ /// Create a new default config.
+ pub fn new() -> Self {
+ Self::default()
+ }
+
+ /// Set the service account to impersonate.
+ ///
+ /// When set, the vendor uses the IAM Credentials API to generate an access
+ /// token for this service account, then downscopes it with CAB.
+ ///
+ /// The base credentials (from ADC) must have the `roles/iam.serviceAccountTokenCreator`
+ /// role on this service account.
+ pub fn with_service_account(mut self, service_account: impl Into<String>) -> Self {
+ self.service_account = Some(service_account.into());
+ self
+ }
+
+ /// Set the permission level for vended credentials.
+ pub fn with_permission(mut self, permission: VendedPermission) -> Self {
+ self.permission = permission;
+ self
+ }
+}
+
+/// Access boundary rule for a single resource.
+#[derive(Debug, Serialize)]
+#[serde(rename_all = "camelCase")]
+struct AccessBoundaryRule {
+ available_resource: String,
+ available_permissions: Vec<String>,
+ #[serde(skip_serializing_if = "Option::is_none")]
+ availability_condition: Option<AvailabilityCondition>,
+}
+
+/// Condition for access boundary rule.
+#[derive(Debug, Clone, Serialize)]
+struct AvailabilityCondition {
+ expression: String,
+}
+
+/// Credential Access Boundary structure.
+#[derive(Debug, Serialize)]
+#[serde(rename_all = "camelCase")]
+struct CredentialAccessBoundary {
+ access_boundary: AccessBoundaryInner,
+}
+
+#[derive(Debug, Serialize)]
+#[serde(rename_all = "camelCase")]
+struct AccessBoundaryInner {
+ access_boundary_rules: Vec<AccessBoundaryRule>,
+}
+
+/// Response from STS token exchange.
+#[derive(Debug, Deserialize)]
+struct TokenExchangeResponse {
+ access_token: String,
+ #[serde(default)]
+ expires_in: Option<u64>,
+}
+
+/// Response from IAM generateAccessToken API.
+#[derive(Debug, Deserialize)]
+#[serde(rename_all = "camelCase")]
+struct GenerateAccessTokenResponse {
+ access_token: String,
+ #[allow(dead_code)]
+ expire_time: String,
+}
+
+/// GCP credential vendor that provides downscoped OAuth2 tokens.
+pub struct GcpCredentialVendor {
+ config: GcpCredentialVendorConfig,
+ http_client: Client,
+ credential: credentials::Credential,
+}
+
+impl std::fmt::Debug for GcpCredentialVendor {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ f.debug_struct("GcpCredentialVendor")
+ .field("config", &self.config)
+ .field("credential", &"[credential]")
+ .finish()
+ }
+}
+
+impl GcpCredentialVendor {
+ /// Create a new GCP credential vendor with the specified configuration.
+ ///
+ /// Uses [Application Default Credentials (ADC)][adc] for authentication.
+ /// To use a service account key file, set the `GOOGLE_APPLICATION_CREDENTIALS`
+ /// environment variable to the file path before starting the application.
+ ///
+ /// [adc]: https://cloud.google.com/docs/authentication/application-default-credentials
+ pub async fn new(config: GcpCredentialVendorConfig) -> Result<Self> {
+ let credential = credentials::create_access_token_credential()
+ .await
+ .map_err(|e| Error::IO {
+ source: Box::new(std::io::Error::other(format!(
+ "Failed to create GCP credentials: {}",
+ e
+ ))),
+ location: snafu::location!(),
+ })?;
+
+ Ok(Self {
+ config,
+ http_client: Client::new(),
+ credential,
+ })
+ }
+
+ /// Parse a GCS URI to extract bucket and prefix.
+ fn parse_gcs_uri(uri: &str) -> Result<(String, String)> {
+ let url = uri_to_url(uri)?;
+
+ if url.scheme() != "gs" {
+ return Err(Error::InvalidInput {
+ source: format!(
+ "Unsupported GCS URI scheme '{}', expected 'gs'",
+ url.scheme()
+ )
+ .into(),
+ location: snafu::location!(),
+ });
+ }
+
+ let bucket = url
+ .host_str()
+ .ok_or_else(|| Error::InvalidInput {
+ source: format!("GCS URI '{}' missing bucket", uri).into(),
+ location: snafu::location!(),
+ })?
+ .to_string();
+
+ let prefix = url.path().trim_start_matches('/').to_string();
+
+ Ok((bucket, prefix))
+ }
+
+ /// Get a source token for downscoping.
+ ///
+ /// If service_account is configured, impersonates that service account
+ /// using the IAM Credentials API. Otherwise, uses the configured credential
+ /// directly.
+ async fn get_source_token(&self) -> Result<String> {
+ let base_token = self.credential.get_token().await.map_err(|e| Error::IO {
+ source: Box::new(std::io::Error::other(format!(
+ "Failed to get GCP token: {}",
+ e
+ ))),
+ location: snafu::location!(),
+ })?;
+
+ // If service account impersonation is configured, use generateAccessToken API
+ if let Some(ref service_account) = self.config.service_account {
+ return self
+ .impersonate_service_account(&base_token.token, service_account)
+ .await;
+ }
+
+ Ok(base_token.token)
+ }
+
+ /// Impersonate a service account using the IAM Credentials API.
+ ///
+ /// Uses the base token to call generateAccessToken for the target service account.
+ async fn impersonate_service_account(
+ &self,
+ base_token: &str,
+ service_account: &str,
+ ) -> Result<String> {
+ let url = format!(
+ "https://iamcredentials.googleapis.com/v1/projects/-/serviceAccounts/{}:generateAccessToken",
+ service_account
+ );
+
+ // Request body with cloud-platform scope (required for GCS access)
+ let body = serde_json::json!({
+ "scope": ["https://www.googleapis.com/auth/cloud-platform"]
+ });
+
+ let response = self
+ .http_client
+ .post(&url)
+ .bearer_auth(base_token)
+ .json(&body)
+ .send()
+ .await
+ .map_err(|e| Error::IO {
+ source: Box::new(std::io::Error::other(format!(
+ "Failed to call IAM generateAccessToken: {}",
+ e
+ ))),
+ location: snafu::location!(),
+ })?;
+
+ if !response.status().is_success() {
+ let status = response.status();
+ let body = response
+ .text()
+ .await
+ .unwrap_or_else(|_| "unknown error".to_string());
+ return Err(Error::IO {
+ source: Box::new(std::io::Error::other(format!(
+ "IAM generateAccessToken failed for '{}' with status {}: {}",
+ service_account, status, body
+ ))),
+ location: snafu::location!(),
+ });
+ }
+
+ let token_response: GenerateAccessTokenResponse =
+ response.json().await.map_err(|e| Error::IO {
+ source: Box::new(std::io::Error::other(format!(
+ "Failed to parse generateAccessToken response: {}",
+ e
+ ))),
+ location: snafu::location!(),
+ })?;
+
+ Ok(token_response.access_token)
+ }
+
+ /// Build Credential Access Boundary for the specified bucket/prefix and permission.
+ fn build_access_boundary(
+ bucket: &str,
+ prefix: &str,
+ permission: VendedPermission,
+ ) -> CredentialAccessBoundary {
+ let bucket_resource = format!("//storage.googleapis.com/projects/_/buckets/{}", bucket);
+
+ let mut rules = vec![];
+
+ // Build condition expression for path restriction
+ let condition = if prefix.is_empty() {
+ None
+ } else {
+ let prefix_trimmed = prefix.trim_end_matches('/');
+ // CEL expression to restrict access to the specific path prefix.
+ // We append '/' to ensure exact prefix matching - without it, prefix "data" + // would incorrectly match "data-other/file.txt". + // + // For object access: resource.name must start with "prefix/" + // For list operations: listPrefix must equal "prefix" OR start with "prefix/" + let list_prefix_attr = + "api.getAttribute('storage.googleapis.com/objectListPrefix', '')"; + let expr = format!( + "resource.name.startsWith('projects/_/buckets/{}/objects/{}/') || \ + {list_attr} == '{prefix}' || {list_attr}.startsWith('{prefix}/')", + bucket, + prefix_trimmed, + list_attr = list_prefix_attr, + prefix = prefix_trimmed + ); + Some(AvailabilityCondition { expression: expr }) + }; + + // Read permissions: legacyObjectReader for read + objectViewer for list + // Using legacy roles because modern roles lack storage.buckets.get + rules.push(AccessBoundaryRule { + available_resource: bucket_resource.clone(), + available_permissions: vec![ + "inRole:roles/storage.legacyObjectReader".to_string(), + "inRole:roles/storage.objectViewer".to_string(), + ], + availability_condition: condition.clone(), + }); + + // Write permission: legacyBucketWriter + objectCreator for create/update + if permission.can_write() { + rules.push(AccessBoundaryRule { + available_resource: bucket_resource.clone(), + available_permissions: vec![ + "inRole:roles/storage.legacyBucketWriter".to_string(), + "inRole:roles/storage.objectCreator".to_string(), + ], + availability_condition: condition.clone(), + }); + } + + // Admin permission: objectAdmin for delete + if permission.can_delete() { + rules.push(AccessBoundaryRule { + available_resource: bucket_resource, + available_permissions: vec!["inRole:roles/storage.objectAdmin".to_string()], + availability_condition: condition, + }); + } + + CredentialAccessBoundary { + access_boundary: AccessBoundaryInner { + access_boundary_rules: rules, + }, + } + } + + /// Exchange source token for a downscoped token using STS. 
+ async fn downscope_token( + &self, + source_token: &str, + access_boundary: &CredentialAccessBoundary, + ) -> Result<(String, u64)> { + let options_json = serde_json::to_string(access_boundary).map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to serialize access boundary: {}", + e + ))), + location: snafu::location!(), + })?; + + let params = [ + ( + "grant_type", + "urn:ietf:params:oauth:grant-type:token-exchange", + ), + ( + "subject_token_type", + "urn:ietf:params:oauth:token-type:access_token", + ), + ( + "requested_token_type", + "urn:ietf:params:oauth:token-type:access_token", + ), + ("subject_token", source_token), + ("options", &options_json), + ]; + + let response = self + .http_client + .post(STS_TOKEN_EXCHANGE_URL) + .form(¶ms) + .send() + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to call STS token exchange: {}", + e + ))), + location: snafu::location!(), + })?; + + if !response.status().is_success() { + let status = response.status(); + let body = response + .text() + .await + .unwrap_or_else(|_| "unknown error".to_string()); + return Err(Error::IO { + source: Box::new(std::io::Error::other(format!( + "STS token exchange failed with status {}: {}", + status, body + ))), + location: snafu::location!(), + }); + } + + let token_response: TokenExchangeResponse = + response.json().await.map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to parse STS response: {}", + e + ))), + location: snafu::location!(), + })?; + + // Calculate expiration time + // Use expires_in from response if available, otherwise default to 1 hour + let expires_in_secs = token_response.expires_in.unwrap_or(3600); + let expires_at_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .expect("time went backwards") + .as_millis() as u64 + + expires_in_secs * 1000; + + Ok((token_response.access_token, expires_at_millis)) + } +} + 
+#[async_trait]
+impl CredentialVendor for GcpCredentialVendor {
+ async fn vend_credentials(&self, table_location: &str) -> Result<VendedCredentials> {
+ debug!(
+ "GCP credential vending: location={}, permission={}",
+ table_location, self.config.permission
+ );
+
+ let (bucket, prefix) = Self::parse_gcs_uri(table_location)?;
+
+ // Get source token from default credentials
+ let source_token = self.get_source_token().await?;
+
+ // Build access boundary for this location and permission
+ let access_boundary = Self::build_access_boundary(&bucket, &prefix, self.config.permission);
+
+ // Exchange for downscoped token
+ let (downscoped_token, expires_at_millis) = self
+ .downscope_token(&source_token, &access_boundary)
+ .await?;
+
+ let mut storage_options = HashMap::new();
+ storage_options.insert("google_storage_token".to_string(), downscoped_token.clone());
+ storage_options.insert(
+ "expires_at_millis".to_string(),
+ expires_at_millis.to_string(),
+ );
+
+ info!(
+ "GCP credentials vended: bucket={}, prefix={}, permission={}, expires_at={}, token={}",
+ bucket,
+ prefix,
+ self.config.permission,
+ expires_at_millis,
+ redact_credential(&downscoped_token)
+ );
+
+ Ok(VendedCredentials::new(storage_options, expires_at_millis))
+ }
+
+ fn provider_name(&self) -> &'static str {
+ "gcp"
+ }
+
+ fn permission(&self) -> VendedPermission {
+ self.config.permission
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_parse_gcs_uri() {
+ let (bucket, prefix) = GcpCredentialVendor::parse_gcs_uri("gs://my-bucket/path/to/table")
+ .expect("should parse");
+ assert_eq!(bucket, "my-bucket");
+ assert_eq!(prefix, "path/to/table");
+
+ let (bucket, prefix) =
+ GcpCredentialVendor::parse_gcs_uri("gs://my-bucket/").expect("should parse");
+ assert_eq!(bucket, "my-bucket");
+ assert_eq!(prefix, "");
+
+ let (bucket, prefix) =
+ GcpCredentialVendor::parse_gcs_uri("gs://my-bucket").expect("should parse");
+ assert_eq!(bucket, "my-bucket");
+ assert_eq!(prefix, "");
+ }
+
#[test] + fn test_parse_gcs_uri_invalid() { + // Wrong scheme - should fail + let result = GcpCredentialVendor::parse_gcs_uri("s3://bucket/path"); + assert!(result.is_err()); + + // Missing bucket + let result = GcpCredentialVendor::parse_gcs_uri("gs:///path"); + assert!(result.is_err()); + + // Invalid URI format + let result = GcpCredentialVendor::parse_gcs_uri("not-a-uri"); + assert!(result.is_err()); + + // Empty string + let result = GcpCredentialVendor::parse_gcs_uri(""); + assert!(result.is_err()); + } + + #[test] + fn test_config_builder() { + let config = GcpCredentialVendorConfig::new() + .with_service_account("my-sa@project.iam.gserviceaccount.com") + .with_permission(VendedPermission::Write); + + assert_eq!( + config.service_account, + Some("my-sa@project.iam.gserviceaccount.com".to_string()) + ); + assert_eq!(config.permission, VendedPermission::Write); + } + + #[test] + fn test_build_access_boundary_read() { + let boundary = GcpCredentialVendor::build_access_boundary( + "my-bucket", + "path/to/data", + VendedPermission::Read, + ); + + let rules = &boundary.access_boundary.access_boundary_rules; + assert_eq!(rules.len(), 1, "Read should have 1 rule"); + + let permissions = &rules[0].available_permissions; + assert!(permissions.contains(&"inRole:roles/storage.legacyObjectReader".to_string())); + assert!(permissions.contains(&"inRole:roles/storage.objectViewer".to_string())); + assert!(rules[0].availability_condition.is_some()); + } + + #[test] + fn test_build_access_boundary_write() { + let boundary = GcpCredentialVendor::build_access_boundary( + "my-bucket", + "path/to/data", + VendedPermission::Write, + ); + + let rules = &boundary.access_boundary.access_boundary_rules; + assert_eq!(rules.len(), 2, "Write should have 2 rules"); + + let permissions: Vec<_> = rules + .iter() + .flat_map(|r| r.available_permissions.iter()) + .collect(); + assert!(permissions.contains(&&"inRole:roles/storage.legacyObjectReader".to_string())); + 
assert!(permissions.contains(&&"inRole:roles/storage.objectViewer".to_string())); + assert!(permissions.contains(&&"inRole:roles/storage.legacyBucketWriter".to_string())); + assert!(permissions.contains(&&"inRole:roles/storage.objectCreator".to_string())); + } + + #[test] + fn test_build_access_boundary_admin() { + let boundary = GcpCredentialVendor::build_access_boundary( + "my-bucket", + "path/to/data", + VendedPermission::Admin, + ); + + let rules = &boundary.access_boundary.access_boundary_rules; + assert_eq!(rules.len(), 3, "Admin should have 3 rules"); + + let permissions: Vec<_> = rules + .iter() + .flat_map(|r| r.available_permissions.iter()) + .collect(); + assert!(permissions.contains(&&"inRole:roles/storage.legacyObjectReader".to_string())); + assert!(permissions.contains(&&"inRole:roles/storage.objectViewer".to_string())); + assert!(permissions.contains(&&"inRole:roles/storage.legacyBucketWriter".to_string())); + assert!(permissions.contains(&&"inRole:roles/storage.objectCreator".to_string())); + assert!(permissions.contains(&&"inRole:roles/storage.objectAdmin".to_string())); + } + + #[test] + fn test_build_access_boundary_no_prefix() { + let boundary = + GcpCredentialVendor::build_access_boundary("my-bucket", "", VendedPermission::Read); + + let rules = &boundary.access_boundary.access_boundary_rules; + assert_eq!(rules.len(), 1); + // No condition when prefix is empty (full bucket access) + assert!(rules[0].availability_condition.is_none()); + } +} diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index fdb4370f6ab..91714d73d90 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -33,6 +33,10 @@ use lance_core::{box_error, Error, Result}; use lance_namespace::schema::arrow_schema_to_json; use lance_namespace::LanceNamespace; +use crate::credentials::{ + create_credential_vendor_for_location, has_credential_vendor_config, CredentialVendor, +}; + /// Builder for 
creating a DirectoryNamespace. /// /// This builder provides a fluent API for configuring and establishing @@ -75,6 +79,7 @@ pub struct DirectoryNamespaceBuilder { manifest_enabled: bool, dir_listing_enabled: bool, inline_optimization_enabled: bool, + credential_vendor_properties: HashMap, } impl DirectoryNamespaceBuilder { @@ -91,6 +96,7 @@ impl DirectoryNamespaceBuilder { manifest_enabled: true, dir_listing_enabled: true, // Default to enabled for backwards compatibility inline_optimization_enabled: true, + credential_vendor_properties: HashMap::new(), } } @@ -132,6 +138,29 @@ impl DirectoryNamespaceBuilder { /// - `inline_optimization_enabled`: Enable inline optimization of __manifest table (optional, default: true) /// - `storage.*`: Storage options (optional, prefix will be stripped) /// + /// Credential vendor properties (prefixed with `credential_vendor.`, prefix is stripped): + /// - `credential_vendor.enabled`: Set to "true" to enable credential vending (required) + /// - `credential_vendor.permission`: Permission level: read, write, or admin (default: read) + /// + /// AWS-specific properties (for s3:// locations): + /// - `credential_vendor.aws_role_arn`: AWS IAM role ARN (required for AWS) + /// - `credential_vendor.aws_external_id`: AWS external ID (optional) + /// - `credential_vendor.aws_region`: AWS region (optional) + /// - `credential_vendor.aws_role_session_name`: AWS role session name (optional) + /// - `credential_vendor.aws_duration_millis`: Credential duration in ms (default: 3600000, range: 15min-12hrs) + /// + /// GCP-specific properties (for gs:// locations): + /// - `credential_vendor.gcp_service_account`: Service account to impersonate (optional) + /// + /// Note: GCP uses Application Default Credentials (ADC). To use a service account key file, + /// set the `GOOGLE_APPLICATION_CREDENTIALS` environment variable before starting. + /// GCP token duration cannot be configured; it's determined by the STS endpoint (typically 1 hour). 
+ /// + /// Azure-specific properties (for az:// locations):
+ /// - `credential_vendor.azure_account_name`: Azure storage account name (required for Azure)
+ /// - `credential_vendor.azure_tenant_id`: Azure tenant ID (optional)
+ /// - `credential_vendor.azure_duration_millis`: Credential duration in ms (default: 3600000, up to 7 days)
+ ///
 /// # Arguments
 ///
 /// * `properties` - Configuration properties
@@ -209,6 +238,17 @@ impl DirectoryNamespaceBuilder {
 .and_then(|v| v.parse::<bool>().ok())
 .unwrap_or(true);
+
+ // Extract credential vendor properties (properties prefixed with "credential_vendor.")
+ // The prefix is stripped to get short property names
+ // The build() method will check if enabled=true before creating the vendor
+ let credential_vendor_properties: HashMap<String, String> = properties
+ .iter()
+ .filter_map(|(k, v)| {
+ k.strip_prefix("credential_vendor.")
+ .map(|key| (key.to_string(), v.clone()))
+ })
+ .collect();
+
 Ok(Self {
 root: root.trim_end_matches('/').to_string(),
 storage_options,
@@ -216,6 +256,7 @@
 manifest_enabled,
 dir_listing_enabled,
 inline_optimization_enabled,
+ credential_vendor_properties,
 })
 }
@@ -258,6 +299,55 @@
 self
 }
+ /// Add a credential vendor property.
+ ///
+ /// Use short property names without the `credential_vendor.` prefix.
+ /// Common properties: `enabled`, `permission`.
+ /// AWS properties: `aws_role_arn`, `aws_external_id`, `aws_region`, `aws_role_session_name`, `aws_duration_millis`.
+ /// GCP properties: `gcp_service_account`.
+ /// Azure properties: `azure_account_name`, `azure_tenant_id`, `azure_duration_millis`.
+ /// + /// # Arguments
+ ///
+ /// * `key` - Property key (e.g., "enabled", "aws_role_arn")
+ /// * `value` - Property value
+ ///
+ /// # Example
+ ///
+ /// ```no_run
+ /// # use lance_namespace_impls::DirectoryNamespaceBuilder;
+ /// # async fn example() -> Result<(), Box<dyn std::error::Error>> {
+ /// let namespace = DirectoryNamespaceBuilder::new("s3://my-bucket/data")
+ /// .credential_vendor_property("enabled", "true")
+ /// .credential_vendor_property("aws_role_arn", "arn:aws:iam::123456789012:role/MyRole")
+ /// .credential_vendor_property("permission", "read")
+ /// .build()
+ /// .await?;
+ /// # Ok(())
+ /// # }
+ /// ```
+ pub fn credential_vendor_property(
+ mut self,
+ key: impl Into<String>,
+ value: impl Into<String>,
+ ) -> Self {
+ self.credential_vendor_properties
+ .insert(key.into(), value.into());
+ self
+ }
+
+ /// Add multiple credential vendor properties.
+ ///
+ /// Use short property names without the `credential_vendor.` prefix.
+ ///
+ /// # Arguments
+ ///
+ /// * `properties` - HashMap of credential vendor properties to add
+ pub fn credential_vendor_properties(mut self, properties: HashMap<String, String>) -> Self {
+ self.credential_vendor_properties.extend(properties);
+ self
+ }
+
 /// Build the DirectoryNamespace.
 ///
 /// # Returns
@@ -300,6 +390,16 @@
 None
 };
+ // Create credential vendor once during initialization if enabled
+ let credential_vendor = if has_credential_vendor_config(&self.credential_vendor_properties)
+ {
+ create_credential_vendor_for_location(&self.root, &self.credential_vendor_properties)
+ .await?
+ .map(Arc::from) + } else { + None + }; + Ok(DirectoryNamespace { root: self.root, storage_options: self.storage_options, @@ -308,6 +408,7 @@ impl DirectoryNamespaceBuilder { base_path, manifest_ns, dir_listing_enabled: self.dir_listing_enabled, + credential_vendor, }) } @@ -357,6 +458,14 @@ impl DirectoryNamespaceBuilder { /// /// When `dir_listing_enabled=true`, the namespace falls back to directory scanning for tables not /// found in the manifest, enabling gradual migration. +/// +/// ## Credential Vending +/// +/// When credential vendor properties are configured, `describe_table` will vend temporary +/// credentials based on the table location URI. The vendor type is auto-selected: +/// - `s3://` locations use AWS STS AssumeRole +/// - `gs://` locations use GCP OAuth2 tokens +/// - `az://` locations use Azure SAS tokens pub struct DirectoryNamespace { root: String, storage_options: Option>, @@ -366,6 +475,9 @@ pub struct DirectoryNamespace { base_path: Path, manifest_ns: Option>, dir_listing_enabled: bool, + /// Credential vendor created once during initialization. + /// Used to vend temporary credentials for table access. + credential_vendor: Option>, } impl std::fmt::Debug for DirectoryNamespace { @@ -496,6 +608,35 @@ impl DirectoryNamespace { .child(".lance-reserved") } + /// Get storage options for a table, using credential vending if configured. + /// + /// If credential vendor properties are configured and the table location matches + /// a supported cloud provider, this will create an appropriate vendor and vend + /// temporary credentials scoped to the table location. Otherwise, returns the + /// static storage options. 
+ /// + /// The vendor type is auto-selected based on the table URI:
+ /// - `s3://` locations use AWS STS AssumeRole
+ /// - `gs://` locations use GCP OAuth2 tokens
+ /// - `az://` locations use Azure SAS tokens
+ ///
+ /// The permission level (Read, Write, Admin) is configured at namespace
+ /// initialization time via the `credential_vendor_permission` property.
+ ///
+ /// # Arguments
+ ///
+ /// * `table_uri` - The full URI of the table
+ async fn get_storage_options_for_table(
+ &self,
+ table_uri: &str,
+ ) -> Result<Option<HashMap<String, String>>> {
+ if let Some(ref vendor) = self.credential_vendor {
+ let vended = vendor.vend_credentials(table_uri).await?;
+ return Ok(Some(vended.storage_options));
+ }
+ Ok(self.storage_options.clone())
+ }
+
 /// Migrate directory-based tables to the manifest.
 ///
 /// This is a one-time migration operation that:
@@ -776,6 +917,8 @@ impl LanceNamespace for DirectoryNamespace {
 let lance_schema = dataset.schema();
 let arrow_schema: arrow_schema::Schema = lance_schema.into();
 let json_schema = arrow_schema_to_json(&arrow_schema)?;
+ let storage_options = self.get_storage_options_for_table(&table_uri).await?;
+
 Ok(DescribeTableResponse {
 table: Some(table_name),
 namespace: request.id.as_ref().map(|id| {
@@ -789,7 +932,7 @@
 location: Some(table_uri.clone()),
 table_uri: Some(table_uri),
 schema: Some(Box::new(json_schema)),
- storage_options: self.storage_options.clone(),
+ storage_options,
 stats: None,
 })
 }
@@ -801,6 +944,7 @@
 .await
 .unwrap_or(false)
 {
+ let storage_options = self.get_storage_options_for_table(&table_uri).await?;
 Ok(DescribeTableResponse {
 table: Some(table_name),
 namespace: request.id.as_ref().map(|id| {
@@ -814,7 +958,7 @@
 location: Some(table_uri.clone()),
 table_uri: Some(table_uri),
 schema: None,
- storage_options: self.storage_options.clone(),
+ storage_options,
 stats: None,
 })
 } else {
diff --git
a/rust/lance-namespace-impls/src/lib.rs b/rust/lance-namespace-impls/src/lib.rs index 634199ce98a..88248841bcb 100644 --- a/rust/lance-namespace-impls/src/lib.rs +++ b/rust/lance-namespace-impls/src/lib.rs @@ -10,12 +10,49 @@ //! - `rest`: REST API-based namespace implementation //! - `rest-adapter`: REST server adapter that exposes any namespace via HTTP //! - `dir-aws`, `dir-azure`, `dir-gcp`, `dir-oss`: Cloud storage backend support for directory namespace (via lance-io) +//! - `credential-vendor-aws`, `credential-vendor-gcp`, `credential-vendor-azure`: Credential vending for cloud storage //! //! ## Implementations //! //! - `DirectoryNamespace`: Directory-based implementation (always available) //! - `RestNamespace`: REST API-based implementation (requires `rest` feature) //! +//! ## Credential Vending +//! +//! The `credentials` module provides temporary credential vending for cloud storage: +//! - AWS: STS AssumeRole with scoped IAM policies (requires `credential-vendor-aws` feature) +//! - GCP: OAuth2 tokens with access boundaries (requires `credential-vendor-gcp` feature) +//! - Azure: SAS tokens with user delegation keys (requires `credential-vendor-azure` feature) +//! +//! The credential vendor is automatically selected based on the table location URI scheme: +//! - `s3://` for AWS +//! - `gs://` for GCP +//! - `az://` for Azure +//! +//! Configuration properties (prefixed with `credential_vendor.`, prefix is stripped): +//! +//! ```text +//! # Required to enable credential vending +//! credential_vendor.enabled = "true" +//! +//! # Common properties (apply to all providers) +//! credential_vendor.permission = "read" # read, write, or admin (default: read) +//! +//! # AWS-specific properties (for s3:// locations) +//! credential_vendor.aws_role_arn = "arn:aws:iam::123456789012:role/MyRole" # required for AWS +//! credential_vendor.aws_duration_millis = "3600000" # 1 hour (default, range: 15min-12hrs) +//! +//! 
# GCP-specific properties (for gs:// locations) +//! # Note: GCP uses ADC; set GOOGLE_APPLICATION_CREDENTIALS env var for service account key +//! # Note: GCP token duration cannot be configured; it's determined by the STS endpoint +//! credential_vendor.gcp_service_account = "my-sa@project.iam.gserviceaccount.com" +//! +//! # Azure-specific properties (for az:// locations) +//! credential_vendor.azure_account_name = "mystorageaccount" # required for Azure +//! credential_vendor.azure_tenant_id = "my-tenant-id" +//! credential_vendor.azure_duration_millis = "3600000" # 1 hour (default, up to 7 days) +//! ``` +//! //! ## Usage //! //! The recommended way to connect to a namespace is using [`ConnectBuilder`]: @@ -32,6 +69,7 @@ //! ``` pub mod connect; +pub mod credentials; pub mod dir; #[cfg(feature = "rest")] @@ -44,6 +82,27 @@ pub mod rest_adapter; pub use connect::ConnectBuilder; pub use dir::{manifest::ManifestNamespace, DirectoryNamespace, DirectoryNamespaceBuilder}; +// Re-export credential vending +pub use credentials::{ + create_credential_vendor_for_location, detect_provider_from_uri, has_credential_vendor_config, + redact_credential, CredentialVendor, VendedCredentials, DEFAULT_CREDENTIAL_DURATION_MILLIS, +}; + +#[cfg(feature = "credential-vendor-aws")] +pub use credentials::aws::{AwsCredentialVendor, AwsCredentialVendorConfig}; +#[cfg(feature = "credential-vendor-aws")] +pub use credentials::aws_props; + +#[cfg(feature = "credential-vendor-gcp")] +pub use credentials::gcp::{GcpCredentialVendor, GcpCredentialVendorConfig}; +#[cfg(feature = "credential-vendor-gcp")] +pub use credentials::gcp_props; + +#[cfg(feature = "credential-vendor-azure")] +pub use credentials::azure::{AzureCredentialVendor, AzureCredentialVendorConfig}; +#[cfg(feature = "credential-vendor-azure")] +pub use credentials::azure_props; + #[cfg(feature = "rest")] pub use rest::{RestNamespace, RestNamespaceBuilder}; From 85a96856adfffe64cc8a36329e09f8052e9a6873 Mon Sep 17 00:00:00 2001 From: 
Jack Ye Date: Tue, 20 Jan 2026 21:09:38 -0800 Subject: [PATCH 6/8] feat: upgrade lance-namespace to 0.4.0 and 0.4.5 (#5568, #5611) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Combined patches for upgrading lance-namespace. Changes from 0.4.0 (#5568): 1. Introduced full error handling spec, update rust interface to implement the spec, and also dir and rest implementations in rust, python, java based on it 2. Fixed that `create_empty_table` is deprecated and `declare_table` is introduced. 3. Added `deregister_table` support for dir namespace (without manifest) Changes from 0.4.5 (#5611): 1. Use `Default::default()` for constructing request and response models 2. Leverage newly added identity to cache vended credentials 3. Support newly added `load_detailed_metadata` and `vend_credentials` flags in requests Also made ToSnafuLocation trait public to fix compilation on release branch. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- Cargo.lock | 6 +- Cargo.toml | 3 +- java/lance-jni/Cargo.lock | 6 +- java/lance-jni/src/error.rs | 124 ++- java/lance-jni/src/namespace.rs | 36 + java/pom.xml | 4 +- .../java/org/lance/WriteDatasetBuilder.java | 31 +- .../lance/namespace/DirectoryNamespace.java | 10 + .../org/lance/namespace/RestNamespace.java | 10 + .../org/lance/NamespaceIntegrationTest.java | 12 + python/Cargo.lock | 6 +- python/pyproject.toml | 2 +- python/python/lance/dataset.py | 40 +- python/python/lance/namespace.py | 10 + .../tests/test_namespace_integration.py | 18 +- python/src/error.rs | 52 +- python/src/namespace.rs | 18 + rust/lance-core/src/error.rs | 2 +- .../src/object_store/storage_options.rs | 3 +- rust/lance-namespace-impls/Cargo.toml | 10 +- rust/lance-namespace-impls/src/credentials.rs | 92 +- .../src/credentials/aws.rs | 385 +++++++-- .../src/credentials/azure.rs | 702 ++++++++++++++- .../src/credentials/cache.rs | 438 ++++++++++ .../src/credentials/gcp.rs | 
400 ++++++++- rust/lance-namespace-impls/src/dir.rs | 811 ++++++++++++++++-- .../lance-namespace-impls/src/dir/manifest.rs | 190 +++- rust/lance-namespace-impls/src/rest.rs | 35 +- .../lance-namespace-impls/src/rest_adapter.rs | 215 ++++- rust/lance-namespace/src/error.rs | 404 +++++++++ rust/lance-namespace/src/lib.rs | 13 + rust/lance-namespace/src/namespace.rs | 64 +- rust/lance/src/dataset.rs | 49 +- rust/lance/src/dataset/builder.rs | 3 +- 34 files changed, 3907 insertions(+), 297 deletions(-) create mode 100644 rust/lance-namespace-impls/src/credentials/cache.rs create mode 100644 rust/lance-namespace/src/error.rs diff --git a/Cargo.lock b/Cargo.lock index 88e4bfefa8c..f7263fb0748 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5215,6 +5215,7 @@ dependencies = [ "azure_identity", "azure_storage", "azure_storage_blobs", + "base64 0.22.1", "bytes", "chrono", "futures", @@ -5231,6 +5232,7 @@ dependencies = [ "rstest", "serde", "serde_json", + "sha2", "snafu", "tempfile", "time", @@ -5243,9 +5245,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.3.2" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00a21b43fe2a373896727b97927adedd2683d2907683f294f62cf8815fbf6a01" +checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" dependencies = [ "reqwest", "serde", diff --git a/Cargo.toml b/Cargo.toml index 6ad07641cd5..4cf3883ed23 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -63,7 +63,7 @@ lance-io = { version = "=1.0.3-beta.0", path = "./rust/lance-io", default-featur lance-linalg = { version = "=1.0.3-beta.0", path = "./rust/lance-linalg" } lance-namespace = { version = "=1.0.3-beta.0", path = "./rust/lance-namespace" } lance-namespace-impls = { version = "=1.0.3-beta.0", path = "./rust/lance-namespace-impls" } -lance-namespace-reqwest-client = "0.3.1" +lance-namespace-reqwest-client = { version = "=0.4.5" } lance-table = { version = "=1.0.3-beta.0", path = 
"./rust/lance-table" } lance-test-macros = { version = "=1.0.3-beta.0", path = "./rust/lance-test-macros" } lance-testing = { version = "=1.0.3-beta.0", path = "./rust/lance-testing" } @@ -87,7 +87,6 @@ aws-config = "1.2.0" aws-credential-types = "1.2.0" aws-sdk-dynamodb = "1.38.0" aws-sdk-s3 = "1.38.0" -aws-sdk-sts = "1.38.0" half = { "version" = "2.1", default-features = false, features = [ "num-traits", "std", diff --git a/java/lance-jni/Cargo.lock b/java/lance-jni/Cargo.lock index 15bf118d4f5..8a85397603e 100644 --- a/java/lance-jni/Cargo.lock +++ b/java/lance-jni/Cargo.lock @@ -4279,6 +4279,7 @@ dependencies = [ "azure_identity", "azure_storage", "azure_storage_blobs", + "base64 0.22.1", "bytes", "chrono", "futures", @@ -4294,6 +4295,7 @@ dependencies = [ "reqwest", "serde", "serde_json", + "sha2", "snafu", "time", "tokio", @@ -4304,9 +4306,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.3.2" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00a21b43fe2a373896727b97927adedd2683d2907683f294f62cf8815fbf6a01" +checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" dependencies = [ "reqwest", "serde", diff --git a/java/lance-jni/src/error.rs b/java/lance-jni/src/error.rs index 4e8f988120d..ef05b8cdb5c 100644 --- a/java/lance-jni/src/error.rs +++ b/java/lance-jni/src/error.rs @@ -6,6 +6,7 @@ use std::str::Utf8Error; use arrow_schema::ArrowError; use jni::{errors::Error as JniError, JNIEnv}; use lance::Error as LanceError; +use lance_namespace::error::NamespaceError; use serde_json::Error as JsonError; #[derive(Debug, PartialEq, Eq)] @@ -15,6 +16,7 @@ pub enum JavaExceptionClass { RuntimeException, UnsupportedOperationException, AlreadyInException, + LanceNamespaceException, } impl JavaExceptionClass { @@ -26,6 +28,7 @@ impl JavaExceptionClass { Self::UnsupportedOperationException => "java/lang/UnsupportedOperationException", // Included for display 
purposes. This is not a real exception. Self::AlreadyInException => "AlreadyInException", + Self::LanceNamespaceException => "org/lance/namespace/errors/LanceNamespaceException", } } } @@ -34,6 +37,7 @@ impl JavaExceptionClass { pub struct Error { message: String, java_class: JavaExceptionClass, + namespace_error_code: Option, } impl Error { @@ -41,6 +45,7 @@ impl Error { Self { message, java_class, + namespace_error_code: None, } } @@ -48,6 +53,7 @@ impl Error { Self { message, java_class: JavaExceptionClass::RuntimeException, + namespace_error_code: None, } } @@ -63,10 +69,19 @@ impl Error { Self::new(message, JavaExceptionClass::UnsupportedOperationException) } + pub fn namespace_error(code: u32, message: String) -> Self { + Self { + message, + java_class: JavaExceptionClass::LanceNamespaceException, + namespace_error_code: Some(code), + } + } + pub fn in_exception() -> Self { Self { message: String::default(), java_class: JavaExceptionClass::AlreadyInException, + namespace_error_code: None, } } @@ -75,11 +90,105 @@ impl Error { // An exception is already in progress, so we don't need to throw another one. return; } + + // For namespace errors, throw the specific LanceNamespaceException + if self.java_class == JavaExceptionClass::LanceNamespaceException { + if let Some(code) = self.namespace_error_code { + // Call LanceNamespaceException.fromCode static method + if self.throw_namespace_exception(env, code).is_err() { + // lance-namespace is bundled as a dependency, so the exception classes + // should always be available. Panic if they're not. + panic!( + "Failed to throw LanceNamespaceException (code={}). 
\ + org.lance.namespace.errors.LanceNamespaceException and ErrorCode classes \ + must be available in the classpath.", + code + ); + } + return; + } + } + if let Err(e) = env.throw_new(self.java_class.as_str(), &self.message) { eprintln!("Error when throwing Java exception: {:?}", e.to_string()); panic!("Error when throwing Java exception: {:?}", e); } } + + fn throw_namespace_exception( + &self, + env: &mut JNIEnv, + code: u32, + ) -> std::result::Result<(), ()> { + // Try to find and call the LanceNamespaceException constructor + // that takes ErrorCode and message + let class_name = "org/lance/namespace/errors/LanceNamespaceException"; + let error_code_class = "org/lance/namespace/errors/ErrorCode"; + + // Find the ErrorCode.fromCode method + let error_code_cls = env.find_class(error_code_class).map_err(|_| ())?; + let from_code_method = env + .get_static_method_id( + &error_code_cls, + "fromCode", + "(I)Lorg/lance/namespace/errors/ErrorCode;", + ) + .map_err(|_| ())?; + let error_code_obj = unsafe { + env.call_static_method_unchecked( + &error_code_cls, + from_code_method, + jni::signature::ReturnType::Object, + &[jni::sys::jvalue { + i: code as jni::sys::jint, + }], + ) + } + .map_err(|_| ())?; + + let error_code = match error_code_obj { + jni::objects::JValueGen::Object(obj) => obj, + _ => return Err(()), + }; + + // Find the LanceNamespaceException class + let exception_cls = env.find_class(class_name).map_err(|_| ())?; + + // Create message JString + let message_str = env.new_string(&self.message).map_err(|_| ())?; + + // Find constructor (ErrorCode, String) + let constructor = env + .get_method_id( + &exception_cls, + "", + "(Lorg/lance/namespace/errors/ErrorCode;Ljava/lang/String;)V", + ) + .map_err(|_| ())?; + + // Create the exception object + let exception_obj = unsafe { + env.new_object_unchecked( + &exception_cls, + constructor, + &[ + jni::sys::jvalue { + l: error_code.as_raw(), + }, + jni::sys::jvalue { + l: message_str.as_raw(), + }, + ], + ) + } 
+ .map_err(|_| ())?; + + // Throw the exception + env.throw(jni::objects::JThrowable::from(exception_obj)) + .map_err(|_| ())?; + + Ok(()) + } } pub type Result = std::result::Result; @@ -92,7 +201,7 @@ impl std::fmt::Display for Error { impl From for Error { fn from(err: LanceError) -> Self { - match err { + match &err { LanceError::DatasetNotFound { .. } | LanceError::DatasetAlreadyExists { .. } | LanceError::CommitConflict { .. } @@ -100,6 +209,19 @@ impl From for Error { LanceError::IO { .. } => Self::io_error(err.to_string()), LanceError::NotSupported { .. } => Self::unsupported_error(err.to_string()), LanceError::NotFound { .. } => Self::io_error(err.to_string()), + LanceError::Namespace { source, .. } => { + // Try to downcast to NamespaceError and get the error code + if let Some(ns_err) = source.downcast_ref::() { + Self::namespace_error(ns_err.code().as_u32(), ns_err.to_string()) + } else { + log::warn!( + "Failed to downcast NamespaceError source, falling back to runtime error. \ + This may indicate a version mismatch. 
Source type: {:?}", + source + ); + Self::runtime_error(err.to_string()) + } + } _ => Self::runtime_error(err.to_string()), } } diff --git a/java/lance-jni/src/namespace.rs b/java/lance-jni/src/namespace.rs index d197c2b594b..4b1d5a82d21 100644 --- a/java/lance-jni/src/namespace.rs +++ b/java/lance-jni/src/namespace.rs @@ -313,6 +313,7 @@ pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_createTableNa } #[no_mangle] +#[allow(deprecated)] pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_createEmptyTableNative( mut env: JNIEnv, _obj: JObject, @@ -329,6 +330,23 @@ pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_createEmptyTa .into_raw() } +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_declareTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.declare_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + #[no_mangle] pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_insertIntoTableNative( mut env: JNIEnv, @@ -790,6 +808,7 @@ pub extern "system" fn Java_org_lance_namespace_RestNamespace_createTableNative( } #[no_mangle] +#[allow(deprecated)] pub extern "system" fn Java_org_lance_namespace_RestNamespace_createEmptyTableNative( mut env: JNIEnv, _obj: JObject, @@ -806,6 +825,23 @@ pub extern "system" fn Java_org_lance_namespace_RestNamespace_createEmptyTableNa .into_raw() } +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_RestNamespace_declareTableNative( + mut env: JNIEnv, + _obj: JObject, + handle: jlong, + request_json: JString, +) -> jstring { + ok_or_throw_with_return!( + env, + call_rest_namespace_method(&mut env, handle, request_json, |ns, req| { + RT.block_on(ns.inner.declare_table(req)) + }), + std::ptr::null_mut() + ) + .into_raw() +} + #[no_mangle] pub 
extern "system" fn Java_org_lance_namespace_RestNamespace_insertIntoTableNative( mut env: JNIEnv, diff --git a/java/pom.xml b/java/pom.xml index 4a24d461f29..20fa8a767b1 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -108,12 +108,12 @@ org.lance lance-namespace-core - 0.3.1 + 0.4.5 org.lance lance-namespace-apache-client - 0.3.1 + 0.4.5 com.fasterxml.jackson.core diff --git a/java/src/main/java/org/lance/WriteDatasetBuilder.java b/java/src/main/java/org/lance/WriteDatasetBuilder.java index 74f8c298fe8..dc90b425291 100644 --- a/java/src/main/java/org/lance/WriteDatasetBuilder.java +++ b/java/src/main/java/org/lance/WriteDatasetBuilder.java @@ -18,6 +18,8 @@ import org.lance.namespace.LanceNamespaceStorageOptionsProvider; import org.lance.namespace.model.CreateEmptyTableRequest; import org.lance.namespace.model.CreateEmptyTableResponse; +import org.lance.namespace.model.DeclareTableRequest; +import org.lance.namespace.model.DeclareTableResponse; import org.lance.namespace.model.DescribeTableRequest; import org.lance.namespace.model.DescribeTableResponse; @@ -353,18 +355,33 @@ private Dataset executeWithNamespace() { // Mode-specific namespace operations if (mode == WriteParams.WriteMode.CREATE) { - // Call namespace.createEmptyTable() to create new table - CreateEmptyTableRequest request = new CreateEmptyTableRequest(); - request.setId(tableId); - - CreateEmptyTableResponse response = namespace.createEmptyTable(request); + // Try declareTable first, fall back to deprecated createEmptyTable + // for backward compatibility with older namespace implementations. + // createEmptyTable support will be removed in 3.0.0. 
+ String location; + Map responseStorageOptions; + + try { + DeclareTableRequest declareRequest = new DeclareTableRequest(); + declareRequest.setId(tableId); + DeclareTableResponse declareResponse = namespace.declareTable(declareRequest); + location = declareResponse.getLocation(); + responseStorageOptions = declareResponse.getStorageOptions(); + } catch (UnsupportedOperationException e) { + // Fall back to deprecated createEmptyTable + CreateEmptyTableRequest fallbackRequest = new CreateEmptyTableRequest(); + fallbackRequest.setId(tableId); + CreateEmptyTableResponse fallbackResponse = namespace.createEmptyTable(fallbackRequest); + location = fallbackResponse.getLocation(); + responseStorageOptions = fallbackResponse.getStorageOptions(); + } - tableUri = response.getLocation(); + tableUri = location; if (tableUri == null || tableUri.isEmpty()) { throw new IllegalArgumentException("Namespace did not return a table location"); } - namespaceStorageOptions = ignoreNamespaceStorageOptions ? null : response.getStorageOptions(); + namespaceStorageOptions = ignoreNamespaceStorageOptions ? 
null : responseStorageOptions; } else { // For APPEND/OVERWRITE modes, call namespace.describeTable() DescribeTableRequest request = new DescribeTableRequest(); diff --git a/java/src/main/java/org/lance/namespace/DirectoryNamespace.java b/java/src/main/java/org/lance/namespace/DirectoryNamespace.java index 2d13db69694..a0796739a3c 100644 --- a/java/src/main/java/org/lance/namespace/DirectoryNamespace.java +++ b/java/src/main/java/org/lance/namespace/DirectoryNamespace.java @@ -272,6 +272,14 @@ public CreateEmptyTableResponse createEmptyTable(CreateEmptyTableRequest request return fromJson(responseJson, CreateEmptyTableResponse.class); } + @Override + public DeclareTableResponse declareTable(DeclareTableRequest request) { + ensureInitialized(); + String requestJson = toJson(request); + String responseJson = declareTableNative(nativeDirectoryNamespaceHandle, requestJson); + return fromJson(responseJson, DeclareTableResponse.class); + } + @Override public InsertIntoTableResponse insertIntoTable( InsertIntoTableRequest request, byte[] requestData) { @@ -423,6 +431,8 @@ private static T fromJson(String json, Class clazz) { private native String createEmptyTableNative(long handle, String requestJson); + private native String declareTableNative(long handle, String requestJson); + private native String insertIntoTableNative(long handle, String requestJson, byte[] requestData); private native String mergeInsertIntoTableNative( diff --git a/java/src/main/java/org/lance/namespace/RestNamespace.java b/java/src/main/java/org/lance/namespace/RestNamespace.java index 995c53c4b92..b55eeb2f200 100644 --- a/java/src/main/java/org/lance/namespace/RestNamespace.java +++ b/java/src/main/java/org/lance/namespace/RestNamespace.java @@ -196,6 +196,14 @@ public CreateEmptyTableResponse createEmptyTable(CreateEmptyTableRequest request return fromJson(responseJson, CreateEmptyTableResponse.class); } + @Override + public DeclareTableResponse declareTable(DeclareTableRequest request) { + 
ensureInitialized(); + String requestJson = toJson(request); + String responseJson = declareTableNative(nativeRestNamespaceHandle, requestJson); + return fromJson(responseJson, DeclareTableResponse.class); + } + @Override public InsertIntoTableResponse insertIntoTable( InsertIntoTableRequest request, byte[] requestData) { @@ -345,6 +353,8 @@ private static T fromJson(String json, Class clazz) { private native String createEmptyTableNative(long handle, String requestJson); + private native String declareTableNative(long handle, String requestJson); + private native String insertIntoTableNative(long handle, String requestJson, byte[] requestData); private native String mergeInsertIntoTableNative( diff --git a/java/src/test/java/org/lance/NamespaceIntegrationTest.java b/java/src/test/java/org/lance/NamespaceIntegrationTest.java index d2ea43f5e53..ad0b55dccdc 100644 --- a/java/src/test/java/org/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/org/lance/NamespaceIntegrationTest.java @@ -18,6 +18,8 @@ import org.lance.namespace.LanceNamespaceStorageOptionsProvider; import org.lance.namespace.model.CreateEmptyTableRequest; import org.lance.namespace.model.CreateEmptyTableResponse; +import org.lance.namespace.model.DeclareTableRequest; +import org.lance.namespace.model.DeclareTableResponse; import org.lance.namespace.model.DescribeTableRequest; import org.lance.namespace.model.DescribeTableResponse; import org.lance.operation.Append; @@ -215,6 +217,16 @@ public CreateEmptyTableResponse createEmptyTable(CreateEmptyTableRequest request return response; } + @Override + public DeclareTableResponse declareTable(DeclareTableRequest request) { + int count = createCallCount.incrementAndGet(); + + DeclareTableResponse response = inner.declareTable(request); + response.setStorageOptions(modifyStorageOptions(response.getStorageOptions(), count)); + + return response; + } + @Override public DescribeTableResponse describeTable(DescribeTableRequest request) { int count = 
describeCallCount.incrementAndGet(); diff --git a/python/Cargo.lock b/python/Cargo.lock index 4e8ef92fde8..10753904f26 100644 --- a/python/Cargo.lock +++ b/python/Cargo.lock @@ -4617,6 +4617,7 @@ dependencies = [ "azure_identity", "azure_storage", "azure_storage_blobs", + "base64 0.22.1", "bytes", "chrono", "futures", @@ -4632,6 +4633,7 @@ dependencies = [ "reqwest", "serde", "serde_json", + "sha2", "snafu", "time", "tokio", @@ -4642,9 +4644,9 @@ dependencies = [ [[package]] name = "lance-namespace-reqwest-client" -version = "0.3.2" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "00a21b43fe2a373896727b97927adedd2683d2907683f294f62cf8815fbf6a01" +checksum = "a2acdba67f84190067532fce07b51a435dd390d7cdc1129a05003e5cb3274cf0" dependencies = [ "reqwest", "serde", diff --git a/python/pyproject.toml b/python/pyproject.toml index 3ad7ccef8f9..5cf1205c586 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "pylance" dynamic = ["version"] -dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.3.1"] +dependencies = ["pyarrow>=14", "numpy>=1.22", "lance-namespace>=0.4.5"] description = "python wrapper for Lance columnar format" authors = [{ name = "Lance Devs", email = "dev@lance.org" }] license = { file = "LICENSE" } diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index ab40b265a04..0707c314574 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -5508,16 +5508,48 @@ def write_dataset( from .namespace import ( CreateEmptyTableRequest, + DeclareTableRequest, DescribeTableRequest, LanceNamespaceStorageOptionsProvider, ) # Determine which namespace method to call based on mode if mode == "create": - request = CreateEmptyTableRequest( - id=table_id, location=None, properties=None - ) - response = namespace.create_empty_table(request) + # Try declare_table first, fall back to deprecated create_empty_table + # for 
backward compatibility with older namespace implementations. + # create_empty_table support will be removed in 3.0.0. + if hasattr(namespace, "declare_table"): + try: + from lance_namespace.errors import UnsupportedOperationError + + declare_request = DeclareTableRequest(id=table_id, location=None) + response = namespace.declare_table(declare_request) + except (UnsupportedOperationError, NotImplementedError): + # Fall back to deprecated create_empty_table + import warnings + + warnings.warn( + "create_empty_table is deprecated, use declare_table instead. " + "Support will be removed in 3.0.0.", + DeprecationWarning, + stacklevel=2, + ) + fallback_request = CreateEmptyTableRequest( + id=table_id, location=None + ) + response = namespace.create_empty_table(fallback_request) + else: + # Namespace doesn't have declare_table, fall back to create_empty_table + import warnings + + warnings.warn( + "create_empty_table is deprecated, use declare_table instead. " + "Support will be removed in 3.0.0.", + DeprecationWarning, + stacklevel=2, + ) + fallback_request = CreateEmptyTableRequest(id=table_id, location=None) + response = namespace.create_empty_table(fallback_request) elif mode in ("append", "overwrite"): request = DescribeTableRequest(id=table_id, version=None) response = namespace.describe_table(request) diff --git a/python/python/lance/namespace.py b/python/python/lance/namespace.py index 3d22cafefc9..d879ddcb99f 100644 --- a/python/python/lance/namespace.py +++ b/python/python/lance/namespace.py @@ -20,6 +20,8 @@ CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, + DeclareTableRequest, + DeclareTableResponse, DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, @@ -218,6 +220,10 @@ def create_empty_table( response_dict = self._inner.create_empty_table(request.model_dump()) return CreateEmptyTableResponse.from_dict(response_dict) + def declare_table(self, request: DeclareTableRequest) -> DeclareTableResponse: + response_dict = 
self._inner.declare_table(request.model_dump()) + return DeclareTableResponse.from_dict(response_dict) + class RestNamespace(LanceNamespace): """REST-based Lance Namespace implementation backed by Rust. @@ -334,6 +340,10 @@ def create_empty_table( response_dict = self._inner.create_empty_table(request.model_dump()) return CreateEmptyTableResponse.from_dict(response_dict) + def declare_table(self, request: DeclareTableRequest) -> DeclareTableResponse: + response_dict = self._inner.declare_table(request.model_dump()) + return DeclareTableResponse.from_dict(response_dict) + class RestAdapter: """REST adapter server that creates a namespace backend and exposes it via REST. diff --git a/python/python/tests/test_namespace_integration.py b/python/python/tests/test_namespace_integration.py index 592bbd2c3ef..3c93dbcb504 100644 --- a/python/python/tests/test_namespace_integration.py +++ b/python/python/tests/test_namespace_integration.py @@ -22,6 +22,8 @@ from lance.namespace import ( CreateEmptyTableRequest, CreateEmptyTableResponse, + DeclareTableRequest, + DeclareTableResponse, DescribeTableRequest, DescribeTableResponse, LanceNamespace, @@ -143,6 +145,18 @@ def create_empty_table( return response + def declare_table(self, request: DeclareTableRequest) -> DeclareTableResponse: + with self.lock: + self.create_call_count += 1 + count = self.create_call_count + + response = self.inner.declare_table(request) + response.storage_options = self._modify_storage_options( + response.storage_options, count + ) + + return response + def describe_table(self, request: DescribeTableRequest) -> DescribeTableResponse: with self.lock: self.describe_call_count += 1 @@ -434,8 +448,8 @@ def test_namespace_distributed_write(s3_bucket: str): table_name = uuid.uuid4().hex table_id = ["test_ns", table_name] - request = CreateEmptyTableRequest(id=table_id, location=None, properties=None) - response = namespace.create_empty_table(request) + request = DeclareTableRequest(id=table_id, location=None) 
+ response = namespace.declare_table(request) assert namespace.get_create_call_count() == 1 assert namespace.get_describe_call_count() == 0 diff --git a/python/src/error.rs b/python/src/error.rs index ab12bead1e2..45569331289 100644 --- a/python/src/error.rs +++ b/python/src/error.rs @@ -12,13 +12,49 @@ // See the License for the specific language governing permissions and // limitations under the License. +use lance_namespace::error::NamespaceError; use pyo3::{ exceptions::{PyIOError, PyNotImplementedError, PyRuntimeError, PyValueError}, - PyResult, + types::{PyAnyMethods, PyModule}, + BoundObject, PyErr, PyResult, Python, }; use lance::Error as LanceError; +/// Try to convert a NamespaceError to the corresponding Python exception. +/// Returns the appropriate Python exception from lance_namespace.errors module. +fn namespace_error_to_pyerr(py: Python<'_>, ns_err: &NamespaceError) -> PyErr { + let code = ns_err.code().as_u32(); + let message = ns_err.to_string(); + + // Try to import the lance_namespace.errors module and use from_error_code + match PyModule::import(py, "lance_namespace.errors") { + Ok(module) => { + match module.getattr("from_error_code") { + Ok(from_error_code) => { + match from_error_code.call1((code, message.clone())) { + Ok(exc) => { + // Create a PyErr from the exception object + PyErr::from_value(exc.into_bound()) + } + Err(_) => PyRuntimeError::new_err(format!( + "[NamespaceError code={}] {}", + code, message + )), + } + } + Err(_) => { + PyRuntimeError::new_err(format!("[NamespaceError code={}] {}", code, message)) + } + } + } + Err(_) => { + // lance_namespace module not available, use RuntimeError with code prefix + PyRuntimeError::new_err(format!("[NamespaceError code={}] {}", code, message)) + } + } +} + pub trait PythonErrorExt { /// Convert to a python error based on the Lance error type fn infer_error(self) -> PyResult; @@ -43,7 +79,19 @@ impl PythonErrorExt for std::result::Result { LanceError::NotFound { .. 
} => self.value_error(), LanceError::RefNotFound { .. } => self.value_error(), LanceError::VersionNotFound { .. } => self.value_error(), - + LanceError::Namespace { source, .. } => { + // Try to downcast to NamespaceError and convert to proper Python exception + if let Some(ns_err) = source.downcast_ref::() { + Python::with_gil(|py| Err(namespace_error_to_pyerr(py, ns_err))) + } else { + log::warn!( + "Failed to downcast NamespaceError source, falling back to runtime error. \ + This may indicate a version mismatch. Source type: {:?}", + source + ); + self.runtime_error() + } + } _ => self.runtime_error(), }, } diff --git a/python/src/namespace.rs b/python/src/namespace.rs index 4ddf0fc76a4..cc579248943 100644 --- a/python/src/namespace.rs +++ b/python/src/namespace.rs @@ -183,6 +183,7 @@ impl PyDirectoryNamespace { Ok(pythonize(py, &response)?.into()) } + #[allow(deprecated)] fn create_empty_table(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult { let request = depythonize(request)?; let response = crate::rt() @@ -190,6 +191,14 @@ impl PyDirectoryNamespace { .infer_error()?; Ok(pythonize(py, &response)?.into()) } + + fn declare_table(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.declare_table(request))? + .infer_error()?; + Ok(pythonize(py, &response)?.into()) + } } #[cfg(feature = "rest")] @@ -341,6 +350,7 @@ impl PyRestNamespace { Ok(pythonize(py, &response)?.into()) } + #[allow(deprecated)] fn create_empty_table(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult { let request = depythonize(request)?; let response = crate::rt() @@ -348,6 +358,14 @@ impl PyRestNamespace { .infer_error()?; Ok(pythonize(py, &response)?.into()) } + + fn declare_table(&self, py: Python, request: &Bound<'_, PyAny>) -> PyResult { + let request = depythonize(request)?; + let response = crate::rt() + .block_on(Some(py), self.inner.declare_table(request))? 
+ .infer_error()?; + Ok(pythonize(py, &response)?.into()) + } } #[cfg(feature = "rest-adapter")] diff --git a/rust/lance-core/src/error.rs b/rust/lance-core/src/error.rs index 48150db4354..f80dbca4a7b 100644 --- a/rust/lance-core/src/error.rs +++ b/rust/lance-core/src/error.rs @@ -184,7 +184,7 @@ impl LanceOptionExt for Option { } } -trait ToSnafuLocation { +pub trait ToSnafuLocation { fn to_snafu_location(&'static self) -> snafu::Location; } diff --git a/rust/lance-io/src/object_store/storage_options.rs b/rust/lance-io/src/object_store/storage_options.rs index f809df8d1d3..22854e8fd53 100644 --- a/rust/lance-io/src/object_store/storage_options.rs +++ b/rust/lance-io/src/object_store/storage_options.rs @@ -113,8 +113,7 @@ impl StorageOptionsProvider for LanceNamespaceStorageOptionsProvider { async fn fetch_storage_options(&self) -> Result>> { let request = DescribeTableRequest { id: Some(self.table_id.clone()), - version: None, - with_table_uri: None, + ..Default::default() }; let response = self diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index cb0ff52d1e0..85ee4a6989f 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -22,9 +22,9 @@ dir-azure = ["lance-io/azure", "lance/azure"] dir-oss = ["lance-io/oss", "lance/oss"] dir-huggingface = ["lance-io/huggingface", "lance/huggingface"] # Credential vending features -credential-vendor-aws = ["dep:aws-sdk-sts", "dep:aws-config", "dep:aws-credential-types"] -credential-vendor-gcp = ["dep:google-cloud-auth", "dep:reqwest", "dep:serde"] -credential-vendor-azure = ["dep:azure_core", "dep:azure_identity", "dep:azure_storage", "dep:azure_storage_blobs", "dep:time"] +credential-vendor-aws = ["dep:aws-sdk-sts", "dep:aws-config", "dep:aws-credential-types", "dep:sha2", "dep:base64"] +credential-vendor-gcp = ["dep:google-cloud-auth", "dep:reqwest", "dep:serde", "dep:sha2", "dep:base64"] +credential-vendor-azure = ["dep:azure_core", 
"dep:azure_identity", "dep:azure_storage", "dep:azure_storage_blobs", "dep:time", "dep:sha2", "dep:base64", "dep:reqwest"] [dependencies] lance-namespace.workspace = true @@ -66,10 +66,12 @@ log.workspace = true rand.workspace = true chrono.workspace = true -# AWS credential vending dependencies (optional, enabled by "dir-aws" feature) +# AWS credential vending dependencies (optional, enabled by "credential-vendor-aws" feature) aws-sdk-sts = { version = "1.38.0", optional = true } aws-config = { workspace = true, optional = true } aws-credential-types = { workspace = true, optional = true } +sha2 = { version = "0.10", optional = true } +base64 = { version = "0.22", optional = true } # GCP credential vending dependencies (optional, enabled by "dir-gcp" feature) google-cloud-auth = { version = "0.18", optional = true } diff --git a/rust/lance-namespace-impls/src/credentials.rs b/rust/lance-namespace-impls/src/credentials.rs index 6be4f1e38a4..f9f7ecc7950 100644 --- a/rust/lance-namespace-impls/src/credentials.rs +++ b/rust/lance-namespace-impls/src/credentials.rs @@ -68,12 +68,22 @@ pub mod azure; #[cfg(feature = "credential-vendor-gcp")] pub mod gcp; +/// Credential caching module. +/// Available when any credential vendor feature is enabled. +#[cfg(any( + feature = "credential-vendor-aws", + feature = "credential-vendor-azure", + feature = "credential-vendor-gcp" +))] +pub mod cache; + use std::collections::HashMap; use std::str::FromStr; use async_trait::async_trait; use lance_core::Result; use lance_io::object_store::uri_to_url; +use lance_namespace::models::Identity; /// Default credential duration: 1 hour (3600000 milliseconds) pub const DEFAULT_CREDENTIAL_DURATION_MILLIS: u64 = 3600 * 1000; @@ -188,6 +198,18 @@ pub const ENABLED: &str = "enabled"; /// Common property key for permission level (short form). pub const PERMISSION: &str = "permission"; +/// Common property key to enable credential caching (short form). +/// Default: true. 
Set to "false" to disable caching. +pub const CACHE_ENABLED: &str = "cache_enabled"; + +/// Common property key for API key salt (short form). +/// Used to hash API keys before comparison: SHA256(api_key + ":" + salt) +pub const API_KEY_SALT: &str = "api_key_salt"; + +/// Property key prefix for API key hash to permission mappings (short form). +/// Format: `api_key_hash. = ""` +pub const API_KEY_HASH_PREFIX: &str = "api_key_hash."; + /// AWS-specific property keys (short form, without prefix) #[cfg(feature = "credential-vendor-aws")] pub mod aws_props { @@ -204,6 +226,14 @@ pub mod aws_props { #[cfg(feature = "credential-vendor-gcp")] pub mod gcp_props { pub const SERVICE_ACCOUNT: &str = "gcp_service_account"; + + /// Workload Identity Provider resource name for OIDC token exchange. + /// Format: //iam.googleapis.com/projects/{project}/locations/global/workloadIdentityPools/{pool}/providers/{provider} + pub const WORKLOAD_IDENTITY_PROVIDER: &str = "gcp_workload_identity_provider"; + + /// Service account to impersonate after Workload Identity Federation (optional). + /// If not set, uses the federated identity directly. + pub const IMPERSONATION_SERVICE_ACCOUNT: &str = "gcp_impersonation_service_account"; } /// Azure-specific property keys (short form, without prefix) @@ -215,6 +245,10 @@ pub mod azure_props { /// Azure credential duration in milliseconds. /// Default: 3600000 (1 hour). Azure SAS tokens can be valid up to 7 days. pub const DURATION_MILLIS: &str = "azure_duration_millis"; + + /// Client ID of the Azure AD App Registration for Workload Identity Federation. + /// Required when using auth_token identity for OIDC token exchange. + pub const FEDERATED_CLIENT_ID: &str = "azure_federated_client_id"; } /// Vended credentials with expiration information. @@ -271,16 +305,30 @@ pub trait CredentialVendor: Send + Sync + std::fmt::Debug { /// Vend credentials for accessing the specified table location. 
/// /// The permission level (read/write/admin) is determined by the vendor's - /// configuration, not per-request. + /// configuration, not per-request. When identity is provided, the vendor + /// may use different authentication flows: + /// + /// - `auth_token`: Use AssumeRoleWithWebIdentity (AWS validates the token) + /// - `api_key`: Validate against configured API key hashes and use AssumeRole + /// - `None`: Use static configuration with AssumeRole /// /// # Arguments /// /// * `table_location` - The table URI to vend credentials for + /// * `identity` - Optional identity from the request (api_key OR auth_token, mutually exclusive) /// /// # Returns /// /// Returns vended credentials with expiration information. - async fn vend_credentials(&self, table_location: &str) -> Result<VendedCredentials>; + /// + /// # Errors + /// + /// Returns error if identity validation fails (no fallback to static config). + async fn vend_credentials( + &self, + table_location: &str, + identity: Option<&Identity>, + ) -> Result<VendedCredentials>; /// Returns the cloud provider name (e.g., "aws", "gcp", "azure").
fn provider_name(&self) -> &'static str; @@ -349,21 +397,50 @@ pub async fn create_credential_vendor_for_location( ) -> Result<Option<Box<dyn CredentialVendor>>> { let provider = detect_provider_from_uri(table_location); - match provider { + let vendor: Option<Box<dyn CredentialVendor>> = match provider { #[cfg(feature = "credential-vendor-aws")] - "aws" => create_aws_vendor(properties).await, + "aws" => create_aws_vendor(properties).await?, #[cfg(feature = "credential-vendor-gcp")] - "gcp" => create_gcp_vendor(properties).await, + "gcp" => create_gcp_vendor(properties).await?, #[cfg(feature = "credential-vendor-azure")] - "azure" => create_azure_vendor(properties), + "azure" => create_azure_vendor(properties)?, + + _ => None, + }; - _ => Ok(None), + // Wrap with caching if enabled (default: true) + #[cfg(any( + feature = "credential-vendor-aws", + feature = "credential-vendor-azure", + feature = "credential-vendor-gcp" + ))] + if let Some(v) = vendor { + let cache_enabled = properties + .get(CACHE_ENABLED) + .map(|s| !s.eq_ignore_ascii_case("false")) + .unwrap_or(true); + + if cache_enabled { + return Ok(Some(Box::new(cache::CachingCredentialVendor::new(v)))); + } else { + return Ok(Some(v)); + } } + + #[cfg(not(any( + feature = "credential-vendor-aws", + feature = "credential-vendor-azure", + feature = "credential-vendor-gcp" + )))] + let _ = vendor; + + Ok(None) } /// Parse permission from properties, defaulting to Read +#[allow(dead_code)] fn parse_permission(properties: &HashMap<String, String>) -> VendedPermission { properties .get(PERMISSION) @@ -372,6 +449,7 @@ fn parse_permission(properties: &HashMap<String, String>) -> VendedPermission { } /// Parse duration from properties using a vendor-specific key, defaulting to DEFAULT_CREDENTIAL_DURATION_MILLIS +#[allow(dead_code)] fn parse_duration_millis(properties: &HashMap<String, String>, key: &str) -> u64 { properties .get(key) diff --git a/rust/lance-namespace-impls/src/credentials/aws.rs b/rust/lance-namespace-impls/src/credentials/aws.rs index 96e0e8a2a80..d9b363e37e0 100644 ---
a/rust/lance-namespace-impls/src/credentials/aws.rs +++ b/rust/lance-namespace-impls/src/credentials/aws.rs @@ -11,9 +11,12 @@ use std::collections::HashMap; use async_trait::async_trait; use aws_config::BehaviorVersion; use aws_sdk_sts::Client as StsClient; +use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine}; use lance_core::{Error, Result}; use lance_io::object_store::uri_to_url; -use log::{debug, info}; +use lance_namespace::models::Identity; +use log::{debug, info, warn}; +use sha2::{Digest, Sha256}; use super::{ redact_credential, CredentialVendor, VendedCredentials, VendedPermission, @@ -24,6 +27,7 @@ use super::{ #[derive(Debug, Clone)] pub struct AwsCredentialVendorConfig { /// The IAM role ARN to assume. + /// Used for both AssumeRole (static/api_key) and AssumeRoleWithWebIdentity (auth_token). pub role_arn: String, /// Optional external ID for the assume role request. @@ -43,7 +47,18 @@ pub struct AwsCredentialVendorConfig { /// Permission level for vended credentials. /// Default: Read (full read access) + /// Used to generate scoped IAM policy for all credential flows. pub permission: VendedPermission, + + /// Salt for API key hashing. + /// Required when using API key authentication. + /// API keys are hashed as: SHA256(api_key + ":" + salt) + pub api_key_salt: Option<String>, + + /// Map of SHA256(api_key + ":" + salt) -> permission level. + /// When an API key is provided, its hash is looked up in this map. + /// If found, the mapped permission is used instead of the default permission. + pub api_key_hash_permissions: HashMap<String, VendedPermission>, } impl AwsCredentialVendorConfig { @@ -56,6 +71,8 @@ impl AwsCredentialVendorConfig { role_session_name: None, region: None, permission: VendedPermission::default(), + api_key_salt: None, + api_key_hash_permissions: HashMap::new(), } } @@ -88,6 +105,32 @@ impl AwsCredentialVendorConfig { self.permission = permission; self } + + /// Set the API key salt for hashing.
+ pub fn with_api_key_salt(mut self, salt: impl Into) -> Self { + self.api_key_salt = Some(salt.into()); + self + } + + /// Add an API key hash to permission mapping. + pub fn with_api_key_hash_permission( + mut self, + key_hash: impl Into, + permission: VendedPermission, + ) -> Self { + self.api_key_hash_permissions + .insert(key_hash.into(), permission); + self + } + + /// Set the entire API key hash permissions map. + pub fn with_api_key_hash_permissions( + mut self, + permissions: HashMap, + ) -> Self { + self.api_key_hash_permissions = permissions; + self + } } /// AWS credential vendor that uses STS AssumeRole. @@ -206,60 +249,84 @@ impl AwsCredentialVendor { policy.to_string() } -} -#[async_trait] -impl CredentialVendor for AwsCredentialVendor { - async fn vend_credentials(&self, table_location: &str) -> Result { - debug!( - "AWS credential vending: location={}, permission={}", - table_location, self.config.permission - ); + /// Hash an API key using SHA-256 with salt (Polaris pattern). + /// Format: SHA256(api_key + ":" + salt) as hex string. + pub fn hash_api_key(api_key: &str, salt: &str) -> String { + let mut hasher = Sha256::new(); + hasher.update(format!("{}:{}", api_key, salt)); + format!("{:x}", hasher.finalize()) + } - let (bucket, prefix) = Self::parse_s3_uri(table_location)?; - let policy = Self::build_policy(&bucket, &prefix, self.config.permission); + /// Extract a session name from a JWT token (best effort, no validation). + /// Decodes the payload and extracts 'sub' or 'email' claim. + /// Falls back to "lance-web-identity" if parsing fails. 
+ fn derive_session_name_from_token(token: &str) -> String { + // JWT format: header.payload.signature + let parts: Vec<&str> = token.split('.').collect(); + if parts.len() != 3 { + return "lance-web-identity".to_string(); + } - let role_session_name = self - .config - .role_session_name - .clone() - .unwrap_or_else(|| "lance-credential-vending".to_string()); + // Decode the payload (second part) + let payload = match URL_SAFE_NO_PAD.decode(parts[1]) { + Ok(bytes) => bytes, + Err(_) => { + // Try standard base64 as fallback + match base64::engine::general_purpose::STANDARD_NO_PAD.decode(parts[1]) { + Ok(bytes) => bytes, + Err(_) => return "lance-web-identity".to_string(), + } + } + }; - // Cap session name to 64 chars (AWS limit) - let role_session_name = if role_session_name.len() > 64 { - role_session_name[..64].to_string() - } else { - role_session_name + // Parse as JSON and extract 'sub' or 'email' + let json: serde_json::Value = match serde_json::from_slice(&payload) { + Ok(v) => v, + Err(_) => return "lance-web-identity".to_string(), }; - // Convert millis to seconds for AWS API (rounding up to ensure at least the requested duration) - // AWS STS allows 900-43200 seconds (15 min - 12 hours), clamp to valid range - let duration_secs = self.config.duration_millis.div_ceil(1000).clamp(900, 43200) as i32; + let subject = json + .get("sub") + .or_else(|| json.get("email")) + .and_then(|v| v.as_str()) + .unwrap_or("unknown"); - let mut request = self - .sts_client - .assume_role() - .role_arn(&self.config.role_arn) - .role_session_name(&role_session_name) - .policy(&policy) - .duration_seconds(duration_secs); + // Sanitize for role session name (alphanumeric, =, @, -, .) 
+ let sanitized: String = subject + .chars() + .filter(|c| c.is_alphanumeric() || *c == '=' || *c == '@' || *c == '-' || *c == '.') + .collect(); - if let Some(ref external_id) = self.config.external_id { - request = request.external_id(external_id); + let session_name = format!("lance-{}", sanitized); + + // Cap to 64 chars (AWS limit) + if session_name.len() > 64 { + session_name[..64].to_string() + } else { + session_name } + } - let response = request.send().await.map_err(|e| Error::IO { - source: Box::new(std::io::Error::other(format!( - "Failed to assume role '{}': {}", - self.config.role_arn, e - ))), - location: snafu::location!(), - })?; + /// Cap a session name to 64 characters (AWS limit). + fn cap_session_name(name: &str) -> String { + if name.len() > 64 { + name[..64].to_string() + } else { + name.to_string() + } + } - let credentials = response.credentials().ok_or_else(|| Error::IO { - source: Box::new(std::io::Error::other( - "AssumeRole response missing credentials", - )), + /// Extract credentials from an STS Credentials response. 
+ fn extract_credentials( + &self, + credentials: Option<&aws_sdk_sts::types::Credentials>, + bucket: &str, + prefix: &str, + permission: VendedPermission, + ) -> Result { + let credentials = credentials.ok_or_else(|| Error::IO { + source: Box::new(std::io::Error::other("STS response missing credentials")), location: snafu::location!(), })?; @@ -273,7 +340,7 @@ impl CredentialVendor for AwsCredentialVendor { info!( "AWS credentials vended: bucket={}, prefix={}, permission={}, expires_at={}, access_key_id={}", - bucket, prefix, self.config.permission, expires_at_millis, redact_credential(&access_key_id) + bucket, prefix, permission, expires_at_millis, redact_credential(&access_key_id) ); let mut storage_options = HashMap::new(); @@ -293,6 +360,211 @@ impl CredentialVendor for AwsCredentialVendor { Ok(VendedCredentials::new(storage_options, expires_at_millis)) } + /// Vend credentials using AssumeRoleWithWebIdentity (for auth_token). + async fn vend_with_web_identity( + &self, + bucket: &str, + prefix: &str, + auth_token: &str, + policy: &str, + ) -> Result { + let session_name = Self::derive_session_name_from_token(auth_token); + let duration_secs = self.config.duration_millis.div_ceil(1000).clamp(900, 43200) as i32; + + debug!( + "AWS AssumeRoleWithWebIdentity: role={}, session={}, permission={}", + self.config.role_arn, session_name, self.config.permission + ); + + let response = self + .sts_client + .assume_role_with_web_identity() + .role_arn(&self.config.role_arn) + .web_identity_token(auth_token) + .role_session_name(&session_name) + .policy(policy) + .duration_seconds(duration_secs) + .send() + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "AssumeRoleWithWebIdentity failed for role '{}': {}", + self.config.role_arn, e + ))), + location: snafu::location!(), + })?; + + self.extract_credentials( + response.credentials(), + bucket, + prefix, + self.config.permission, + ) + } + + /// Vend credentials using AssumeRole with 
API key validation. + async fn vend_with_api_key( + &self, + bucket: &str, + prefix: &str, + api_key: &str, + ) -> Result { + let salt = self + .config + .api_key_salt + .as_ref() + .ok_or_else(|| Error::InvalidInput { + source: "api_key_salt must be configured to use API key authentication".into(), + location: snafu::location!(), + })?; + + let key_hash = Self::hash_api_key(api_key, salt); + + // Look up permission from hash mapping + let permission = self + .config + .api_key_hash_permissions + .get(&key_hash) + .copied() + .ok_or_else(|| { + warn!( + "Invalid API key: hash {} not found in permissions map", + &key_hash[..8] + ); + Error::InvalidInput { + source: "Invalid API key".into(), + location: snafu::location!(), + } + })?; + + let policy = Self::build_policy(bucket, prefix, permission); + let session_name = Self::cap_session_name(&format!("lance-api-{}", &key_hash[..16])); + let duration_secs = self.config.duration_millis.div_ceil(1000).clamp(900, 43200) as i32; + + debug!( + "AWS AssumeRole with API key: role={}, session={}, permission={}", + self.config.role_arn, session_name, permission + ); + + let request = self + .sts_client + .assume_role() + .role_arn(&self.config.role_arn) + .role_session_name(&session_name) + .policy(&policy) + .duration_seconds(duration_secs) + .external_id(&key_hash); // Use hash as external_id + + let response = request.send().await.map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "AssumeRole with API key failed for role '{}': {}", + self.config.role_arn, e + ))), + location: snafu::location!(), + })?; + + self.extract_credentials(response.credentials(), bucket, prefix, permission) + } + + /// Vend credentials using AssumeRole with static configuration. 
+ async fn vend_with_static_config( + &self, + bucket: &str, + prefix: &str, + policy: &str, + ) -> Result { + let role_session_name = self + .config + .role_session_name + .clone() + .unwrap_or_else(|| "lance-credential-vending".to_string()); + let role_session_name = Self::cap_session_name(&role_session_name); + + let duration_secs = self.config.duration_millis.div_ceil(1000).clamp(900, 43200) as i32; + + debug!( + "AWS AssumeRole (static): role={}, session={}, permission={}", + self.config.role_arn, role_session_name, self.config.permission + ); + + let mut request = self + .sts_client + .assume_role() + .role_arn(&self.config.role_arn) + .role_session_name(&role_session_name) + .policy(policy) + .duration_seconds(duration_secs); + + if let Some(ref external_id) = self.config.external_id { + request = request.external_id(external_id); + } + + let response = request.send().await.map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "AssumeRole failed for role '{}': {}", + self.config.role_arn, e + ))), + location: snafu::location!(), + })?; + + self.extract_credentials( + response.credentials(), + bucket, + prefix, + self.config.permission, + ) + } +} + +#[async_trait] +impl CredentialVendor for AwsCredentialVendor { + async fn vend_credentials( + &self, + table_location: &str, + identity: Option<&Identity>, + ) -> Result { + debug!( + "AWS credential vending: location={}, permission={}, has_identity={}", + table_location, + self.config.permission, + identity.is_some() + ); + + let (bucket, prefix) = Self::parse_s3_uri(table_location)?; + + match identity { + Some(id) if id.auth_token.is_some() => { + // Use AssumeRoleWithWebIdentity with configured permission + let policy = Self::build_policy(&bucket, &prefix, self.config.permission); + self.vend_with_web_identity( + &bucket, + &prefix, + id.auth_token.as_ref().unwrap(), + &policy, + ) + .await + } + Some(id) if id.api_key.is_some() => { + // Use AssumeRole with API key validation and 
mapped permission + self.vend_with_api_key(&bucket, &prefix, id.api_key.as_ref().unwrap()) + .await + } + Some(_) => { + // Identity provided but neither api_key nor auth_token set + Err(Error::InvalidInput { + source: "Identity provided but neither api_key nor auth_token is set".into(), + location: snafu::location!(), + }) + } + None => { + // Use AssumeRole with static configuration + let policy = Self::build_policy(&bucket, &prefix, self.config.permission); + self.vend_with_static_config(&bucket, &prefix, &policy) + .await + } + } + } + fn provider_name(&self) -> &'static str { "aws" } @@ -543,7 +815,7 @@ mod tests { .expect("should create read vendor"); let read_creds = read_vendor - .vend_credentials(&table_location) + .vend_credentials(&table_location, None) .await .expect("should vend read credentials"); @@ -582,7 +854,7 @@ mod tests { .expect("should create admin vendor"); let admin_creds = admin_vendor - .vend_credentials(&table_location) + .vend_credentials(&table_location, None) .await .expect("should vend admin credentials"); @@ -627,8 +899,7 @@ mod tests { // Create a child namespace let create_ns_req = CreateNamespaceRequest { id: Some(vec!["test_ns".to_string()]), - properties: None, - mode: None, + ..Default::default() }; namespace .create_namespace(create_ns_req) @@ -640,6 +911,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["test_ns".to_string(), "test_table".to_string()]), mode: Some("Create".to_string()), + ..Default::default() }; let create_response = namespace .create_table(create_table_req, table_data) @@ -704,8 +976,7 @@ mod tests { // List tables to verify the table was created let list_req = ListTablesRequest { id: Some(vec!["test_ns".to_string()]), - page_token: None, - limit: None, + ..Default::default() }; let list_response = namespace .list_tables(list_req) @@ -719,6 +990,7 @@ mod tests { // Clean up: drop the table let drop_req = DropTableRequest { id: Some(vec!["test_ns".to_string(), 
"test_table".to_string()]), + ..Default::default() }; namespace .drop_table(drop_req) @@ -755,12 +1027,12 @@ mod tests { // Vend credentials multiple times to verify consistent behavior let creds1 = vendor - .vend_credentials(&table_location) + .vend_credentials(&table_location, None) .await .expect("should vend credentials first time"); let creds2 = vendor - .vend_credentials(&table_location) + .vend_credentials(&table_location, None) .await .expect("should vend credentials second time"); @@ -802,13 +1074,13 @@ mod tests { // Vend credentials for table1 let creds1 = vendor - .vend_credentials(&table1_location) + .vend_credentials(&table1_location, None) .await .expect("should vend credentials for table1"); // Vend credentials for table2 let creds2 = vendor - .vend_credentials(&table2_location) + .vend_credentials(&table2_location, None) .await .expect("should vend credentials for table2"); @@ -861,8 +1133,7 @@ mod tests { // Verify namespace works let create_ns_req = CreateNamespaceRequest { id: Some(vec!["props_test".to_string()]), - properties: None, - mode: None, + ..Default::default() }; namespace .create_namespace(create_ns_req) diff --git a/rust/lance-namespace-impls/src/credentials/azure.rs b/rust/lance-namespace-impls/src/credentials/azure.rs index 1d4e4ded081..75a711b7448 100644 --- a/rust/lance-namespace-impls/src/credentials/azure.rs +++ b/rust/lance-namespace-impls/src/credentials/azure.rs @@ -13,10 +13,14 @@ use async_trait::async_trait; use azure_core::auth::TokenCredential; use azure_identity::DefaultAzureCredential; use azure_storage::prelude::*; +use azure_storage::shared_access_signature::service_sas::{BlobSharedAccessSignature, SasKey}; use azure_storage_blobs::prelude::*; +use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine}; use lance_core::{Error, Result}; use lance_io::object_store::uri_to_url; +use lance_namespace::models::Identity; use log::{debug, info, warn}; +use sha2::{Digest, Sha256}; use super::{ redact_credential, 
CredentialVendor, VendedCredentials, VendedPermission, @@ -38,7 +42,22 @@ pub struct AzureCredentialVendorConfig { /// Permission level for vended credentials. /// Default: Read (full read access) + /// Used to generate SAS permissions for all credential flows. pub permission: VendedPermission, + + /// Client ID of the Azure AD App Registration for Workload Identity Federation. + /// Required when using auth_token identity for OIDC token exchange. + pub federated_client_id: Option, + + /// Salt for API key hashing. + /// Required when using API key authentication. + /// API keys are hashed as: SHA256(api_key + ":" + salt) + pub api_key_salt: Option, + + /// Map of SHA256(api_key + ":" + salt) -> permission level. + /// When an API key is provided, its hash is looked up in this map. + /// If found, the mapped permission is used instead of the default permission. + pub api_key_hash_permissions: HashMap, } impl Default for AzureCredentialVendorConfig { @@ -48,6 +67,9 @@ impl Default for AzureCredentialVendorConfig { account_name: None, duration_millis: DEFAULT_CREDENTIAL_DURATION_MILLIS, permission: VendedPermission::default(), + federated_client_id: None, + api_key_salt: None, + api_key_hash_permissions: HashMap::new(), } } } @@ -81,18 +103,105 @@ impl AzureCredentialVendorConfig { self.permission = permission; self } + + /// Set the federated client ID for Workload Identity Federation. + pub fn with_federated_client_id(mut self, client_id: impl Into) -> Self { + self.federated_client_id = Some(client_id.into()); + self + } + + /// Set the API key salt for hashing. + pub fn with_api_key_salt(mut self, salt: impl Into) -> Self { + self.api_key_salt = Some(salt.into()); + self + } + + /// Add an API key hash to permission mapping. 
+ pub fn with_api_key_hash_permission( + mut self, + key_hash: impl Into, + permission: VendedPermission, + ) -> Self { + self.api_key_hash_permissions + .insert(key_hash.into(), permission); + self + } + + /// Set the entire API key hash permissions map. + pub fn with_api_key_hash_permissions( + mut self, + permissions: HashMap, + ) -> Self { + self.api_key_hash_permissions = permissions; + self + } } /// Azure credential vendor that generates SAS tokens. #[derive(Debug)] pub struct AzureCredentialVendor { config: AzureCredentialVendorConfig, + http_client: reqwest::Client, } impl AzureCredentialVendor { /// Create a new Azure credential vendor with the specified configuration. pub fn new(config: AzureCredentialVendorConfig) -> Self { - Self { config } + Self { + config, + http_client: reqwest::Client::new(), + } + } + + /// Hash an API key using SHA-256 with salt (Polaris pattern). + /// Format: SHA256(api_key + ":" + salt) as hex string. + pub fn hash_api_key(api_key: &str, salt: &str) -> String { + let mut hasher = Sha256::new(); + hasher.update(format!("{}:{}", api_key, salt)); + format!("{:x}", hasher.finalize()) + } + + /// Extract a session name from a JWT token (best effort, no validation). + /// Decodes the payload and extracts 'sub' or 'email' claim. + /// Falls back to "lance-azure-identity" if parsing fails. 
+ fn derive_session_name_from_token(token: &str) -> String { + // JWT format: header.payload.signature + let parts: Vec<&str> = token.split('.').collect(); + if parts.len() != 3 { + return "lance-azure-identity".to_string(); + } + + // Decode the payload (second part) + let payload = match URL_SAFE_NO_PAD.decode(parts[1]) { + Ok(bytes) => bytes, + Err(_) => { + // Try standard base64 as fallback + match base64::engine::general_purpose::STANDARD_NO_PAD.decode(parts[1]) { + Ok(bytes) => bytes, + Err(_) => return "lance-azure-identity".to_string(), + } + } + }; + + // Parse as JSON and extract 'sub' or 'email' + let json: serde_json::Value = match serde_json::from_slice(&payload) { + Ok(v) => v, + Err(_) => return "lance-azure-identity".to_string(), + }; + + let subject = json + .get("sub") + .or_else(|| json.get("email")) + .and_then(|v| v.as_str()) + .unwrap_or("unknown"); + + // Sanitize: keep only alphanumeric, @, -, . + let sanitized: String = subject + .chars() + .filter(|c| c.is_alphanumeric() || *c == '@' || *c == '-' || *c == '.') + .collect(); + + format!("lance-{}", sanitized) } /// Build SAS permissions based on the VendedPermission level. @@ -196,61 +305,596 @@ impl AzureCredentialVendor { Ok((token, expires_at_millis)) } -} -#[async_trait] -impl CredentialVendor for AzureCredentialVendor { - async fn vend_credentials(&self, table_location: &str) -> Result { - debug!( - "Azure credential vending: location={}, permission={}", - table_location, self.config.permission - ); + /// Generate a SAS token with a specific permission level. 
+ async fn generate_sas_token_with_permission( + &self, + account: &str, + container: &str, + permission: VendedPermission, + ) -> Result<(String, u64)> { + let credential = + DefaultAzureCredential::create(azure_identity::TokenCredentialOptions::default()) + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to create Azure credentials: {}", + e + ))), + location: snafu::location!(), + })?; - let url = uri_to_url(table_location)?; + let credential: Arc = Arc::new(credential); + let blob_service_client = BlobServiceClient::new(account, credential.clone()); - let container = url.host_str().ok_or_else(|| Error::InvalidInput { - source: format!("Azure URI '{}' missing container", table_location).into(), + let now = time::OffsetDateTime::now_utc(); + let duration_millis = self.config.duration_millis as i64; + let end_time = now + time::Duration::milliseconds(duration_millis); + + let max_key_end = now + time::Duration::days(7) - time::Duration::seconds(60); + let key_end_time = if end_time > max_key_end { + max_key_end + } else { + end_time + }; + + let user_delegation_key = blob_service_client + .get_user_deligation_key(now, key_end_time) + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to get user delegation key for account '{}': {}", + account, e + ))), + location: snafu::location!(), + })?; + + let permissions = Self::build_sas_permissions(permission); + let container_client = blob_service_client.container_client(container); + + let sas_token = container_client + .user_delegation_shared_access_signature( + permissions, + &user_delegation_key.user_deligation_key, + ) + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to generate SAS token for container '{}': {}", + container, e + ))), + location: snafu::location!(), + })?; + + let expires_at_millis = + (end_time.unix_timestamp() * 1000 + end_time.millisecond() as i64) as u64; + + let token = 
sas_token.token().map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to get SAS token: {}", + e + ))), location: snafu::location!(), })?; - // Check if path extends beyond container level - let path = url.path().trim_start_matches('/'); - if !path.is_empty() { - warn!( - "Azure SAS tokens are scoped to container level only. \ - Credentials for '{}' will have access to entire container '{}', not just path '{}'", - table_location, container, path - ); - } + Ok((token, expires_at_millis)) + } - let account = + /// Generate a directory-scoped SAS token. + /// + /// Unlike container-level SAS tokens, this restricts access to a specific directory + /// path within the container. This is more secure for multi-tenant scenarios. + /// + /// # Arguments + /// * `account` - Storage account name + /// * `container` - Container name + /// * `path` - Directory path within the container (e.g., "tenant-a/tables/my-table") + /// * `permission` - Permission level for the SAS token + async fn generate_directory_sas_token( + &self, + account: &str, + container: &str, + path: &str, + permission: VendedPermission, + ) -> Result<(String, u64)> { + let credential = + DefaultAzureCredential::create(azure_identity::TokenCredentialOptions::default()) + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to create Azure credentials: {}", + e + ))), + location: snafu::location!(), + })?; + + let credential: Arc = Arc::new(credential); + let blob_service_client = BlobServiceClient::new(account, credential.clone()); + + let now = time::OffsetDateTime::now_utc(); + let duration_millis = self.config.duration_millis as i64; + let end_time = now + time::Duration::milliseconds(duration_millis); + + let max_key_end = now + time::Duration::days(7) - time::Duration::seconds(60); + let key_end_time = if end_time > max_key_end { + max_key_end + } else { + end_time + }; + + let user_delegation_key = blob_service_client + 
.get_user_deligation_key(now, key_end_time) + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to get user delegation key for account '{}': {}", + account, e + ))), + location: snafu::location!(), + })?; + + // Normalize path: remove leading/trailing slashes + let normalized_path = path.trim_matches('/'); + let depth = if normalized_path.is_empty() { + 0 + } else { + normalized_path.split('/').count() + }; + + // Build canonical resource path for directory-level SAS + let canonical_resource = format!("/blob/{}/{}/{}", account, container, normalized_path); + + // Convert user delegation key to SasKey + let sas_key = SasKey::UserDelegationKey(user_delegation_key.user_deligation_key); + + let permissions = Self::build_sas_permissions(permission); + + // Create directory-scoped SAS signature + let sas = BlobSharedAccessSignature::new( + sas_key, + canonical_resource, + permissions, + end_time, + BlobSignedResource::Directory, + ) + .signed_directory_depth(depth as u8); + + let token = sas.token().map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to generate directory SAS token: {}", + e + ))), + location: snafu::location!(), + })?; + + let expires_at_millis = + (end_time.unix_timestamp() * 1000 + end_time.millisecond() as i64) as u64; + + info!( + "Azure directory-scoped SAS generated: account={}, container={}, path={}, depth={}, permission={}", + account, container, normalized_path, depth, permission + ); + + Ok((token, expires_at_millis)) + } + + /// Exchange an OIDC token for Azure AD access token using Workload Identity Federation. + /// + /// This requires: + /// 1. An Azure AD App Registration with Federated Credentials configured + /// 2. 
The OIDC token's issuer and subject to match the Federated Credential configuration + async fn exchange_oidc_for_azure_token(&self, oidc_token: &str) -> Result { + let tenant_id = self + .config + .tenant_id + .as_ref() + .ok_or_else(|| Error::InvalidInput { + source: "azure_tenant_id must be configured for OIDC token exchange".into(), + location: snafu::location!(), + })?; + + let client_id = self.config - .account_name + .federated_client_id .as_ref() .ok_or_else(|| Error::InvalidInput { - source: "Azure credential vending requires 'credential_vendor.azure_account_name' to be set in configuration".into(), + source: "azure_federated_client_id must be configured for OIDC token exchange" + .into(), + location: snafu::location!(), + })?; + + let token_url = format!( + "https://login.microsoftonline.com/{}/oauth2/v2.0/token", + tenant_id + ); + + let params = [ + ("grant_type", "client_credentials"), + ( + "client_assertion_type", + "urn:ietf:params:oauth:client-assertion-type:jwt-bearer", + ), + ("client_assertion", oidc_token), + ("client_id", client_id), + ("scope", "https://storage.azure.com/.default"), + ]; + + let response = self + .http_client + .post(&token_url) + .form(¶ms) + .send() + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to exchange OIDC token for Azure AD token: {}", + e + ))), + location: snafu::location!(), + })?; + + if !response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + return Err(Error::IO { + source: Box::new(std::io::Error::other(format!( + "Azure AD token exchange failed with status {}: {}", + status, body + ))), + location: snafu::location!(), + }); + } + + let token_response: serde_json::Value = response.json().await.map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to parse Azure AD token response: {}", + e + ))), + location: snafu::location!(), + })?; + + token_response + 
.get("access_token") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .ok_or_else(|| Error::IO { + source: Box::new(std::io::Error::other( + "Azure AD token response missing access_token", + )), + location: snafu::location!(), + }) + } + + /// Generate a SAS token using a federated Azure AD token. + /// + /// Uses directory-scoped SAS when path is provided, container-level otherwise. + async fn generate_sas_with_azure_token( + &self, + azure_token: &str, + account: &str, + container: &str, + path: &str, + permission: VendedPermission, + ) -> Result<(String, u64)> { + // Create a custom TokenCredential that uses our Azure AD token + let credential = FederatedTokenCredential::new(azure_token.to_string()); + let credential: Arc = Arc::new(credential); + + let blob_service_client = BlobServiceClient::new(account, credential.clone()); + + let now = time::OffsetDateTime::now_utc(); + let duration_millis = self.config.duration_millis as i64; + let end_time = now + time::Duration::milliseconds(duration_millis); + + let max_key_end = now + time::Duration::days(7) - time::Duration::seconds(60); + let key_end_time = if end_time > max_key_end { + max_key_end + } else { + end_time + }; + + let user_delegation_key = blob_service_client + .get_user_deligation_key(now, key_end_time) + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to get user delegation key with federated token: {}", + e + ))), + location: snafu::location!(), + })?; + + let permissions = Self::build_sas_permissions(permission); + + let expires_at_millis = + (end_time.unix_timestamp() * 1000 + end_time.millisecond() as i64) as u64; + + // Use directory-scoped SAS when path is provided + let normalized_path = path.trim_matches('/'); + let token = if normalized_path.is_empty() { + // Container-level SAS + let container_client = blob_service_client.container_client(container); + let sas_token = container_client + .user_delegation_shared_access_signature( + 
permissions, + &user_delegation_key.user_deligation_key, + ) + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to generate SAS token with federated token: {}", + e + ))), location: snafu::location!(), })?; - let (sas_token, expires_at_millis) = self.generate_sas_token(account, container).await?; + sas_token.token().map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to get SAS token: {}", + e + ))), + location: snafu::location!(), + })? + } else { + // Directory-scoped SAS + let depth = normalized_path.split('/').count(); + let canonical_resource = format!("/blob/{}/{}/{}", account, container, normalized_path); + let sas_key = SasKey::UserDelegationKey(user_delegation_key.user_deligation_key); + + let sas = BlobSharedAccessSignature::new( + sas_key, + canonical_resource, + permissions, + end_time, + BlobSignedResource::Directory, + ) + .signed_directory_depth(depth as u8); + + sas.token().map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to generate directory SAS token with federated token: {}", + e + ))), + location: snafu::location!(), + })? + }; + + Ok((token, expires_at_millis)) + } + + /// Vend credentials using Workload Identity Federation (for auth_token). 
+ async fn vend_with_web_identity( + &self, + account: &str, + container: &str, + path: &str, + auth_token: &str, + ) -> Result { + let session_name = Self::derive_session_name_from_token(auth_token); + debug!( + "Azure vend_with_web_identity: account={}, container={}, path={}, session={}", + account, container, path, session_name + ); + + // Exchange OIDC token for Azure AD token + let azure_token = self.exchange_oidc_for_azure_token(auth_token).await?; + + // Generate SAS token using the Azure AD token + // Use directory-scoped SAS when path is provided + let (sas_token, expires_at_millis) = self + .generate_sas_with_azure_token( + &azure_token, + account, + container, + path, + self.config.permission, + ) + .await?; let mut storage_options = HashMap::new(); - // Use the standard key that object_store/lance-io expects storage_options.insert("azure_storage_sas_token".to_string(), sas_token.clone()); - storage_options.insert("azure_storage_account_name".to_string(), account.clone()); + storage_options.insert( + "azure_storage_account_name".to_string(), + account.to_string(), + ); storage_options.insert( "expires_at_millis".to_string(), expires_at_millis.to_string(), ); info!( - "Azure credentials vended: account={}, container={}, permission={}, expires_at={}, sas_token={}", - account, container, self.config.permission, expires_at_millis, redact_credential(&sas_token) + "Azure credentials vended (web identity): account={}, container={}, path={}, permission={}, expires_at={}, sas_token={}", + account, container, path, self.config.permission, expires_at_millis, redact_credential(&sas_token) ); Ok(VendedCredentials::new(storage_options, expires_at_millis)) } + /// Vend credentials using API key validation. 
+ async fn vend_with_api_key( + &self, + account: &str, + container: &str, + path: &str, + api_key: &str, + ) -> Result { + let salt = self + .config + .api_key_salt + .as_ref() + .ok_or_else(|| Error::InvalidInput { + source: "api_key_salt must be configured to use API key authentication".into(), + location: snafu::location!(), + })?; + + let key_hash = Self::hash_api_key(api_key, salt); + + // Look up permission from hash mapping + let permission = self + .config + .api_key_hash_permissions + .get(&key_hash) + .copied() + .ok_or_else(|| { + warn!( + "Invalid API key: hash {} not found in permissions map", + &key_hash[..8] + ); + Error::InvalidInput { + source: "Invalid API key".into(), + location: snafu::location!(), + } + })?; + + debug!( + "Azure vend_with_api_key: account={}, container={}, path={}, permission={}", + account, container, path, permission + ); + + // Use directory-scoped SAS when path is provided, container-level otherwise + let (sas_token, expires_at_millis) = if path.is_empty() { + self.generate_sas_token_with_permission(account, container, permission) + .await? + } else { + self.generate_directory_sas_token(account, container, path, permission) + .await? + }; + + let mut storage_options = HashMap::new(); + storage_options.insert("azure_storage_sas_token".to_string(), sas_token.clone()); + storage_options.insert( + "azure_storage_account_name".to_string(), + account.to_string(), + ); + storage_options.insert( + "expires_at_millis".to_string(), + expires_at_millis.to_string(), + ); + + info!( + "Azure credentials vended (api_key): account={}, container={}, path={}, permission={}, expires_at={}, sas_token={}", + account, container, path, permission, expires_at_millis, redact_credential(&sas_token) + ); + + Ok(VendedCredentials::new(storage_options, expires_at_millis)) + } +} + +/// A custom TokenCredential that wraps a pre-obtained Azure AD access token. 
+#[derive(Debug)] +struct FederatedTokenCredential { + token: String, +} + +impl FederatedTokenCredential { + fn new(token: String) -> Self { + Self { token } + } +} + +#[async_trait] +impl TokenCredential for FederatedTokenCredential { + async fn get_token( + &self, + _scopes: &[&str], + ) -> std::result::Result { + // Return the pre-obtained token with a 1-hour expiry (conservative estimate) + let expires_on = time::OffsetDateTime::now_utc() + time::Duration::hours(1); + Ok(azure_core::auth::AccessToken::new( + azure_core::auth::Secret::new(self.token.clone()), + expires_on, + )) + } + + async fn clear_cache(&self) -> std::result::Result<(), azure_core::Error> { + Ok(()) + } +} + +#[async_trait] +impl CredentialVendor for AzureCredentialVendor { + async fn vend_credentials( + &self, + table_location: &str, + identity: Option<&Identity>, + ) -> Result { + debug!( + "Azure credential vending: location={}, permission={}, identity={:?}", + table_location, + self.config.permission, + identity.map(|i| format!( + "api_key={}, auth_token={}", + i.api_key.is_some(), + i.auth_token.is_some() + )) + ); + + let url = uri_to_url(table_location)?; + + let container = url.host_str().ok_or_else(|| Error::InvalidInput { + source: format!("Azure URI '{}' missing container", table_location).into(), + location: snafu::location!(), + })?; + + // Extract path for directory-scoped SAS + let path = url.path().trim_start_matches('/'); + + let account = + self.config + .account_name + .as_ref() + .ok_or_else(|| Error::InvalidInput { + source: "Azure credential vending requires 'credential_vendor.azure_account_name' to be set in configuration".into(), + location: snafu::location!(), + })?; + + // Dispatch based on identity + match identity { + Some(id) if id.auth_token.is_some() => { + let auth_token = id.auth_token.as_ref().unwrap(); + self.vend_with_web_identity(account, container, path, auth_token) + .await + } + Some(id) if id.api_key.is_some() => { + let api_key = 
id.api_key.as_ref().unwrap(); + self.vend_with_api_key(account, container, path, api_key) + .await + } + Some(_) => Err(Error::InvalidInput { + source: "Identity provided but neither auth_token nor api_key is set".into(), + location: snafu::location!(), + }), + None => { + // Static credential vending using DefaultAzureCredential + // Use directory-scoped SAS when path is provided, container-level otherwise + let (sas_token, expires_at_millis) = if path.is_empty() { + self.generate_sas_token(account, container).await? + } else { + self.generate_directory_sas_token( + account, + container, + path, + self.config.permission, + ) + .await? + }; + + let mut storage_options = HashMap::new(); + storage_options.insert("azure_storage_sas_token".to_string(), sas_token.clone()); + storage_options.insert("azure_storage_account_name".to_string(), account.clone()); + storage_options.insert( + "expires_at_millis".to_string(), + expires_at_millis.to_string(), + ); + + info!( + "Azure credentials vended (static): account={}, container={}, path={}, permission={}, expires_at={}, sas_token={}", + account, container, path, self.config.permission, expires_at_millis, redact_credential(&sas_token) + ); + + Ok(VendedCredentials::new(storage_options, expires_at_millis)) + } + } + } + fn provider_name(&self) -> &'static str { "azure" } diff --git a/rust/lance-namespace-impls/src/credentials/cache.rs b/rust/lance-namespace-impls/src/credentials/cache.rs new file mode 100644 index 00000000000..6e7c6c4dcf7 --- /dev/null +++ b/rust/lance-namespace-impls/src/credentials/cache.rs @@ -0,0 +1,438 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Credential caching for cloud storage access. +//! +//! This module provides a caching wrapper for credential vendors that reduces +//! the number of credential vending requests (e.g., STS calls) by caching +//! credentials until they are close to expiration. +//! +//! ## Caching Strategy +//! +//! 
- **Cache Key**: Table location + identity hash (api_key hash or auth_token hash) +//! - **TTL**: Half of the credential's remaining lifetime, capped at 30 minutes +//! - **Eviction**: Credentials are evicted when TTL expires or when explicitly cleared +//! +//! ## Example +//! +//! ```ignore +//! use lance_namespace_impls::credentials::cache::CachingCredentialVendor; +//! +//! let vendor = AwsCredentialVendor::new(config).await?; +//! let cached_vendor = CachingCredentialVendor::new(Box::new(vendor)); +//! +//! // First call hits the underlying vendor +//! let creds1 = cached_vendor.vend_credentials("s3://bucket/table", None).await?; +//! +//! // Subsequent calls within TTL return cached credentials +//! let creds2 = cached_vendor.vend_credentials("s3://bucket/table", None).await?; +//! ``` + +use std::collections::HashMap; +use std::hash::{Hash, Hasher}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use async_trait::async_trait; +use lance_core::Result; +use lance_namespace::models::Identity; +use log::debug; +use tokio::sync::RwLock; + +use super::{CredentialVendor, VendedCredentials, VendedPermission}; + +/// Maximum cache TTL: 30 minutes. +/// Even if credentials are valid for longer, we refresh more frequently +/// to handle clock skew and ensure freshness. +const MAX_CACHE_TTL_SECS: u64 = 30 * 60; + +/// Minimum cache TTL: 1 minute. +/// If credentials expire sooner than this, we don't cache them. +const MIN_CACHE_TTL_SECS: u64 = 60; + +/// A cached credential entry with expiration tracking. 
+#[derive(Clone)]
+struct CacheEntry {
+    credentials: VendedCredentials,
+    /// When this cache entry should be considered stale
+    cached_until: Instant,
+}
+
+impl std::fmt::Debug for CacheEntry {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("CacheEntry")
+            .field("credentials", &"[redacted]")
+            .field("cached_until", &self.cached_until)
+            .finish()
+    }
+}
+
+impl CacheEntry {
+    fn is_stale(&self) -> bool {
+        Instant::now() >= self.cached_until
+    }
+}
+
+/// A caching wrapper for credential vendors.
+///
+/// This wrapper caches vended credentials to reduce the number of underlying
+/// credential vending operations (e.g., STS calls). Credentials are cached
+/// until half their lifetime has passed, capped at 30 minutes.
+#[derive(Debug)]
+pub struct CachingCredentialVendor {
+    inner: Box<dyn CredentialVendor>,
+    cache: Arc<RwLock<HashMap<String, CacheEntry>>>,
+}
+
+impl CachingCredentialVendor {
+    /// Create a new caching credential vendor wrapping the given vendor.
+    pub fn new(inner: Box<dyn CredentialVendor>) -> Self {
+        Self {
+            inner,
+            cache: Arc::new(RwLock::new(HashMap::new())),
+        }
+    }
+
+    /// Build a cache key from the table location and identity.
+    ///
+    /// The key is a hash of the location and identity fields to ensure
+    /// different identities get different cached credentials.
+ fn build_cache_key(table_location: &str, identity: Option<&Identity>) -> String { + let mut hasher = std::collections::hash_map::DefaultHasher::new(); + + table_location.hash(&mut hasher); + + if let Some(id) = identity { + if let Some(ref api_key) = id.api_key { + ":api_key:".hash(&mut hasher); + api_key.hash(&mut hasher); + } + if let Some(ref auth_token) = id.auth_token { + ":auth_token:".hash(&mut hasher); + // Only hash first 64 chars of token to avoid memory issues with large tokens + let token_prefix = if auth_token.len() > 64 { + &auth_token[..64] + } else { + auth_token.as_str() + }; + token_prefix.hash(&mut hasher); + } + } else { + ":no_identity".hash(&mut hasher); + } + + format!("{:016x}", hasher.finish()) + } + + /// Calculate the cache TTL for the given credentials. + /// + /// Returns the TTL as a Duration, or None if the credentials should not be cached. + fn calculate_cache_ttl(credentials: &VendedCredentials) -> Option { + let now_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .expect("time went backwards") + .as_millis() as u64; + + if credentials.expires_at_millis <= now_millis { + // Already expired + return None; + } + + let remaining_millis = credentials.expires_at_millis - now_millis; + let remaining_secs = remaining_millis / 1000; + + // TTL is half the remaining lifetime + let ttl_secs = remaining_secs / 2; + + // Cap between MIN and MAX + if ttl_secs < MIN_CACHE_TTL_SECS { + None // Don't cache if TTL is too short + } else { + Some(Duration::from_secs(ttl_secs.min(MAX_CACHE_TTL_SECS))) + } + } + + /// Clear all cached credentials. + pub async fn clear_cache(&self) { + let mut cache = self.cache.write().await; + cache.clear(); + debug!("Credential cache cleared"); + } + + /// Get the number of cached entries. + pub async fn cache_size(&self) -> usize { + let cache = self.cache.read().await; + cache.len() + } + + /// Remove stale entries from the cache. 
+ pub async fn evict_stale(&self) -> usize { + let mut cache = self.cache.write().await; + let before = cache.len(); + cache.retain(|_, entry| !entry.is_stale()); + let evicted = before - cache.len(); + if evicted > 0 { + debug!("Evicted {} stale credential cache entries", evicted); + } + evicted + } +} + +#[async_trait] +impl CredentialVendor for CachingCredentialVendor { + async fn vend_credentials( + &self, + table_location: &str, + identity: Option<&Identity>, + ) -> Result { + let cache_key = Self::build_cache_key(table_location, identity); + + // Try to get from cache first + { + let cache = self.cache.read().await; + if let Some(entry) = cache.get(&cache_key) { + if !entry.is_stale() && !entry.credentials.is_expired() { + debug!( + "Credential cache hit for location={}, provider={}", + table_location, + self.inner.provider_name() + ); + return Ok(entry.credentials.clone()); + } + } + } + + // Cache miss or stale - vend new credentials + debug!( + "Credential cache miss for location={}, provider={}", + table_location, + self.inner.provider_name() + ); + + let credentials = self + .inner + .vend_credentials(table_location, identity) + .await?; + + // Cache the new credentials if TTL is sufficient + if let Some(ttl) = Self::calculate_cache_ttl(&credentials) { + let entry = CacheEntry { + credentials: credentials.clone(), + cached_until: Instant::now() + ttl, + }; + + let mut cache = self.cache.write().await; + cache.insert(cache_key, entry); + + debug!( + "Cached credentials for location={}, ttl={}s", + table_location, + ttl.as_secs() + ); + } + + Ok(credentials) + } + + fn provider_name(&self) -> &'static str { + self.inner.provider_name() + } + + fn permission(&self) -> VendedPermission { + self.inner.permission() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicU32, Ordering}; + + /// A mock credential vendor for testing. 
+ #[derive(Debug)] + struct MockVendor { + call_count: AtomicU32, + duration_millis: u64, + } + + impl MockVendor { + fn new(duration_millis: u64) -> Self { + Self { + call_count: AtomicU32::new(0), + duration_millis, + } + } + } + + #[async_trait] + impl CredentialVendor for MockVendor { + async fn vend_credentials( + &self, + _table_location: &str, + _identity: Option<&Identity>, + ) -> Result { + self.call_count.fetch_add(1, Ordering::SeqCst); + + let now_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + + let mut storage_options = HashMap::new(); + storage_options.insert("test_key".to_string(), "test_value".to_string()); + + Ok(VendedCredentials::new( + storage_options, + now_millis + self.duration_millis, + )) + } + + fn provider_name(&self) -> &'static str { + "mock" + } + + fn permission(&self) -> VendedPermission { + VendedPermission::Read + } + } + + #[test] + fn test_build_cache_key_no_identity() { + let key1 = CachingCredentialVendor::build_cache_key("s3://bucket/table1", None); + let key2 = CachingCredentialVendor::build_cache_key("s3://bucket/table2", None); + let key3 = CachingCredentialVendor::build_cache_key("s3://bucket/table1", None); + + assert_ne!(key1, key2, "Different locations should have different keys"); + assert_eq!(key1, key3, "Same location should have same key"); + } + + #[test] + fn test_build_cache_key_with_identity() { + let identity_api = Identity { + api_key: Some("my-api-key".to_string()), + auth_token: None, + }; + let identity_token = Identity { + api_key: None, + auth_token: Some("my-token".to_string()), + }; + + let key_no_id = CachingCredentialVendor::build_cache_key("s3://bucket/table", None); + let key_api = + CachingCredentialVendor::build_cache_key("s3://bucket/table", Some(&identity_api)); + let key_token = + CachingCredentialVendor::build_cache_key("s3://bucket/table", Some(&identity_token)); + + assert_ne!(key_no_id, key_api, "Identity should change 
key"); + assert_ne!(key_no_id, key_token, "Identity should change key"); + assert_ne!( + key_api, key_token, + "Different identity types should have different keys" + ); + } + + #[test] + fn test_calculate_cache_ttl() { + let now_millis = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap() + .as_millis() as u64; + + // Credentials with 1 hour remaining -> TTL should be 30 minutes (capped) + let creds_1h = VendedCredentials::new(HashMap::new(), now_millis + 3600 * 1000); + let ttl = CachingCredentialVendor::calculate_cache_ttl(&creds_1h); + assert_eq!(ttl, Some(Duration::from_secs(MAX_CACHE_TTL_SECS))); + + // Credentials with 10 minutes remaining -> TTL should be 5 minutes + let creds_10m = VendedCredentials::new(HashMap::new(), now_millis + 10 * 60 * 1000); + let ttl = CachingCredentialVendor::calculate_cache_ttl(&creds_10m); + assert_eq!(ttl, Some(Duration::from_secs(5 * 60))); + + // Credentials with 1 minute remaining -> TTL should be None (too short) + let creds_1m = VendedCredentials::new(HashMap::new(), now_millis + 60 * 1000); + let ttl = CachingCredentialVendor::calculate_cache_ttl(&creds_1m); + assert!(ttl.is_none(), "Should not cache short-lived credentials"); + + // Already expired credentials -> None + let creds_expired = VendedCredentials::new(HashMap::new(), now_millis - 1000); + let ttl = CachingCredentialVendor::calculate_cache_ttl(&creds_expired); + assert!(ttl.is_none(), "Should not cache expired credentials"); + } + + #[tokio::test] + async fn test_caching_reduces_calls() { + // Create a mock vendor with 1 hour credentials + let mock = MockVendor::new(3600 * 1000); + let cached = CachingCredentialVendor::new(Box::new(mock)); + + // First call should hit the underlying vendor + let _ = cached + .vend_credentials("s3://bucket/table", None) + .await + .unwrap(); + assert_eq!(cached.cache_size().await, 1); + + // Get reference to inner mock for call count + // We can't easily get the call count from the boxed trait, so 
we'll check cache size + + // Second call should use cache (cache size stays at 1) + let _ = cached + .vend_credentials("s3://bucket/table", None) + .await + .unwrap(); + assert_eq!(cached.cache_size().await, 1); + + // Different location should create new cache entry + let _ = cached + .vend_credentials("s3://bucket/table2", None) + .await + .unwrap(); + assert_eq!(cached.cache_size().await, 2); + } + + #[tokio::test] + async fn test_clear_cache() { + let mock = MockVendor::new(3600 * 1000); + let cached = CachingCredentialVendor::new(Box::new(mock)); + + let _ = cached + .vend_credentials("s3://bucket/table", None) + .await + .unwrap(); + assert_eq!(cached.cache_size().await, 1); + + cached.clear_cache().await; + assert_eq!(cached.cache_size().await, 0); + } + + #[tokio::test] + async fn test_different_identities_cached_separately() { + let mock = MockVendor::new(3600 * 1000); + let cached = CachingCredentialVendor::new(Box::new(mock)); + + let identity1 = Identity { + api_key: Some("key1".to_string()), + auth_token: None, + }; + let identity2 = Identity { + api_key: Some("key2".to_string()), + auth_token: None, + }; + + // Same location with different identities should cache separately + let _ = cached + .vend_credentials("s3://bucket/table", Some(&identity1)) + .await + .unwrap(); + let _ = cached + .vend_credentials("s3://bucket/table", Some(&identity2)) + .await + .unwrap(); + let _ = cached + .vend_credentials("s3://bucket/table", None) + .await + .unwrap(); + + assert_eq!(cached.cache_size().await, 3); + } +} diff --git a/rust/lance-namespace-impls/src/credentials/gcp.rs b/rust/lance-namespace-impls/src/credentials/gcp.rs index ce4bac40fa1..0749bdb1b97 100644 --- a/rust/lance-namespace-impls/src/credentials/gcp.rs +++ b/rust/lance-namespace-impls/src/credentials/gcp.rs @@ -44,12 +44,15 @@ use std::collections::HashMap; use async_trait::async_trait; +use base64::{engine::general_purpose::URL_SAFE_NO_PAD, Engine}; use google_cloud_auth::credentials; use 
lance_core::{Error, Result}; use lance_io::object_store::uri_to_url; -use log::{debug, info}; +use lance_namespace::models::Identity; +use log::{debug, info, warn}; use reqwest::Client; use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; use super::{redact_credential, CredentialVendor, VendedCredentials, VendedPermission}; @@ -79,6 +82,31 @@ pub struct GcpCredentialVendorConfig { /// Note: GCP token duration cannot be configured; the token lifetime /// is determined by the STS endpoint (typically 1 hour). pub permission: VendedPermission, + + /// Workload Identity Provider resource name for OIDC token exchange. + /// Required when using auth_token identity for Workload Identity Federation. + /// + /// Format: `projects/{project_number}/locations/global/workloadIdentityPools/{pool_id}/providers/{provider_id}` + /// + /// The OIDC token's issuer must match the provider's configuration. + pub workload_identity_provider: Option, + + /// Service account to impersonate after Workload Identity Federation. + /// Optional - if set, the exchanged token will be used to generate an + /// access token for this service account. + /// + /// Format: `my-sa@project.iam.gserviceaccount.com` + pub impersonation_service_account: Option, + + /// Salt for API key hashing. + /// Required when using API key authentication. + /// API keys are hashed as: SHA256(api_key + ":" + salt) + pub api_key_salt: Option, + + /// Map of SHA256(api_key + ":" + salt) -> permission level. + /// When an API key is provided, its hash is looked up in this map. + /// If found, the mapped permission is used instead of the default permission. + pub api_key_hash_permissions: HashMap, } impl GcpCredentialVendorConfig { @@ -104,6 +132,47 @@ impl GcpCredentialVendorConfig { self.permission = permission; self } + + /// Set the Workload Identity Provider for OIDC token exchange. 
+    pub fn with_workload_identity_provider(mut self, provider: impl Into<String>) -> Self {
+        self.workload_identity_provider = Some(provider.into());
+        self
+    }
+
+    /// Set the service account to impersonate after Workload Identity Federation.
+    pub fn with_impersonation_service_account(
+        mut self,
+        service_account: impl Into<String>,
+    ) -> Self {
+        self.impersonation_service_account = Some(service_account.into());
+        self
+    }
+
+    /// Set the API key salt for hashing.
+    pub fn with_api_key_salt(mut self, salt: impl Into<String>) -> Self {
+        self.api_key_salt = Some(salt.into());
+        self
+    }
+
+    /// Add an API key hash to permission mapping.
+    pub fn with_api_key_hash_permission(
+        mut self,
+        key_hash: impl Into<String>,
+        permission: VendedPermission,
+    ) -> Self {
+        self.api_key_hash_permissions
+            .insert(key_hash.into(), permission);
+        self
+    }
+
+    /// Set the entire API key hash permissions map.
+    pub fn with_api_key_hash_permissions(
+        mut self,
+        permissions: HashMap<String, VendedPermission>,
+    ) -> Self {
+        self.api_key_hash_permissions = permissions;
+        self
+    }
 }
 
 /// Access boundary rule for a single resource.
@@ -459,25 +528,237 @@ impl GcpCredentialVendor {
         Ok((token_response.access_token, expires_at_millis))
     }
-}
 
-#[async_trait]
-impl CredentialVendor for GcpCredentialVendor {
-    async fn vend_credentials(&self, table_location: &str) -> Result<VendedCredentials> {
+    /// Hash an API key using SHA-256 with salt (Polaris pattern).
+    /// Format: SHA256(api_key + ":" + salt) as hex string.
+    pub fn hash_api_key(api_key: &str, salt: &str) -> String {
+        let mut hasher = Sha256::new();
+        hasher.update(format!("{}:{}", api_key, salt));
+        format!("{:x}", hasher.finalize())
+    }
+
+    /// Extract a session name from a JWT token (best effort, no validation).
+    /// Decodes the payload and extracts 'sub' or 'email' claim.
+    /// Falls back to "lance-gcp-identity" if parsing fails.
+ fn derive_session_name_from_token(token: &str) -> String { + // JWT format: header.payload.signature + let parts: Vec<&str> = token.split('.').collect(); + if parts.len() != 3 { + return "lance-gcp-identity".to_string(); + } + + // Decode the payload (second part) + let payload = match URL_SAFE_NO_PAD.decode(parts[1]) { + Ok(bytes) => bytes, + Err(_) => { + // Try standard base64 as fallback + match base64::engine::general_purpose::STANDARD_NO_PAD.decode(parts[1]) { + Ok(bytes) => bytes, + Err(_) => return "lance-gcp-identity".to_string(), + } + } + }; + + // Parse as JSON and extract 'sub' or 'email' + let json: serde_json::Value = match serde_json::from_slice(&payload) { + Ok(v) => v, + Err(_) => return "lance-gcp-identity".to_string(), + }; + + let subject = json + .get("sub") + .or_else(|| json.get("email")) + .and_then(|v| v.as_str()) + .unwrap_or("unknown"); + + // Sanitize: keep only alphanumeric, @, -, . + let sanitized: String = subject + .chars() + .filter(|c| c.is_alphanumeric() || *c == '@' || *c == '-' || *c == '.') + .collect(); + + format!("lance-{}", sanitized) + } + + /// Normalize the Workload Identity Provider to the full audience format expected by GCP STS. + /// + /// GCP STS expects audience in the format: + /// `//iam.googleapis.com/projects/{project}/locations/global/workloadIdentityPools/{pool}/providers/{provider}` + /// + /// This function accepts either: + /// - Full format: `//iam.googleapis.com/projects/...` + /// - Short format: `projects/...` (will be prefixed with `//iam.googleapis.com/`) + fn normalize_workload_identity_audience(provider: &str) -> String { + const IAM_PREFIX: &str = "//iam.googleapis.com/"; + if provider.starts_with(IAM_PREFIX) { + provider.to_string() + } else { + format!("{}{}", IAM_PREFIX, provider) + } + } + + /// Exchange an OIDC token for GCP access token using Workload Identity Federation. + /// + /// This requires: + /// 1. A Workload Identity Pool and Provider configured in GCP + /// 2. 
The OIDC token's issuer to match the provider's configuration + /// 3. Optionally, a service account to impersonate after token exchange + async fn exchange_oidc_for_gcp_token(&self, oidc_token: &str) -> Result { + let workload_identity_provider = self + .config + .workload_identity_provider + .as_ref() + .ok_or_else(|| Error::InvalidInput { + source: "gcp_workload_identity_provider must be configured for OIDC token exchange" + .into(), + location: snafu::location!(), + })?; + + // Normalize audience to full format expected by GCP STS + let audience = Self::normalize_workload_identity_audience(workload_identity_provider); + + // Exchange OIDC token for GCP federated token via STS + let params = [ + ( + "grant_type", + "urn:ietf:params:oauth:grant-type:token-exchange", + ), + ("subject_token_type", "urn:ietf:params:oauth:token-type:jwt"), + ( + "requested_token_type", + "urn:ietf:params:oauth:token-type:access_token", + ), + ("subject_token", oidc_token), + ("audience", audience.as_str()), + ("scope", "https://www.googleapis.com/auth/cloud-platform"), + ]; + + let response = self + .http_client + .post(STS_TOKEN_EXCHANGE_URL) + .form(¶ms) + .send() + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to exchange OIDC token for GCP token: {}", + e + ))), + location: snafu::location!(), + })?; + + if !response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + return Err(Error::IO { + source: Box::new(std::io::Error::other(format!( + "GCP STS token exchange failed with status {}: {}", + status, body + ))), + location: snafu::location!(), + }); + } + + let token_response: TokenExchangeResponse = + response.json().await.map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to parse GCP STS token response: {}", + e + ))), + location: snafu::location!(), + })?; + + let federated_token = token_response.access_token; + + // If 
impersonation is configured, use the federated token to get an impersonated token + if let Some(ref service_account) = self.config.impersonation_service_account { + return self + .impersonate_service_account(&federated_token, service_account) + .await; + } + + Ok(federated_token) + } + + /// Vend credentials using Workload Identity Federation (for auth_token). + async fn vend_with_web_identity( + &self, + bucket: &str, + prefix: &str, + auth_token: &str, + ) -> Result { + let session_name = Self::derive_session_name_from_token(auth_token); debug!( - "GCP credential vending: location={}, permission={}", - table_location, self.config.permission + "GCP vend_with_web_identity: bucket={}, prefix={}, session={}", + bucket, prefix, session_name ); - let (bucket, prefix) = Self::parse_gcs_uri(table_location)?; + // Exchange OIDC token for GCP token + let gcp_token = self.exchange_oidc_for_gcp_token(auth_token).await?; - // Get source token from default credentials - let source_token = self.get_source_token().await?; + // Build access boundary and downscope + let access_boundary = Self::build_access_boundary(bucket, prefix, self.config.permission); + let (downscoped_token, expires_at_millis) = + self.downscope_token(&gcp_token, &access_boundary).await?; + + let mut storage_options = HashMap::new(); + storage_options.insert("google_storage_token".to_string(), downscoped_token.clone()); + storage_options.insert( + "expires_at_millis".to_string(), + expires_at_millis.to_string(), + ); + + info!( + "GCP credentials vended (web identity): bucket={}, prefix={}, permission={}, expires_at={}, token={}", + bucket, prefix, self.config.permission, expires_at_millis, redact_credential(&downscoped_token) + ); - // Build access boundary for this location and permission - let access_boundary = Self::build_access_boundary(&bucket, &prefix, self.config.permission); + Ok(VendedCredentials::new(storage_options, expires_at_millis)) + } + + /// Vend credentials using API key validation. 
+ async fn vend_with_api_key( + &self, + bucket: &str, + prefix: &str, + api_key: &str, + ) -> Result { + let salt = self + .config + .api_key_salt + .as_ref() + .ok_or_else(|| Error::InvalidInput { + source: "api_key_salt must be configured to use API key authentication".into(), + location: snafu::location!(), + })?; - // Exchange for downscoped token + let key_hash = Self::hash_api_key(api_key, salt); + + // Look up permission from hash mapping + let permission = self + .config + .api_key_hash_permissions + .get(&key_hash) + .copied() + .ok_or_else(|| { + warn!( + "Invalid API key: hash {} not found in permissions map", + &key_hash[..8] + ); + Error::InvalidInput { + source: "Invalid API key".into(), + location: snafu::location!(), + } + })?; + + debug!( + "GCP vend_with_api_key: bucket={}, prefix={}, permission={}", + bucket, prefix, permission + ); + + // Get source token using ADC and downscope with the API key's permission + let source_token = self.get_source_token().await?; + let access_boundary = Self::build_access_boundary(bucket, prefix, permission); let (downscoped_token, expires_at_millis) = self .downscope_token(&source_token, &access_boundary) .await?; @@ -490,16 +771,75 @@ impl CredentialVendor for GcpCredentialVendor { ); info!( - "GCP credentials vended: bucket={}, prefix={}, permission={}, expires_at={}, token={}", - bucket, - prefix, - self.config.permission, - expires_at_millis, - redact_credential(&downscoped_token) + "GCP credentials vended (api_key): bucket={}, prefix={}, permission={}, expires_at={}, token={}", + bucket, prefix, permission, expires_at_millis, redact_credential(&downscoped_token) ); Ok(VendedCredentials::new(storage_options, expires_at_millis)) } +} + +#[async_trait] +impl CredentialVendor for GcpCredentialVendor { + async fn vend_credentials( + &self, + table_location: &str, + identity: Option<&Identity>, + ) -> Result { + debug!( + "GCP credential vending: location={}, permission={}, identity={:?}", + table_location, + 
self.config.permission, + identity.map(|i| format!( + "api_key={}, auth_token={}", + i.api_key.is_some(), + i.auth_token.is_some() + )) + ); + + let (bucket, prefix) = Self::parse_gcs_uri(table_location)?; + + // Dispatch based on identity + match identity { + Some(id) if id.auth_token.is_some() => { + let auth_token = id.auth_token.as_ref().unwrap(); + self.vend_with_web_identity(&bucket, &prefix, auth_token) + .await + } + Some(id) if id.api_key.is_some() => { + let api_key = id.api_key.as_ref().unwrap(); + self.vend_with_api_key(&bucket, &prefix, api_key).await + } + Some(_) => Err(Error::InvalidInput { + source: "Identity provided but neither auth_token nor api_key is set".into(), + location: snafu::location!(), + }), + None => { + // Static credential vending using ADC + let source_token = self.get_source_token().await?; + let access_boundary = + Self::build_access_boundary(&bucket, &prefix, self.config.permission); + let (downscoped_token, expires_at_millis) = self + .downscope_token(&source_token, &access_boundary) + .await?; + + let mut storage_options = HashMap::new(); + storage_options + .insert("google_storage_token".to_string(), downscoped_token.clone()); + storage_options.insert( + "expires_at_millis".to_string(), + expires_at_millis.to_string(), + ); + + info!( + "GCP credentials vended (static): bucket={}, prefix={}, permission={}, expires_at={}, token={}", + bucket, prefix, self.config.permission, expires_at_millis, redact_credential(&downscoped_token) + ); + + Ok(VendedCredentials::new(storage_options, expires_at_millis)) + } + } + } fn provider_name(&self) -> &'static str { "gcp" @@ -634,4 +974,26 @@ mod tests { // No condition when prefix is empty (full bucket access) assert!(rules[0].availability_condition.is_none()); } + + #[test] + fn test_normalize_workload_identity_audience() { + // Short format should be prefixed + let short = + "projects/123456/locations/global/workloadIdentityPools/my-pool/providers/my-provider"; + let normalized = 
GcpCredentialVendor::normalize_workload_identity_audience(short); + assert_eq!( + normalized, + "//iam.googleapis.com/projects/123456/locations/global/workloadIdentityPools/my-pool/providers/my-provider" + ); + + // Full format should be unchanged + let full = "//iam.googleapis.com/projects/123456/locations/global/workloadIdentityPools/my-pool/providers/my-provider"; + let normalized = GcpCredentialVendor::normalize_workload_identity_audience(full); + assert_eq!(normalized, full); + + // Edge case: already has prefix (idempotent) + let normalized_again = + GcpCredentialVendor::normalize_workload_identity_audience(&normalized); + assert_eq!(normalized_again, full); + } } diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 91714d73d90..2168324a308 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -16,17 +16,18 @@ use lance::dataset::{Dataset, WriteParams}; use lance::session::Session; use lance_io::object_store::{ObjectStore, ObjectStoreParams, ObjectStoreRegistry}; use object_store::path::Path; +use object_store::{Error as ObjectStoreError, ObjectStore as OSObjectStore, PutMode, PutOptions}; use std::collections::HashMap; use std::io::Cursor; use std::sync::Arc; use lance_namespace::models::{ CreateEmptyTableRequest, CreateEmptyTableResponse, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DescribeNamespaceRequest, - DescribeNamespaceResponse, DescribeTableRequest, DescribeTableResponse, DropNamespaceRequest, - DropNamespaceResponse, DropTableRequest, DropTableResponse, ListNamespacesRequest, - ListNamespacesResponse, ListTablesRequest, ListTablesResponse, NamespaceExistsRequest, - TableExistsRequest, + CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeclareTableRequest, + DeclareTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, + DescribeTableRequest, DescribeTableResponse, DropNamespaceRequest, 
DropNamespaceResponse, + DropTableRequest, DropTableResponse, Identity, ListNamespacesRequest, ListNamespacesResponse, + ListTablesRequest, ListTablesResponse, NamespaceExistsRequest, TableExistsRequest, }; use lance_core::{box_error, Error, Result}; @@ -37,6 +38,19 @@ use crate::credentials::{ create_credential_vendor_for_location, has_credential_vendor_config, CredentialVendor, }; +/// Result of checking table status atomically. +/// +/// This struct captures the state of a table directory in a single snapshot, +/// avoiding race conditions between checking existence and other status flags. +pub(crate) struct TableStatus { + /// Whether the table directory exists (has any files) + pub(crate) exists: bool, + /// Whether the table has a `.lance-deregistered` marker file + pub(crate) is_deregistered: bool, + /// Whether the table has a `.lance-reserved` marker file (declared but not written) + pub(crate) has_reserved_file: bool, +} + /// Builder for creating a DirectoryNamespace. /// /// This builder provides a fluent API for configuring and establishing @@ -547,6 +561,13 @@ impl DirectoryNamespace { } let table_name = &path[..path.len() - 6]; + + // Use atomic check to skip deregistered tables and declared-but-not-written tables + let status = self.check_table_status(table_name).await; + if status.is_deregistered || status.has_reserved_file { + continue; + } + tables.push(table_name.to_string()); } @@ -608,6 +629,71 @@ impl DirectoryNamespace { .child(".lance-reserved") } + /// Get the deregistered marker file path for a table + fn table_deregistered_file_path(&self, table_name: &str) -> Path { + self.base_path + .child(format!("{}.lance", table_name).as_str()) + .child(".lance-deregistered") + } + + /// Atomically check table existence and deregistration status. + /// + /// This performs a single directory listing to get a consistent snapshot of the + /// table's state, avoiding race conditions between checking existence and + /// checking deregistration status. 
+ pub(crate) async fn check_table_status(&self, table_name: &str) -> TableStatus { + let table_path = self.table_path(table_name); + match self.object_store.read_dir(table_path).await { + Ok(entries) => { + let exists = !entries.is_empty(); + let is_deregistered = entries.iter().any(|e| e.ends_with(".lance-deregistered")); + let has_reserved_file = entries.iter().any(|e| e.ends_with(".lance-reserved")); + TableStatus { + exists, + is_deregistered, + has_reserved_file, + } + } + Err(_) => TableStatus { + exists: false, + is_deregistered: false, + has_reserved_file: false, + }, + } + } + + /// Atomically create a marker file using put_if_not_exists semantics. + /// + /// This uses `PutMode::Create` which will fail if the file already exists, + /// providing atomic creation semantics to avoid race conditions. + /// + /// Returns Ok(()) if the file was created successfully. + /// Returns Err with appropriate message if the file already exists or other error. + async fn put_marker_file_atomic( + &self, + path: &Path, + file_description: &str, + ) -> std::result::Result<(), String> { + let put_opts = PutOptions { + mode: PutMode::Create, + ..Default::default() + }; + + match self + .object_store + .inner + .put_opts(path, bytes::Bytes::new().into(), put_opts) + .await + { + Ok(_) => Ok(()), + Err(ObjectStoreError::AlreadyExists { .. }) + | Err(ObjectStoreError::Precondition { .. }) => { + Err(format!("{} already exists", file_description)) + } + Err(e) => Err(format!("Failed to create {}: {}", file_description, e)), + } + } + /// Get storage options for a table, using credential vending if configured. 
/// /// If credential vendor properties are configured and the table location matches @@ -626,12 +712,14 @@ impl DirectoryNamespace { /// # Arguments /// /// * `table_uri` - The full URI of the table + /// * `identity` - Optional identity from the request for identity-based credential vending async fn get_storage_options_for_table( &self, table_uri: &str, + identity: Option<&Identity>, ) -> Result>> { if let Some(ref vendor) = self.credential_vendor { - let vended = vendor.vend_credentials(table_uri).await?; + let vended = vendor.vend_credentials(table_uri, identity).await?; return Ok(Some(vended.storage_options)); } Ok(self.storage_options.clone()) @@ -742,8 +830,10 @@ impl LanceNamespace for DirectoryNamespace { } Self::validate_root_namespace_id(&request.id)?; + #[allow(clippy::needless_update)] Ok(DescribeNamespaceResponse { properties: Some(HashMap::new()), + ..Default::default() }) } @@ -876,7 +966,20 @@ impl LanceNamespace for DirectoryNamespace { async fn describe_table(&self, request: DescribeTableRequest) -> Result { if let Some(ref manifest_ns) = self.manifest_ns { match manifest_ns.describe_table(request.clone()).await { - Ok(response) => return Ok(response), + Ok(mut response) => { + // Only apply identity-based credential vending when explicitly requested + if request.vend_credentials == Some(true) && self.credential_vendor.is_some() { + if let Some(ref table_uri) = response.table_uri { + let identity = request.identity.as_deref(); + response.storage_options = self + .get_storage_options_for_table(table_uri, identity) + .await?; + } + } else if request.vend_credentials == Some(false) { + response.storage_options = None; + } + return Ok(response); + } Err(_) if self.dir_listing_enabled && request.id.as_ref().is_some_and(|id| id.len() == 1) => @@ -890,21 +993,52 @@ impl LanceNamespace for DirectoryNamespace { let table_name = Self::table_name_from_id(&request.id)?; let table_uri = self.table_full_uri(&table_name); - let table_path = 
self.table_path(&table_name); - let dir_exists = self - .object_store - .read_dir(table_path) - .await - .map(|entries| !entries.is_empty()) - .unwrap_or(false); + // Atomically check table existence and deregistration status + let status = self.check_table_status(&table_name).await; - if !dir_exists { + if !status.exists { return Err(Error::Namespace { source: format!("Table does not exist: {}", table_name).into(), location: snafu::location!(), }); } + if status.is_deregistered { + return Err(Error::Namespace { + source: format!("Table is deregistered: {}", table_name).into(), + location: snafu::location!(), + }); + } + + let load_detailed_metadata = request.load_detailed_metadata.unwrap_or(false); + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let identity = request.identity.as_deref(); + + // If not loading detailed metadata, return minimal response with just location + if !load_detailed_metadata { + let storage_options = if vend_credentials { + self.get_storage_options_for_table(&table_uri, identity) + .await? 
+ } else { + None + }; + return Ok(DescribeTableResponse { + table: Some(table_name), + namespace: request.id.as_ref().map(|id| { + if id.len() > 1 { + id[..id.len() - 1].to_vec() + } else { + vec![] + } + }), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + storage_options, + ..Default::default() + }); + } + // Try to load the dataset to get real information match Dataset::open(&table_uri).await { Ok(mut dataset) => { @@ -913,11 +1047,20 @@ impl LanceNamespace for DirectoryNamespace { dataset = dataset.checkout_version(requested_version as u64).await?; } - let version = dataset.version().version; + let version_info = dataset.version(); let lance_schema = dataset.schema(); let arrow_schema: arrow_schema::Schema = lance_schema.into(); let json_schema = arrow_schema_to_json(&arrow_schema)?; - let storage_options = self.get_storage_options_for_table(&table_uri).await?; + let storage_options = if vend_credentials { + self.get_storage_options_for_table(&table_uri, identity) + .await? 
+ } else { + None + }; + + // Convert BTreeMap to HashMap for the response + let metadata: std::collections::HashMap = + version_info.metadata.into_iter().collect(); Ok(DescribeTableResponse { table: Some(table_name), @@ -928,23 +1071,24 @@ impl LanceNamespace for DirectoryNamespace { vec![] } }), - version: Some(version as i64), + version: Some(version_info.version as i64), location: Some(table_uri.clone()), table_uri: Some(table_uri), schema: Some(Box::new(json_schema)), storage_options, - stats: None, + metadata: Some(metadata), + ..Default::default() }) } Err(err) => { - let reserved_file_path = self.table_reserved_file_path(&table_name); - if self - .object_store - .exists(&reserved_file_path) - .await - .unwrap_or(false) - { - let storage_options = self.get_storage_options_for_table(&table_uri).await?; + // Use the reserved file status from the atomic check + if status.has_reserved_file { + let storage_options = if vend_credentials { + self.get_storage_options_for_table(&table_uri, identity) + .await? 
+ } else { + None + }; Ok(DescribeTableResponse { table: Some(table_name), namespace: request.id.as_ref().map(|id| { @@ -954,12 +1098,10 @@ impl LanceNamespace for DirectoryNamespace { vec![] } }), - version: None, location: Some(table_uri.clone()), table_uri: Some(table_uri), - schema: None, storage_options, - stats: None, + ..Default::default() }) } else { Err(Error::Namespace { @@ -987,21 +1129,24 @@ impl LanceNamespace for DirectoryNamespace { } let table_name = Self::table_name_from_id(&request.id)?; - let table_path = self.table_path(&table_name); - let table_exists = self - .object_store - .read_dir(table_path) - .await - .map(|entries| !entries.is_empty()) - .unwrap_or(false); - if !table_exists { + // Atomically check table existence and deregistration status + let status = self.check_table_status(&table_name).await; + + if !status.exists { return Err(Error::Namespace { source: format!("Table does not exist: {}", table_name).into(), location: snafu::location!(), }); } + if status.is_deregistered { + return Err(Error::Namespace { + source: format!("Table is deregistered: {}", table_name).into(), + location: snafu::location!(), + }); + } + Ok(()) } @@ -1025,8 +1170,7 @@ impl LanceNamespace for DirectoryNamespace { Ok(DropTableResponse { id: request.id, location: Some(table_uri), - properties: None, - transaction_id: None, + ..Default::default() }) } @@ -1095,10 +1239,10 @@ impl LanceNamespace for DirectoryNamespace { })?; Ok(CreateTableResponse { - transaction_id: None, version: Some(1), location: Some(table_uri), storage_options: self.storage_options.clone(), + ..Default::default() }) } @@ -1107,7 +1251,20 @@ impl LanceNamespace for DirectoryNamespace { request: CreateEmptyTableRequest, ) -> Result { if let Some(ref manifest_ns) = self.manifest_ns { - return manifest_ns.create_empty_table(request).await; + #[allow(deprecated)] + let mut response = manifest_ns.create_empty_table(request.clone()).await?; + // Only apply identity-based credential vending when 
explicitly requested + if request.vend_credentials == Some(true) && self.credential_vendor.is_some() { + if let Some(ref location) = response.location { + let identity = request.identity.as_deref(); + response.storage_options = self + .get_storage_options_for_table(location, identity) + .await?; + } + } else if request.vend_credentials == Some(false) { + response.storage_options = None; + } + return Ok(response); } let table_name = Self::table_name_from_id(&request.id)?; @@ -1128,36 +1285,107 @@ impl LanceNamespace for DirectoryNamespace { } } - // Create the .lance-reserved file to mark the table as existing + // Atomically create the .lance-reserved file to mark the table as existing. + // This uses put_if_not_exists semantics to avoid race conditions. let reserved_file_path = self.table_reserved_file_path(&table_name); - self.object_store - .create(&reserved_file_path) + self.put_marker_file_atomic(&reserved_file_path, &format!("table {}", table_name)) .await .map_err(|e| Error::Namespace { - source: format!( - "Failed to create .lance-reserved file for table {}: {}", - table_name, e - ) - .into(), + source: e.into(), + location: snafu::location!(), + })?; + + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let identity = request.identity.as_deref(); + let storage_options = if vend_credentials { + self.get_storage_options_for_table(&table_uri, identity) + .await? 
+ } else { + None + }; + + Ok(CreateEmptyTableResponse { + location: Some(table_uri), + storage_options, + ..Default::default() + }) + } + + async fn declare_table(&self, request: DeclareTableRequest) -> Result { + if let Some(ref manifest_ns) = self.manifest_ns { + let mut response = manifest_ns.declare_table(request.clone()).await?; + // Only apply identity-based credential vending when explicitly requested + if request.vend_credentials == Some(true) && self.credential_vendor.is_some() { + if let Some(ref location) = response.location { + let identity = request.identity.as_deref(); + response.storage_options = self + .get_storage_options_for_table(location, identity) + .await?; + } + } else if request.vend_credentials == Some(false) { + response.storage_options = None; + } + return Ok(response); + } + + let table_name = Self::table_name_from_id(&request.id)?; + let table_uri = self.table_full_uri(&table_name); + + // Validate location if provided + if let Some(location) = &request.location { + let location = location.trim_end_matches('/'); + if location != table_uri { + return Err(Error::Namespace { + source: format!( + "Cannot declare table {} at location {}, must be at location {}", + table_name, location, table_uri + ) + .into(), + location: snafu::location!(), + }); + } + } + + // Check if table already has data (created via create_table). + // The atomic put only prevents races between concurrent declare_table calls, + // not between declare_table and existing data. + let status = self.check_table_status(&table_name).await; + if status.exists && !status.has_reserved_file { + // Table has data but no reserved file - it was created with data + return Err(Error::Namespace { + source: format!("Table already exists: {}", table_name).into(), location: snafu::location!(), - })? - .shutdown() + }); + } + + // Atomically create the .lance-reserved file to mark the table as declared. 
+ // This uses put_if_not_exists semantics to avoid race conditions between + // concurrent declare_table calls. + let reserved_file_path = self.table_reserved_file_path(&table_name); + + self.put_marker_file_atomic(&reserved_file_path, &format!("table {}", table_name)) .await .map_err(|e| Error::Namespace { - source: format!( - "Failed to finalize .lance-reserved file for table {}: {}", - table_name, e - ) - .into(), + source: e.into(), location: snafu::location!(), })?; - Ok(CreateEmptyTableResponse { - transaction_id: None, + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let identity = request.identity.as_deref(); + let storage_options = if vend_credentials { + self.get_storage_options_for_table(&table_uri, identity) + .await? + } else { + None + }; + + Ok(DeclareTableResponse { location: Some(table_uri), - properties: None, - storage_options: self.storage_options.clone(), + storage_options, + ..Default::default() }) } @@ -1186,10 +1414,56 @@ impl LanceNamespace for DirectoryNamespace { return LanceNamespace::deregister_table(manifest_ns.as_ref(), request).await; } - // Without manifest, deregister_table is not supported - Err(Error::NotSupported { - source: "deregister_table is only supported when manifest mode is enabled".into(), - location: snafu::location!(), + // V1 mode: create a .lance-deregistered marker file in the table directory + let table_name = Self::table_name_from_id(&request.id)?; + let table_uri = self.table_full_uri(&table_name); + + // Check table existence and deregistration status. + // This provides better error messages for common cases. 
+ let status = self.check_table_status(&table_name).await; + + if !status.exists { + return Err(Error::Namespace { + source: format!("Table does not exist: {}", table_name).into(), + location: snafu::location!(), + }); + } + + if status.is_deregistered { + return Err(Error::Namespace { + source: format!("Table is already deregistered: {}", table_name).into(), + location: snafu::location!(), + }); + } + + // Atomically create the .lance-deregistered marker file. + // This uses put_if_not_exists semantics to prevent race conditions + // when multiple processes try to deregister the same table concurrently. + // If a race occurs and another process already created the file, + // we'll get an AlreadyExists error which we convert to a proper message. + let deregistered_path = self.table_deregistered_file_path(&table_name); + self.put_marker_file_atomic( + &deregistered_path, + &format!("deregistration marker for table {}", table_name), + ) + .await + .map_err(|e| { + // Convert "already exists" to "already deregistered" for better UX + let message = if e.contains("already exists") { + format!("Table is already deregistered: {}", table_name) + } else { + e + }; + Error::Namespace { + source: message.into(), + location: snafu::location!(), + } + })?; + + Ok(lance_namespace::models::DeregisterTableResponse { + id: request.id, + location: Some(table_uri), + ..Default::default() }) } @@ -1877,6 +2151,7 @@ mod tests { } #[tokio::test] + #[allow(deprecated)] async fn test_create_empty_table() { let (namespace, temp_dir) = create_test_namespace().await; @@ -1921,6 +2196,7 @@ mod tests { } #[tokio::test] + #[allow(deprecated)] async fn test_create_empty_table_with_wrong_location() { let (namespace, _temp_dir) = create_test_namespace().await; @@ -1937,6 +2213,7 @@ mod tests { } #[tokio::test] + #[allow(deprecated)] async fn test_create_empty_table_then_drop() { let (namespace, temp_dir) = create_test_namespace().await; @@ -1985,8 +2262,7 @@ mod tests { // List child namespaces 
let list_req = ListNamespacesRequest { id: Some(vec![]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -2018,8 +2294,7 @@ mod tests { // List children of parent let list_req = ListNamespacesRequest { id: Some(vec!["parent".to_string()]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -2031,8 +2306,7 @@ mod tests { // List root should only show parent let list_req = ListNamespacesRequest { id: Some(vec![]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -2063,8 +2337,7 @@ mod tests { // List tables in child namespace let list_req = ListTablesRequest { id: Some(vec!["test_ns".to_string()]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_tables(list_req).await; assert!(result.is_ok()); @@ -2111,8 +2384,7 @@ mod tests { // List tables let list_req = ListTablesRequest { id: Some(vec!["test_ns".to_string()]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_tables(list_req).await; assert!(result.is_ok()); @@ -2156,6 +2428,7 @@ mod tests { } #[tokio::test] + #[allow(deprecated)] async fn test_empty_table_in_child_namespace() { let (namespace, _temp_dir) = create_test_namespace().await; @@ -2248,6 +2521,7 @@ mod tests { // Describe namespace and verify properties let describe_req = DescribeNamespaceRequest { id: Some(vec!["test_ns".to_string()]), + ..Default::default() }; let result = namespace.describe_namespace(describe_req).await; assert!(result.is_ok()); @@ -2326,6 +2600,7 @@ mod tests { id: Some(vec!["ns1".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = namespace.list_tables(list_req).await.unwrap(); assert_eq!(result.tables.len(), 1); @@ -2335,6 +2610,7 @@ mod 
tests { id: Some(vec!["ns2".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = namespace.list_tables(list_req).await.unwrap(); assert_eq!(result.tables.len(), 1); @@ -2669,8 +2945,8 @@ mod tests { } #[tokio::test] - async fn test_register_deregister_without_manifest_fails() { - use lance_namespace::models::{DeregisterTableRequest, RegisterTableRequest}; + async fn test_register_without_manifest_fails() { + use lance_namespace::models::RegisterTableRequest; let temp_dir = TempStdDir::default(); let temp_path = temp_dir.to_str().unwrap(); @@ -2682,7 +2958,7 @@ mod tests { .await .unwrap(); - // Try to register - should fail + // Try to register - should fail (register requires manifest) let mut register_req = RegisterTableRequest::new("test_table.lance".to_string()); register_req.id = Some(vec!["test_table".to_string()]); let result = namespace.register_table(register_req).await; @@ -2692,15 +2968,8 @@ mod tests { .to_string() .contains("manifest mode is enabled")); - // Try to deregister - should fail - let mut deregister_req = DeregisterTableRequest::new(); - deregister_req.id = Some(vec!["test_table".to_string()]); - let result = namespace.deregister_table(deregister_req).await; - assert!(result.is_err()); - assert!(result - .unwrap_err() - .to_string() - .contains("manifest mode is enabled")); + // Note: deregister_table now works in V1 mode via .lance-deregistered marker files + // See test_deregister_table_v1_mode for that test case } #[tokio::test] @@ -2876,4 +3145,372 @@ mod tests { .unwrap(); assert_eq!(a_col.values(), &[100, 200]); } + + // ============================================================ + // Tests for declare_table + // ============================================================ + + #[tokio::test] + async fn test_declare_table_v1_mode() { + use lance_namespace::models::{ + DeclareTableRequest, DescribeTableRequest, TableExistsRequest, + }; + + let temp_dir = TempStdDir::default(); + let temp_path = 
temp_dir.to_str().unwrap(); + + // Create namespace in V1 mode (no manifest) + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .build() + .await + .unwrap(); + + // Declare a table + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + let response = namespace.declare_table(declare_req).await.unwrap(); + + // Should return location + assert!(response.location.is_some()); + let location = response.location.as_ref().unwrap(); + assert!(location.ends_with("test_table.lance")); + + // Table should exist (via reserved file) + let mut exists_req = TableExistsRequest::new(); + exists_req.id = Some(vec!["test_table".to_string()]); + assert!(namespace.table_exists(exists_req).await.is_ok()); + + // Describe should work but return no version/schema (not written yet) + let mut describe_req = DescribeTableRequest::new(); + describe_req.id = Some(vec!["test_table".to_string()]); + let describe_response = namespace.describe_table(describe_req).await.unwrap(); + assert!(describe_response.location.is_some()); + assert!(describe_response.version.is_none()); // Not written yet + assert!(describe_response.schema.is_none()); // Not written yet + } + + #[tokio::test] + async fn test_declare_table_with_manifest() { + use lance_namespace::models::{DeclareTableRequest, TableExistsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // Create namespace with manifest + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(true) + .dir_listing_enabled(false) + .build() + .await + .unwrap(); + + // Declare a table + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + let response = namespace.declare_table(declare_req).await.unwrap(); + + // Should return location + assert!(response.location.is_some()); + + // Table should exist in manifest + let mut exists_req = 
TableExistsRequest::new(); + exists_req.id = Some(vec!["test_table".to_string()]); + assert!(namespace.table_exists(exists_req).await.is_ok()); + } + + #[tokio::test] + async fn test_declare_table_when_table_exists() { + use lance_namespace::models::DeclareTableRequest; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .build() + .await + .unwrap(); + + // First create a table with actual data + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Try to declare the same table - should fail because it already has data + let mut declare_req = DeclareTableRequest::new(); + declare_req.id = Some(vec!["test_table".to_string()]); + let result = namespace.declare_table(declare_req).await; + assert!(result.is_err()); + } + + // ============================================================ + // Tests for deregister_table in V1 mode + // ============================================================ + + #[tokio::test] + async fn test_deregister_table_v1_mode() { + use lance_namespace::models::{DeregisterTableRequest, TableExistsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + // Create namespace in V1 mode (no manifest, with dir listing) + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create a table with data + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, 
bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Verify table exists + let mut exists_req = TableExistsRequest::new(); + exists_req.id = Some(vec!["test_table".to_string()]); + assert!(namespace.table_exists(exists_req.clone()).await.is_ok()); + + // Deregister the table + let mut deregister_req = DeregisterTableRequest::new(); + deregister_req.id = Some(vec!["test_table".to_string()]); + let response = namespace.deregister_table(deregister_req).await.unwrap(); + + // Should return location + assert!(response.location.is_some()); + let location = response.location.as_ref().unwrap(); + assert!(location.contains("test_table")); + + // Table should no longer exist (deregistered) + let result = namespace.table_exists(exists_req).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("deregistered")); + + // Physical data should still exist + let dataset = Dataset::open(location).await; + assert!(dataset.is_ok(), "Physical table data should still exist"); + } + + #[tokio::test] + async fn test_deregister_table_v1_already_deregistered() { + use lance_namespace::models::DeregisterTableRequest; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Deregister once + let mut deregister_req = DeregisterTableRequest::new(); + deregister_req.id = Some(vec!["test_table".to_string()]); + namespace + .deregister_table(deregister_req.clone()) + .await + .unwrap(); + + // Try to deregister again - should fail + let result = 
namespace.deregister_table(deregister_req).await; + assert!(result.is_err()); + assert!(result + .unwrap_err() + .to_string() + .contains("already deregistered")); + } + + // ============================================================ + // Tests for list_tables skipping deregistered tables + // ============================================================ + + #[tokio::test] + async fn test_list_tables_skips_deregistered_v1() { + use lance_namespace::models::DeregisterTableRequest; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create two tables + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + + let mut create_req1 = CreateTableRequest::new(); + create_req1.id = Some(vec!["table1".to_string()]); + namespace + .create_table(create_req1, bytes::Bytes::from(ipc_data.clone())) + .await + .unwrap(); + + let mut create_req2 = CreateTableRequest::new(); + create_req2.id = Some(vec!["table2".to_string()]); + namespace + .create_table(create_req2, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // List tables - should see both (root namespace = empty vec) + let mut list_req = ListTablesRequest::new(); + list_req.id = Some(vec![]); + let list_response = namespace.list_tables(list_req.clone()).await.unwrap(); + assert_eq!(list_response.tables.len(), 2); + + // Deregister table1 + let mut deregister_req = DeregisterTableRequest::new(); + deregister_req.id = Some(vec!["table1".to_string()]); + namespace.deregister_table(deregister_req).await.unwrap(); + + // List tables - should only see table2 + let list_response = namespace.list_tables(list_req).await.unwrap(); + assert_eq!(list_response.tables.len(), 1); + assert!(list_response.tables.contains(&"table2".to_string())); + assert!(!list_response.tables.contains(&"table1".to_string())); + 
} + + // ============================================================ + // Tests for describe_table and table_exists with deregistered tables + // ============================================================ + + #[tokio::test] + async fn test_describe_table_fails_for_deregistered_v1() { + use lance_namespace::models::{DeregisterTableRequest, DescribeTableRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Describe should work before deregistration + let mut describe_req = DescribeTableRequest::new(); + describe_req.id = Some(vec!["test_table".to_string()]); + assert!(namespace.describe_table(describe_req.clone()).await.is_ok()); + + // Deregister + let mut deregister_req = DeregisterTableRequest::new(); + deregister_req.id = Some(vec!["test_table".to_string()]); + namespace.deregister_table(deregister_req).await.unwrap(); + + // Describe should fail after deregistration + let result = namespace.describe_table(describe_req).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("deregistered")); + } + + #[tokio::test] + async fn test_table_exists_fails_for_deregistered_v1() { + use lance_namespace::models::{DeregisterTableRequest, TableExistsRequest}; + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create a table + let schema = 
create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Table exists should work before deregistration + let mut exists_req = TableExistsRequest::new(); + exists_req.id = Some(vec!["test_table".to_string()]); + assert!(namespace.table_exists(exists_req.clone()).await.is_ok()); + + // Deregister + let mut deregister_req = DeregisterTableRequest::new(); + deregister_req.id = Some(vec!["test_table".to_string()]); + namespace.deregister_table(deregister_req).await.unwrap(); + + // Table exists should fail after deregistration + let result = namespace.table_exists(exists_req).await; + assert!(result.is_err()); + assert!(result.unwrap_err().to_string().contains("deregistered")); + } + + #[tokio::test] + async fn test_atomic_table_status_check() { + // This test verifies that the TableStatus check is atomic + // by ensuring a single directory listing is used + + let temp_dir = TempStdDir::default(); + let temp_path = temp_dir.to_str().unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(temp_path) + .manifest_enabled(false) + .dir_listing_enabled(true) + .build() + .await + .unwrap(); + + // Create a table + let schema = create_test_schema(); + let ipc_data = create_test_ipc_data(&schema); + let mut create_req = CreateTableRequest::new(); + create_req.id = Some(vec!["test_table".to_string()]); + namespace + .create_table(create_req, bytes::Bytes::from(ipc_data)) + .await + .unwrap(); + + // Table status should show exists=true, is_deregistered=false + let status = namespace.check_table_status("test_table").await; + assert!(status.exists); + assert!(!status.is_deregistered); + assert!(!status.has_reserved_file); + } } diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index 
4791bbb9df5..bfcb9602b9a 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -24,12 +24,13 @@ use lance_index::IndexType; use lance_io::object_store::{ObjectStore, ObjectStoreParams}; use lance_namespace::models::{ CreateEmptyTableRequest, CreateEmptyTableResponse, CreateNamespaceRequest, - CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeregisterTableRequest, - DeregisterTableResponse, DescribeNamespaceRequest, DescribeNamespaceResponse, - DescribeTableRequest, DescribeTableResponse, DropNamespaceRequest, DropNamespaceResponse, - DropTableRequest, DropTableResponse, ListNamespacesRequest, ListNamespacesResponse, - ListTablesRequest, ListTablesResponse, NamespaceExistsRequest, RegisterTableRequest, - RegisterTableResponse, TableExistsRequest, + CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeclareTableRequest, + DeclareTableResponse, DeregisterTableRequest, DeregisterTableResponse, + DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableRequest, + DescribeTableResponse, DropNamespaceRequest, DropNamespaceResponse, DropTableRequest, + DropTableResponse, ListNamespacesRequest, ListNamespacesResponse, ListTablesRequest, + ListTablesResponse, NamespaceExistsRequest, RegisterTableRequest, RegisterTableResponse, + TableExistsRequest, }; use lance_namespace::schema::arrow_schema_to_json; use lance_namespace::LanceNamespace; @@ -1086,11 +1087,33 @@ impl LanceNamespace for ManifestNamespace { vec![] }; + let load_detailed_metadata = request.load_detailed_metadata.unwrap_or(false); + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + match table_info { Some(info) => { // Construct full URI from relative location let table_uri = Self::construct_full_uri(&self.root, &info.location)?; + let storage_options = if vend_credentials { + 
self.storage_options.clone() + } else { + None + }; + + // If not loading detailed metadata, return minimal response with just location + if !load_detailed_metadata { + return Ok(DescribeTableResponse { + table: Some(table_name), + namespace: Some(namespace_id), + location: Some(table_uri.clone()), + table_uri: Some(table_uri), + storage_options, + ..Default::default() + }); + } + // Try to open the dataset to get version and schema match Dataset::open(&table_uri).await { Ok(mut dataset) => { @@ -1111,8 +1134,8 @@ impl LanceNamespace for ManifestNamespace { location: Some(table_uri.clone()), table_uri: Some(table_uri), schema: Some(Box::new(json_schema)), - storage_options: self.storage_options.clone(), - stats: None, + storage_options, + ..Default::default() }) } Err(_) => { @@ -1120,12 +1143,10 @@ impl LanceNamespace for ManifestNamespace { Ok(DescribeTableResponse { table: Some(table_name), namespace: Some(namespace_id), - version: None, location: Some(table_uri.clone()), table_uri: Some(table_uri), - schema: None, - storage_options: self.storage_options.clone(), - stats: None, + storage_options, + ..Default::default() }) } } @@ -1249,10 +1270,10 @@ impl LanceNamespace for ManifestNamespace { .await?; Ok(CreateTableResponse { - transaction_id: None, version: Some(1), location: Some(table_uri), storage_options: self.storage_options.clone(), + ..Default::default() }) } @@ -1296,8 +1317,7 @@ impl LanceNamespace for ManifestNamespace { Ok(DropTableResponse { id: request.id.clone(), location: Some(table_uri), - properties: None, - transaction_id: None, + ..Default::default() }) } None => Err(Error::Namespace { @@ -1369,8 +1389,10 @@ impl LanceNamespace for ManifestNamespace { // Root namespace always exists if namespace_id.is_empty() { + #[allow(clippy::needless_update)] return Ok(DescribeNamespaceResponse { properties: Some(HashMap::new()), + ..Default::default() }); } @@ -1379,8 +1401,10 @@ impl LanceNamespace for ManifestNamespace { let namespace_info = 
self.query_manifest_for_namespace(&object_id).await?; match namespace_info { + #[allow(clippy::needless_update)] Some(info) => Ok(DescribeNamespaceResponse { properties: info.metadata, + ..Default::default() }), None => Err(Error::Namespace { source: format!("Namespace '{}' not found", object_id).into(), @@ -1439,8 +1463,8 @@ impl LanceNamespace for ManifestNamespace { .await?; Ok(CreateNamespaceResponse { - transaction_id: None, properties: request.properties, + ..Default::default() }) } @@ -1502,10 +1526,7 @@ impl LanceNamespace for ManifestNamespace { self.delete_from_manifest(&object_id).await?; - Ok(DropNamespaceResponse { - properties: None, - transaction_id: None, - }) + Ok(DropNamespaceResponse::default()) } async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> { @@ -1621,11 +1642,121 @@ impl LanceNamespace for ManifestNamespace { table_uri ); + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + Ok(CreateEmptyTableResponse { - transaction_id: None, location: Some(table_uri), - properties: None, - storage_options: self.storage_options.clone(), + storage_options, + ..Default::default() + }) + } + + async fn declare_table(&self, request: DeclareTableRequest) -> Result { + let table_id = request.id.as_ref().ok_or_else(|| Error::InvalidInput { + source: "Table ID is required".into(), + location: location!(), + })?; + + if table_id.is_empty() { + return Err(Error::InvalidInput { + source: "Table ID cannot be empty".into(), + location: location!(), + }); + } + + let (namespace, table_name) = Self::split_object_id(table_id); + let object_id = Self::build_object_id(&namespace, &table_name); + + // Check if table already exists in manifest + let existing = self.query_manifest_for_table(&object_id).await?; + if existing.is_some() { + 
return Err(Error::Namespace { + source: format!("Table '{}' already exists", table_name).into(), + location: location!(), + }); + } + + // Create table location path with hash-based naming + // When dir_listing_enabled is true and it's a root table, use directory-style naming: {table_name}.lance + // Otherwise, use hash-based naming: {hash}_{object_id} + let dir_name = if namespace.is_empty() && self.dir_listing_enabled { + // Root table with directory listing enabled: use {table_name}.lance + format!("{}.lance", table_name) + } else { + // Child namespace table or dir listing disabled: use hash-based naming + Self::generate_dir_name(&object_id) + }; + let table_path = self.base_path.child(dir_name.as_str()); + let table_uri = Self::construct_full_uri(&self.root, &dir_name)?; + + // Validate location if provided + if let Some(req_location) = &request.location { + let req_location = req_location.trim_end_matches('/'); + if req_location != table_uri { + return Err(Error::Namespace { + source: format!( + "Cannot declare table {} at location {}, must be at location {}", + table_name, req_location, table_uri + ) + .into(), + location: location!(), + }); + } + } + + // Create the .lance-reserved file to mark the table as existing + let reserved_file_path = table_path.child(".lance-reserved"); + + self.object_store + .create(&reserved_file_path) + .await + .map_err(|e| Error::Namespace { + source: format!( + "Failed to create .lance-reserved file for table {}: {}", + table_name, e + ) + .into(), + location: location!(), + })? 
+ .shutdown() + .await + .map_err(|e| Error::Namespace { + source: format!( + "Failed to finalize .lance-reserved file for table {}: {}", + table_name, e + ) + .into(), + location: location!(), + })?; + + // Add entry to manifest marking this as a declared table (store dir_name, not full path) + self.insert_into_manifest(object_id, ObjectType::Table, Some(dir_name)) + .await?; + + log::info!( + "Declared table '{}' in manifest at {}", + table_name, + table_uri + ); + + // For backwards compatibility, only skip vending credentials when explicitly set to false + let vend_credentials = request.vend_credentials.unwrap_or(true); + let storage_options = if vend_credentials { + self.storage_options.clone() + } else { + None + }; + + Ok(DeclareTableResponse { + location: Some(table_uri), + storage_options, + ..Default::default() }) } @@ -1698,9 +1829,8 @@ impl LanceNamespace for ManifestNamespace { .await?; Ok(RegisterTableResponse { - transaction_id: None, location: Some(location), - properties: None, + ..Default::default() }) } @@ -1741,10 +1871,9 @@ impl LanceNamespace for ManifestNamespace { }; Ok(DeregisterTableResponse { - transaction_id: None, id: request.id.clone(), location: Some(table_uri), - properties: None, + ..Default::default() }) } } @@ -2172,6 +2301,7 @@ mod tests { // Verify namespace exists let exists_req = NamespaceExistsRequest { id: Some(vec!["ns1".to_string()]), + ..Default::default() }; let result = dir_namespace.namespace_exists(exists_req).await; assert!(result.is_ok(), "Namespace should exist"); @@ -2181,6 +2311,7 @@ mod tests { id: Some(vec![]), page_token: None, limit: None, + ..Default::default() }; let result = dir_namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -2225,6 +2356,7 @@ mod tests { // Verify nested namespace exists let exists_req = NamespaceExistsRequest { id: Some(vec!["parent".to_string(), "child".to_string()]), + ..Default::default() }; let result = dir_namespace.namespace_exists(exists_req).await; 
assert!(result.is_ok(), "Nested namespace should exist"); @@ -2234,6 +2366,7 @@ mod tests { id: Some(vec!["parent".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = dir_namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -2301,6 +2434,7 @@ mod tests { // Verify namespace no longer exists let exists_req = NamespaceExistsRequest { id: Some(vec!["ns1".to_string()]), + ..Default::default() }; let result = dir_namespace.namespace_exists(exists_req).await; assert!(result.is_err(), "Namespace should not exist after drop"); @@ -2379,6 +2513,7 @@ mod tests { id: Some(vec!["ns1".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = dir_namespace.list_tables(list_req).await; assert!(result.is_ok()); @@ -2415,6 +2550,7 @@ mod tests { // Describe the namespace let describe_req = DescribeNamespaceRequest { id: Some(vec!["ns1".to_string()]), + ..Default::default() }; let result = dir_namespace.describe_namespace(describe_req).await; assert!( diff --git a/rust/lance-namespace-impls/src/rest.rs b/rust/lance-namespace-impls/src/rest.rs index 3b5d0650659..020746487a4 100644 --- a/rust/lance-namespace-impls/src/rest.rs +++ b/rust/lance-namespace-impls/src/rest.rs @@ -19,13 +19,13 @@ use lance_namespace::models::{ CreateNamespaceRequest, CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, - DeleteFromTableRequest, DeleteFromTableResponse, DeleteTableTagRequest, DeleteTableTagResponse, - DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, - DescribeNamespaceResponse, DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, - DescribeTableRequest, DescribeTableResponse, DescribeTransactionRequest, - DescribeTransactionResponse, DropNamespaceRequest, DropNamespaceResponse, - DropTableIndexRequest, DropTableIndexResponse, 
DropTableRequest, DropTableResponse, - ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, + DeclareTableRequest, DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, + DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, + DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, + DescribeTransactionRequest, DescribeTransactionResponse, DropNamespaceRequest, + DropNamespaceResponse, DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, + DropTableResponse, ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, @@ -472,6 +472,7 @@ impl LanceNamespace for RestNamespace { request.clone(), Some(&self.delimiter), request.with_table_uri, + request.load_detailed_metadata, ) .await .map_err(convert_api_error) @@ -549,6 +550,14 @@ impl LanceNamespace for RestNamespace { .map_err(convert_api_error) } + async fn declare_table(&self, request: DeclareTableRequest) -> Result { + let id = object_id_str(&request.id, &self.delimiter)?; + + table_api::declare_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + .await + .map_err(convert_api_error) + } + async fn insert_into_table( &self, request: InsertIntoTableRequest, @@ -1029,8 +1038,7 @@ mod tests { let request = ListNamespacesRequest { id: Some(vec!["test".to_string()]), - page_token: None, - limit: None, + ..Default::default() }; let result = namespace.list_namespaces(request).await; @@ -1152,8 +1160,8 @@ mod tests { let request = ListNamespacesRequest { id: Some(vec!["test".to_string()]), - page_token: None, limit: Some(10), + ..Default::default() }; 
let result = namespace.list_namespaces(request).await; @@ -1191,8 +1199,8 @@ mod tests { let request = ListNamespacesRequest { id: Some(vec!["test".to_string()]), - page_token: None, limit: Some(10), + ..Default::default() }; let result = namespace.list_namespaces(request).await; @@ -1227,8 +1235,7 @@ mod tests { let request = CreateNamespaceRequest { id: Some(vec!["test".to_string(), "newnamespace".to_string()]), - properties: None, - mode: None, + ..Default::default() }; let result = namespace.create_namespace(request).await; @@ -1269,6 +1276,7 @@ mod tests { "table".to_string(), ]), mode: Some("Create".to_string()), + ..Default::default() }; let data = Bytes::from("arrow data here"); @@ -1306,6 +1314,7 @@ mod tests { "table".to_string(), ]), mode: Some("Append".to_string()), + ..Default::default() }; let data = Bytes::from("arrow data here"); diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index 284b0d42fa9..4a12b92838a 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -12,7 +12,7 @@ use std::sync::Arc; use axum::{ body::Bytes, extract::{Path, Query, Request, State}, - http::StatusCode, + http::{HeaderMap, StatusCode}, response::{IntoResponse, Response}, routing::{get, post}, Json, Router, ServiceExt, @@ -80,6 +80,7 @@ impl RestAdapter { // Table data operations .route("/v1/table/:id/create", post(create_table)) .route("/v1/table/:id/create-empty", post(create_empty_table)) + .route("/v1/table/:id/declare", post(declare_table)) .route("/v1/table/:id/insert", post(insert_into_table)) .route("/v1/table/:id/merge_insert", post(merge_insert_into_table)) .route("/v1/table/:id/update", post(update_table)) @@ -311,11 +312,13 @@ fn error_to_response(err: Error) -> Response { async fn create_namespace( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = 
Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.create_namespace(request).await { Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), @@ -325,6 +328,7 @@ async fn create_namespace( async fn list_namespaces( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, ) -> Response { @@ -332,6 +336,8 @@ async fn list_namespaces( id: Some(parse_id(&id, params.delimiter.as_deref())), page_token: params.page_token, limit: params.limit, + identity: extract_identity(&headers), + ..Default::default() }; match backend.list_namespaces(request).await { @@ -342,11 +348,13 @@ async fn list_namespaces( async fn describe_namespace( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.describe_namespace(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -356,11 +364,13 @@ async fn describe_namespace( async fn drop_namespace( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.drop_namespace(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -370,11 +380,13 @@ async fn drop_namespace( async fn namespace_exists( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.namespace_exists(request).await { Ok(_) => StatusCode::NO_CONTENT.into_response(), @@ -388,6 +400,7 @@ async fn namespace_exists( async fn list_tables( 
State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, ) -> Response { @@ -395,6 +408,8 @@ async fn list_tables( id: Some(parse_id(&id, params.delimiter.as_deref())), page_token: params.page_token, limit: params.limit, + identity: extract_identity(&headers), + ..Default::default() }; match backend.list_tables(request).await { @@ -405,11 +420,13 @@ async fn list_tables( async fn register_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.register_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -419,11 +436,13 @@ async fn register_table( async fn describe_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.describe_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -433,11 +452,13 @@ async fn describe_table( async fn table_exists( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.table_exists(request).await { Ok(_) => StatusCode::NO_CONTENT.into_response(), @@ -447,11 +468,14 @@ async fn table_exists( async fn drop_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, ) -> Response { let request = DropTableRequest { id: Some(parse_id(&id, params.delimiter.as_deref())), + identity: extract_identity(&headers), + ..Default::default() }; match backend.drop_table(request).await { @@ -462,11 +486,13 @@ async fn drop_table( 
async fn deregister_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.deregister_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -486,6 +512,7 @@ struct CreateTableQuery { async fn create_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, body: Bytes, @@ -493,6 +520,8 @@ async fn create_table( let request = CreateTableRequest { id: Some(parse_id(&id, params.delimiter.as_deref())), mode: params.mode.clone(), + identity: extract_identity(&headers), + ..Default::default() }; match backend.create_table(request, body).await { @@ -501,13 +530,16 @@ async fn create_table( } } +#[allow(deprecated)] async fn create_empty_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.create_empty_table(request).await { Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), @@ -515,6 +547,22 @@ async fn create_empty_table( } } +async fn declare_table( + State(backend): State>, + headers: HeaderMap, + Path(id): Path, + Query(params): Query, + Json(mut request): Json, +) -> Response { + request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); + + match backend.declare_table(request).await { + Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), + Err(e) => error_to_response(e), + } +} + #[derive(Debug, Deserialize)] struct InsertQuery { delimiter: Option, @@ -523,6 +571,7 @@ struct InsertQuery { async fn insert_into_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, body: 
Bytes, @@ -530,6 +579,8 @@ async fn insert_into_table( let request = InsertIntoTableRequest { id: Some(parse_id(&id, params.delimiter.as_deref())), mode: params.mode.clone(), + identity: extract_identity(&headers), + ..Default::default() }; match backend.insert_into_table(request, body).await { @@ -553,6 +604,7 @@ struct MergeInsertQuery { async fn merge_insert_into_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, body: Bytes, @@ -567,6 +619,8 @@ async fn merge_insert_into_table( when_not_matched_by_source_delete_filt: params.when_not_matched_by_source_delete_filt, timeout: params.timeout, use_index: params.use_index, + identity: extract_identity(&headers), + ..Default::default() }; match backend.merge_insert_into_table(request, body).await { @@ -577,11 +631,13 @@ async fn merge_insert_into_table( async fn update_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.update_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -591,11 +647,13 @@ async fn update_table( async fn delete_from_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.delete_from_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -605,11 +663,13 @@ async fn delete_from_table( async fn query_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.query_table(request).await { 
Ok(bytes) => (StatusCode::OK, bytes).into_response(), @@ -619,6 +679,7 @@ async fn query_table( async fn count_table_rows( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, ) -> Response { @@ -626,6 +687,8 @@ async fn count_table_rows( id: Some(parse_id(&id, params.delimiter.as_deref())), version: None, predicate: None, + identity: extract_identity(&headers), + ..Default::default() }; match backend.count_table_rows(request).await { @@ -640,11 +703,13 @@ async fn count_table_rows( async fn rename_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.rename_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -654,11 +719,13 @@ async fn rename_table( async fn restore_table( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.restore_table(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -668,6 +735,7 @@ async fn restore_table( async fn list_table_versions( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, ) -> Response { @@ -675,6 +743,8 @@ async fn list_table_versions( id: Some(parse_id(&id, params.delimiter.as_deref())), page_token: params.page_token, limit: params.limit, + identity: extract_identity(&headers), + ..Default::default() }; match backend.list_table_versions(request).await { @@ -685,11 +755,14 @@ async fn list_table_versions( async fn get_table_stats( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, ) -> Response { let request = GetTableStatsRequest { id: Some(parse_id(&id, 
params.delimiter.as_deref())), + identity: extract_identity(&headers), + ..Default::default() }; match backend.get_table_stats(request).await { @@ -700,12 +773,15 @@ async fn get_table_stats( async fn list_all_tables( State(backend): State>, + headers: HeaderMap, Query(params): Query, ) -> Response { let request = ListTablesRequest { id: None, page_token: params.page_token, limit: params.limit, + identity: extract_identity(&headers), + ..Default::default() }; match backend.list_all_tables(request).await { @@ -720,11 +796,13 @@ async fn list_all_tables( async fn create_table_index( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.create_table_index(request).await { Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), @@ -734,11 +812,13 @@ async fn create_table_index( async fn create_table_scalar_index( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.create_table_scalar_index(request).await { Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), @@ -748,6 +828,7 @@ async fn create_table_scalar_index( async fn list_table_indices( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, ) -> Response { @@ -756,6 +837,8 @@ async fn list_table_indices( version: None, page_token: None, limit: None, + identity: extract_identity(&headers), + ..Default::default() }; match backend.list_table_indices(request).await { @@ -772,6 +855,7 @@ struct IndexPathParams { async fn describe_table_index_stats( State(backend): State>, + headers: HeaderMap, Path(params): Path, Query(query): Query, ) -> Response { @@ -779,6 +863,8 @@ 
async fn describe_table_index_stats( id: Some(parse_id(¶ms.id, query.delimiter.as_deref())), version: None, index_name: Some(params.index_name), + identity: extract_identity(&headers), + ..Default::default() }; match backend.describe_table_index_stats(request).await { @@ -789,12 +875,15 @@ async fn describe_table_index_stats( async fn drop_table_index( State(backend): State>, + headers: HeaderMap, Path(params): Path, Query(query): Query, ) -> Response { let request = DropTableIndexRequest { id: Some(parse_id(¶ms.id, query.delimiter.as_deref())), index_name: Some(params.index_name), + identity: extract_identity(&headers), + ..Default::default() }; match backend.drop_table_index(request).await { @@ -809,11 +898,13 @@ async fn drop_table_index( async fn alter_table_add_columns( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.alter_table_add_columns(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -823,11 +914,13 @@ async fn alter_table_add_columns( async fn alter_table_alter_columns( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.alter_table_alter_columns(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -837,11 +930,13 @@ async fn alter_table_alter_columns( async fn alter_table_drop_columns( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.alter_table_drop_columns(request).await { Ok(response) => 
(StatusCode::OK, Json(response)).into_response(), @@ -851,11 +946,13 @@ async fn alter_table_drop_columns( async fn update_table_schema_metadata( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.update_table_schema_metadata(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -869,6 +966,7 @@ async fn update_table_schema_metadata( async fn list_table_tags( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, ) -> Response { @@ -876,6 +974,8 @@ async fn list_table_tags( id: Some(parse_id(&id, params.delimiter.as_deref())), page_token: params.page_token, limit: params.limit, + identity: extract_identity(&headers), + ..Default::default() }; match backend.list_table_tags(request).await { @@ -886,11 +986,13 @@ async fn list_table_tags( async fn get_table_tag_version( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.get_table_tag_version(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -900,11 +1002,13 @@ async fn get_table_tag_version( async fn create_table_tag( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.create_table_tag(request).await { Ok(response) => (StatusCode::CREATED, Json(response)).into_response(), @@ -914,11 +1018,13 @@ async fn create_table_tag( async fn delete_table_tag( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, 
Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.delete_table_tag(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -928,11 +1034,13 @@ async fn delete_table_tag( async fn update_table_tag( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.update_table_tag(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -946,11 +1054,13 @@ async fn update_table_tag( async fn explain_table_query_plan( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.explain_table_query_plan(request).await { Ok(plan) => (StatusCode::OK, plan).into_response(), @@ -960,11 +1070,13 @@ async fn explain_table_query_plan( async fn analyze_table_query_plan( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(params): Query, Json(mut request): Json, ) -> Response { request.id = Some(parse_id(&id, params.delimiter.as_deref())); + request.identity = extract_identity(&headers); match backend.analyze_table_query_plan(request).await { Ok(plan) => (StatusCode::OK, plan).into_response(), @@ -978,6 +1090,7 @@ async fn analyze_table_query_plan( async fn describe_transaction( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(_params): Query, Json(mut request): Json, @@ -991,6 +1104,7 @@ async fn describe_transaction( } else { request.id = Some(vec![id]); } + request.identity = extract_identity(&headers); match backend.describe_transaction(request).await { Ok(response) => (StatusCode::OK, 
Json(response)).into_response(), @@ -1000,6 +1114,7 @@ async fn describe_transaction( async fn alter_transaction( State(backend): State>, + headers: HeaderMap, Path(id): Path, Query(_params): Query, Json(mut request): Json, @@ -1011,6 +1126,7 @@ async fn alter_transaction( } else { request.id = Some(vec![id]); } + request.identity = extract_identity(&headers); match backend.alter_transaction(request).await { Ok(response) => (StatusCode::OK, Json(response)).into_response(), @@ -1038,6 +1154,36 @@ fn parse_id(id_str: &str, delimiter: Option<&str>) -> Vec { .collect() } +/// Extract identity information from HTTP headers +/// +/// Extracts `x-api-key` and `Authorization` (Bearer token) headers and returns +/// an Identity object if either is present. +fn extract_identity(headers: &HeaderMap) -> Option> { + let api_key = headers + .get("x-api-key") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); + + let auth_token = headers + .get("authorization") + .and_then(|v| v.to_str().ok()) + .and_then(|s| { + // Extract token from "Bearer " format + s.strip_prefix("Bearer ") + .or_else(|| s.strip_prefix("bearer ")) + .map(|t| t.to_string()) + }); + + if api_key.is_some() || auth_token.is_some() { + Some(Box::new(Identity { + api_key, + auth_token, + })) + } else { + None + } +} + #[cfg(test)] mod tests { use super::*; @@ -1181,6 +1327,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1235,6 +1382,7 @@ mod tests { id: Some(vec![format!("namespace{}", i)]), properties: None, mode: None, + ..Default::default() }; let result = fixture.namespace.create_namespace(create_req).await; assert!(result.is_ok(), "Failed to create namespace{}", i); @@ -1245,6 +1393,7 @@ mod tests { id: Some(vec![]), page_token: None, limit: None, + ..Default::default() }; let result = fixture.namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -1264,6 +1413,7 @@ mod tests { id: 
Some(vec!["parent".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1276,6 +1426,7 @@ mod tests { id: Some(vec!["parent".to_string(), "child1".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1287,6 +1438,7 @@ mod tests { id: Some(vec!["parent".to_string(), "child2".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1299,6 +1451,7 @@ mod tests { id: Some(vec!["parent".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = fixture.namespace.list_namespaces(list_req).await; assert!(result.is_ok()); @@ -1318,6 +1471,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1329,6 +1483,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), mode: Some("Create".to_string()), + ..Default::default() }; let result = fixture @@ -1369,6 +1524,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1381,6 +1537,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), format!("table{}", i)]), mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1394,6 +1551,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), page_token: None, limit: None, + ..Default::default() }; let result = fixture.namespace.list_tables(list_req).await; assert!(result.is_ok()); @@ -1414,6 +1572,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1425,6 +1584,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), mode: Some("Create".to_string()), + 
..Default::default() }; fixture .namespace @@ -1440,6 +1600,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[allow(deprecated)] async fn test_empty_table_exists_in_child_namespace() { let fixture = RestServerFixture::new().await; @@ -1448,6 +1609,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1484,6 +1646,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1495,6 +1658,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1572,6 +1736,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1583,6 +1748,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1593,6 +1759,7 @@ mod tests { // Drop the table let drop_req = DropTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), + ..Default::default() }; let result = fixture.namespace.drop_table(drop_req).await; assert!( @@ -1611,6 +1778,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[allow(deprecated)] async fn test_create_empty_table_in_child_namespace() { let fixture = RestServerFixture::new().await; @@ -1619,6 +1787,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1665,6 +1834,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[allow(deprecated)] async fn test_describe_empty_table_in_child_namespace() { let 
fixture = RestServerFixture::new().await; @@ -1673,6 +1843,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1720,6 +1891,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[allow(deprecated)] async fn test_drop_empty_table_in_child_namespace() { let fixture = RestServerFixture::new().await; @@ -1728,6 +1900,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1747,6 +1920,7 @@ mod tests { // Drop the empty table let drop_req = DropTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), + ..Default::default() }; let result = fixture.namespace.drop_table(drop_req).await; assert!( @@ -1765,6 +1939,7 @@ mod tests { } #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + #[allow(deprecated)] async fn test_deeply_nested_namespace_with_empty_table() { let fixture = RestServerFixture::new().await; @@ -1773,6 +1948,7 @@ mod tests { id: Some(vec!["level1".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1784,6 +1960,7 @@ mod tests { id: Some(vec!["level1".to_string(), "level2".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1799,6 +1976,7 @@ mod tests { ]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1847,6 +2025,7 @@ mod tests { id: Some(vec!["level1".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1858,6 +2037,7 @@ mod tests { id: Some(vec!["level1".to_string(), "level2".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1873,6 +2053,7 @@ mod tests { ]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1889,6 +2070,7 @@ mod tests { "deep_table".to_string(), ]), mode: 
Some("Create".to_string()), + ..Default::default() }; let result = fixture @@ -1926,6 +2108,7 @@ mod tests { id: Some(vec!["namespace1".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1937,6 +2120,7 @@ mod tests { id: Some(vec!["namespace2".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -1948,6 +2132,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["namespace1".to_string(), "shared_table".to_string()]), mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1958,6 +2143,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["namespace2".to_string(), "shared_table".to_string()]), mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -1968,6 +2154,7 @@ mod tests { // Drop table in namespace1 let drop_req = DropTableRequest { id: Some(vec!["namespace1".to_string(), "shared_table".to_string()]), + ..Default::default() }; fixture.namespace.drop_table(drop_req).await.unwrap(); @@ -1997,6 +2184,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -2008,6 +2196,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -2040,6 +2229,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -2059,6 +2249,7 @@ mod tests { // Verify namespace no longer exists let exists_req = NamespaceExistsRequest { id: Some(vec!["test_namespace".to_string()]), + ..Default::default() }; let result = fixture.namespace.namespace_exists(exists_req).await; assert!(result.is_err(), "Namespace should not exist after drop"); @@ -2079,6 +2270,7 @@ mod tests { id: 
Some(vec!["test_namespace".to_string()]), properties: Some(properties.clone()), mode: None, + ..Default::default() }; fixture .namespace @@ -2089,6 +2281,7 @@ mod tests { // Describe namespace and verify properties let describe_req = DescribeNamespaceRequest { id: Some(vec!["test_namespace".to_string()]), + ..Default::default() }; let result = fixture.namespace.describe_namespace(describe_req).await; assert!(result.is_ok()); @@ -2104,7 +2297,10 @@ mod tests { let fixture = RestServerFixture::new().await; // Root namespace should always exist - let exists_req = NamespaceExistsRequest { id: Some(vec![]) }; + let exists_req = NamespaceExistsRequest { + id: Some(vec![]), + ..Default::default() + }; let result = fixture.namespace.namespace_exists(exists_req).await; assert!(result.is_ok(), "Root namespace should exist"); @@ -2113,6 +2309,7 @@ mod tests { id: Some(vec![]), properties: None, mode: None, + ..Default::default() }; let result = fixture.namespace.create_namespace(create_req).await; assert!(result.is_err(), "Cannot create root namespace"); @@ -2146,6 +2343,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -2160,6 +2358,7 @@ mod tests { "physical_table".to_string(), ]), mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -2176,6 +2375,7 @@ mod tests { location: "test_namespace$physical_table.lance".to_string(), mode: None, properties: None, + ..Default::default() }; let result = fixture.namespace.register_table(register_req).await; @@ -2210,6 +2410,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -2223,6 +2424,7 @@ mod tests { location: "s3://bucket/table.lance".to_string(), mode: None, properties: None, + ..Default::default() }; let result = fixture.namespace.register_table(register_req).await; @@ -2244,6 +2446,7 @@ mod tests { id: 
Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -2257,6 +2460,7 @@ mod tests { location: "../outside/table.lance".to_string(), mode: None, properties: None, + ..Default::default() }; let result = fixture.namespace.register_table(register_req).await; @@ -2279,6 +2483,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -2290,6 +2495,7 @@ mod tests { let create_table_req = CreateTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), mode: Some("Create".to_string()), + ..Default::default() }; fixture .namespace @@ -2309,6 +2515,7 @@ mod tests { // Deregister the table let deregister_req = DeregisterTableRequest { id: Some(vec!["test_namespace".to_string(), "test_table".to_string()]), + ..Default::default() }; let result = fixture.namespace.deregister_table(deregister_req).await; assert!( @@ -2354,6 +2561,7 @@ mod tests { id: Some(vec!["test_namespace".to_string()]), properties: None, mode: None, + ..Default::default() }; fixture .namespace @@ -2368,6 +2576,7 @@ mod tests { "original_table".to_string(), ]), mode: Some("Create".to_string()), + ..Default::default() }; let create_response = fixture .namespace @@ -2381,6 +2590,7 @@ mod tests { "test_namespace".to_string(), "original_table".to_string(), ]), + ..Default::default() }; fixture .namespace @@ -2410,6 +2620,7 @@ mod tests { location: relative_location.clone(), mode: None, properties: None, + ..Default::default() }; let register_response = fixture diff --git a/rust/lance-namespace/src/error.rs b/rust/lance-namespace/src/error.rs new file mode 100644 index 00000000000..71fb7c12c31 --- /dev/null +++ b/rust/lance-namespace/src/error.rs @@ -0,0 +1,404 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Lance Namespace error types. +//! +//! 
This module defines fine-grained error types for Lance Namespace operations. +//! Each error type has a unique numeric code that is consistent across all +//! Lance Namespace implementations (Python, Java, Rust, REST). +//! +//! # Error Handling +//! +//! Namespace operations return [`NamespaceError`] which can be converted to +//! [`lance_core::Error`] for integration with the Lance ecosystem. +//! +//! ```rust,ignore +//! use lance_namespace::{NamespaceError, ErrorCode}; +//! +//! // Create and use namespace errors +//! let err = NamespaceError::TableNotFound { +//! message: "Table 'users' not found".into(), +//! }; +//! assert_eq!(err.code(), ErrorCode::TableNotFound); +//! +//! // Convert to lance_core::Error +//! let lance_err: lance_core::Error = err.into(); +//! ``` + +use lance_core::error::ToSnafuLocation; +use snafu::Snafu; + +/// Lance Namespace error codes. +/// +/// These codes are globally unique across all Lance Namespace implementations +/// (Python, Java, Rust, REST). Use these codes for programmatic error handling. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[repr(u32)] +pub enum ErrorCode { + /// Operation not supported by this backend + Unsupported = 0, + /// The specified namespace does not exist + NamespaceNotFound = 1, + /// A namespace with this name already exists + NamespaceAlreadyExists = 2, + /// Namespace contains tables or child namespaces + NamespaceNotEmpty = 3, + /// The specified table does not exist + TableNotFound = 4, + /// A table with this name already exists + TableAlreadyExists = 5, + /// The specified table index does not exist + TableIndexNotFound = 6, + /// A table index with this name already exists + TableIndexAlreadyExists = 7, + /// The specified table tag does not exist + TableTagNotFound = 8, + /// A table tag with this name already exists + TableTagAlreadyExists = 9, + /// The specified transaction does not exist + TransactionNotFound = 10, + /// The specified table version does not exist + TableVersionNotFound = 11, + /// The specified table column does not exist + TableColumnNotFound = 12, + /// Malformed request or invalid parameters + InvalidInput = 13, + /// Optimistic concurrency conflict + ConcurrentModification = 14, + /// User lacks permission for this operation + PermissionDenied = 15, + /// Authentication credentials are missing or invalid + Unauthenticated = 16, + /// Service is temporarily unavailable + ServiceUnavailable = 17, + /// Unexpected server/implementation error + Internal = 18, + /// Table is in an invalid state for the operation + InvalidTableState = 19, + /// Table schema validation failed + TableSchemaValidationError = 20, +} + +impl ErrorCode { + /// Returns the numeric code value. + pub fn as_u32(self) -> u32 { + self as u32 + } + + /// Creates an ErrorCode from a numeric code. + /// + /// Returns `None` if the code is not recognized. 
+ pub fn from_u32(code: u32) -> Option { + match code { + 0 => Some(Self::Unsupported), + 1 => Some(Self::NamespaceNotFound), + 2 => Some(Self::NamespaceAlreadyExists), + 3 => Some(Self::NamespaceNotEmpty), + 4 => Some(Self::TableNotFound), + 5 => Some(Self::TableAlreadyExists), + 6 => Some(Self::TableIndexNotFound), + 7 => Some(Self::TableIndexAlreadyExists), + 8 => Some(Self::TableTagNotFound), + 9 => Some(Self::TableTagAlreadyExists), + 10 => Some(Self::TransactionNotFound), + 11 => Some(Self::TableVersionNotFound), + 12 => Some(Self::TableColumnNotFound), + 13 => Some(Self::InvalidInput), + 14 => Some(Self::ConcurrentModification), + 15 => Some(Self::PermissionDenied), + 16 => Some(Self::Unauthenticated), + 17 => Some(Self::ServiceUnavailable), + 18 => Some(Self::Internal), + 19 => Some(Self::InvalidTableState), + 20 => Some(Self::TableSchemaValidationError), + _ => None, + } + } +} + +impl std::fmt::Display for ErrorCode { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let name = match self { + Self::Unsupported => "Unsupported", + Self::NamespaceNotFound => "NamespaceNotFound", + Self::NamespaceAlreadyExists => "NamespaceAlreadyExists", + Self::NamespaceNotEmpty => "NamespaceNotEmpty", + Self::TableNotFound => "TableNotFound", + Self::TableAlreadyExists => "TableAlreadyExists", + Self::TableIndexNotFound => "TableIndexNotFound", + Self::TableIndexAlreadyExists => "TableIndexAlreadyExists", + Self::TableTagNotFound => "TableTagNotFound", + Self::TableTagAlreadyExists => "TableTagAlreadyExists", + Self::TransactionNotFound => "TransactionNotFound", + Self::TableVersionNotFound => "TableVersionNotFound", + Self::TableColumnNotFound => "TableColumnNotFound", + Self::InvalidInput => "InvalidInput", + Self::ConcurrentModification => "ConcurrentModification", + Self::PermissionDenied => "PermissionDenied", + Self::Unauthenticated => "Unauthenticated", + Self::ServiceUnavailable => "ServiceUnavailable", + Self::Internal => "Internal", + 
Self::InvalidTableState => "InvalidTableState", + Self::TableSchemaValidationError => "TableSchemaValidationError", + }; + write!(f, "{}", name) + } +} + +/// Lance Namespace error type. +/// +/// This enum provides fine-grained error types for Lance Namespace operations. +/// Each variant corresponds to a specific error condition and has an associated +/// [`ErrorCode`] accessible via the [`code()`](NamespaceError::code) method. +/// +/// # Converting to lance_core::Error +/// +/// `NamespaceError` implements `Into`, preserving the original +/// error so it can be downcast later: +/// +/// ```rust,ignore +/// let ns_err = NamespaceError::TableNotFound { message: "...".into() }; +/// let lance_err: lance_core::Error = ns_err.into(); +/// +/// // Later, extract the original error: +/// if let lance_core::Error::Namespace { source, .. } = &lance_err { +/// if let Some(ns_err) = source.downcast_ref::() { +/// println!("Error code: {:?}", ns_err.code()); +/// } +/// } +/// ``` +#[derive(Debug, Snafu)] +#[snafu(visibility(pub))] +pub enum NamespaceError { + /// Operation not supported by this backend. + #[snafu(display("Unsupported: {message}"))] + Unsupported { message: String }, + + /// The specified namespace does not exist. + #[snafu(display("Namespace not found: {message}"))] + NamespaceNotFound { message: String }, + + /// A namespace with this name already exists. + #[snafu(display("Namespace already exists: {message}"))] + NamespaceAlreadyExists { message: String }, + + /// Namespace contains tables or child namespaces. + #[snafu(display("Namespace not empty: {message}"))] + NamespaceNotEmpty { message: String }, + + /// The specified table does not exist. + #[snafu(display("Table not found: {message}"))] + TableNotFound { message: String }, + + /// A table with this name already exists. + #[snafu(display("Table already exists: {message}"))] + TableAlreadyExists { message: String }, + + /// The specified table index does not exist. 
+ #[snafu(display("Table index not found: {message}"))] + TableIndexNotFound { message: String }, + + /// A table index with this name already exists. + #[snafu(display("Table index already exists: {message}"))] + TableIndexAlreadyExists { message: String }, + + /// The specified table tag does not exist. + #[snafu(display("Table tag not found: {message}"))] + TableTagNotFound { message: String }, + + /// A table tag with this name already exists. + #[snafu(display("Table tag already exists: {message}"))] + TableTagAlreadyExists { message: String }, + + /// The specified transaction does not exist. + #[snafu(display("Transaction not found: {message}"))] + TransactionNotFound { message: String }, + + /// The specified table version does not exist. + #[snafu(display("Table version not found: {message}"))] + TableVersionNotFound { message: String }, + + /// The specified table column does not exist. + #[snafu(display("Table column not found: {message}"))] + TableColumnNotFound { message: String }, + + /// Malformed request or invalid parameters. + #[snafu(display("Invalid input: {message}"))] + InvalidInput { message: String }, + + /// Optimistic concurrency conflict. + #[snafu(display("Concurrent modification: {message}"))] + ConcurrentModification { message: String }, + + /// User lacks permission for this operation. + #[snafu(display("Permission denied: {message}"))] + PermissionDenied { message: String }, + + /// Authentication credentials are missing or invalid. + #[snafu(display("Unauthenticated: {message}"))] + Unauthenticated { message: String }, + + /// Service is temporarily unavailable. + #[snafu(display("Service unavailable: {message}"))] + ServiceUnavailable { message: String }, + + /// Unexpected internal error. + #[snafu(display("Internal error: {message}"))] + Internal { message: String }, + + /// Table is in an invalid state for the operation. 
+ #[snafu(display("Invalid table state: {message}"))] + InvalidTableState { message: String }, + + /// Table schema validation failed. + #[snafu(display("Table schema validation error: {message}"))] + TableSchemaValidationError { message: String }, +} + +impl NamespaceError { + /// Returns the error code for this error. + /// + /// Use this for programmatic error handling across language boundaries. + pub fn code(&self) -> ErrorCode { + match self { + Self::Unsupported { .. } => ErrorCode::Unsupported, + Self::NamespaceNotFound { .. } => ErrorCode::NamespaceNotFound, + Self::NamespaceAlreadyExists { .. } => ErrorCode::NamespaceAlreadyExists, + Self::NamespaceNotEmpty { .. } => ErrorCode::NamespaceNotEmpty, + Self::TableNotFound { .. } => ErrorCode::TableNotFound, + Self::TableAlreadyExists { .. } => ErrorCode::TableAlreadyExists, + Self::TableIndexNotFound { .. } => ErrorCode::TableIndexNotFound, + Self::TableIndexAlreadyExists { .. } => ErrorCode::TableIndexAlreadyExists, + Self::TableTagNotFound { .. } => ErrorCode::TableTagNotFound, + Self::TableTagAlreadyExists { .. } => ErrorCode::TableTagAlreadyExists, + Self::TransactionNotFound { .. } => ErrorCode::TransactionNotFound, + Self::TableVersionNotFound { .. } => ErrorCode::TableVersionNotFound, + Self::TableColumnNotFound { .. } => ErrorCode::TableColumnNotFound, + Self::InvalidInput { .. } => ErrorCode::InvalidInput, + Self::ConcurrentModification { .. } => ErrorCode::ConcurrentModification, + Self::PermissionDenied { .. } => ErrorCode::PermissionDenied, + Self::Unauthenticated { .. } => ErrorCode::Unauthenticated, + Self::ServiceUnavailable { .. } => ErrorCode::ServiceUnavailable, + Self::Internal { .. } => ErrorCode::Internal, + Self::InvalidTableState { .. } => ErrorCode::InvalidTableState, + Self::TableSchemaValidationError { .. } => ErrorCode::TableSchemaValidationError, + } + } + + /// Creates a NamespaceError from an error code and message. 
+ /// + /// This is useful when receiving errors from REST API or other language bindings. + pub fn from_code(code: u32, message: impl Into) -> Self { + let message = message.into(); + match ErrorCode::from_u32(code) { + Some(ErrorCode::Unsupported) => Self::Unsupported { message }, + Some(ErrorCode::NamespaceNotFound) => Self::NamespaceNotFound { message }, + Some(ErrorCode::NamespaceAlreadyExists) => Self::NamespaceAlreadyExists { message }, + Some(ErrorCode::NamespaceNotEmpty) => Self::NamespaceNotEmpty { message }, + Some(ErrorCode::TableNotFound) => Self::TableNotFound { message }, + Some(ErrorCode::TableAlreadyExists) => Self::TableAlreadyExists { message }, + Some(ErrorCode::TableIndexNotFound) => Self::TableIndexNotFound { message }, + Some(ErrorCode::TableIndexAlreadyExists) => Self::TableIndexAlreadyExists { message }, + Some(ErrorCode::TableTagNotFound) => Self::TableTagNotFound { message }, + Some(ErrorCode::TableTagAlreadyExists) => Self::TableTagAlreadyExists { message }, + Some(ErrorCode::TransactionNotFound) => Self::TransactionNotFound { message }, + Some(ErrorCode::TableVersionNotFound) => Self::TableVersionNotFound { message }, + Some(ErrorCode::TableColumnNotFound) => Self::TableColumnNotFound { message }, + Some(ErrorCode::InvalidInput) => Self::InvalidInput { message }, + Some(ErrorCode::ConcurrentModification) => Self::ConcurrentModification { message }, + Some(ErrorCode::PermissionDenied) => Self::PermissionDenied { message }, + Some(ErrorCode::Unauthenticated) => Self::Unauthenticated { message }, + Some(ErrorCode::ServiceUnavailable) => Self::ServiceUnavailable { message }, + Some(ErrorCode::Internal) => Self::Internal { message }, + Some(ErrorCode::InvalidTableState) => Self::InvalidTableState { message }, + Some(ErrorCode::TableSchemaValidationError) => { + Self::TableSchemaValidationError { message } + } + None => Self::Internal { message }, + } + } +} + +/// Converts a NamespaceError into a lance_core::Error. 
+/// +/// The original `NamespaceError` is preserved in the `source` field and can be +/// extracted via downcasting for programmatic error handling. +impl From for lance_core::Error { + #[track_caller] + fn from(err: NamespaceError) -> Self { + Self::Namespace { + source: Box::new(err), + location: std::panic::Location::caller().to_snafu_location(), + } + } +} + +/// Result type for namespace operations. +pub type Result = std::result::Result; + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_error_code_roundtrip() { + for code in 0..=20 { + let error_code = ErrorCode::from_u32(code).unwrap(); + assert_eq!(error_code.as_u32(), code); + } + } + + #[test] + fn test_unknown_error_code() { + assert!(ErrorCode::from_u32(999).is_none()); + } + + #[test] + fn test_namespace_error_code() { + let err = NamespaceError::TableNotFound { + message: "test table".to_string(), + }; + assert_eq!(err.code(), ErrorCode::TableNotFound); + assert_eq!(err.code().as_u32(), 4); + } + + #[test] + fn test_from_code() { + let err = NamespaceError::from_code(4, "table not found"); + assert_eq!(err.code(), ErrorCode::TableNotFound); + assert!(err.to_string().contains("table not found")); + } + + #[test] + fn test_from_unknown_code() { + let err = NamespaceError::from_code(999, "unknown error"); + assert_eq!(err.code(), ErrorCode::Internal); + } + + #[test] + fn test_convert_to_lance_error() { + let ns_err = NamespaceError::TableNotFound { + message: "users".to_string(), + }; + let lance_err: lance_core::Error = ns_err.into(); + + // Verify it's a Namespace error + match &lance_err { + lance_core::Error::Namespace { source, .. 
} => { + // Downcast to get the original error + let downcast = source.downcast_ref::(); + assert!(downcast.is_some()); + assert_eq!(downcast.unwrap().code(), ErrorCode::TableNotFound); + } + _ => panic!("Expected Namespace error"), + } + } + + #[test] + fn test_error_display() { + let err = NamespaceError::TableNotFound { + message: "users".to_string(), + }; + assert_eq!(err.to_string(), "Table not found: users"); + } +} diff --git a/rust/lance-namespace/src/lib.rs b/rust/lance-namespace/src/lib.rs index 51bd18a2fb5..6fd9a9b7ab2 100644 --- a/rust/lance-namespace/src/lib.rs +++ b/rust/lance-namespace/src/lib.rs @@ -5,7 +5,17 @@ //! //! A Rust client for the Lance Namespace API that provides a unified interface //! for managing namespaces and tables across different backend implementations. +//! +//! # Error Handling +//! +//! This crate provides fine-grained error types through the [`error`] module. +//! Each error type has a unique numeric code that is consistent across all +//! Lance Namespace implementations (Python, Java, Rust, REST). +//! +//! See [`error::ErrorCode`] for the list of error codes and +//! [`error::NamespaceError`] for the error types. 
+pub mod error; pub mod namespace; pub mod schema; @@ -13,6 +23,9 @@ pub mod schema; pub use lance_core::{Error, Result}; pub use namespace::LanceNamespace; +// Re-export error types +pub use error::{ErrorCode, NamespaceError, Result as NamespaceResult}; + // Re-export reqwest client for convenience pub use lance_namespace_reqwest_client as reqwest_client; diff --git a/rust/lance-namespace/src/namespace.rs b/rust/lance-namespace/src/namespace.rs index 60c206530f4..3e27df15ba7 100644 --- a/rust/lance-namespace/src/namespace.rs +++ b/rust/lance-namespace/src/namespace.rs @@ -16,13 +16,13 @@ use lance_namespace_reqwest_client::models::{ CreateNamespaceRequest, CreateNamespaceResponse, CreateTableIndexRequest, CreateTableIndexResponse, CreateTableRequest, CreateTableResponse, CreateTableScalarIndexResponse, CreateTableTagRequest, CreateTableTagResponse, - DeleteFromTableRequest, DeleteFromTableResponse, DeleteTableTagRequest, DeleteTableTagResponse, - DeregisterTableRequest, DeregisterTableResponse, DescribeNamespaceRequest, - DescribeNamespaceResponse, DescribeTableIndexStatsRequest, DescribeTableIndexStatsResponse, - DescribeTableRequest, DescribeTableResponse, DescribeTransactionRequest, - DescribeTransactionResponse, DropNamespaceRequest, DropNamespaceResponse, - DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, DropTableResponse, - ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, + DeclareTableRequest, DeclareTableResponse, DeleteFromTableRequest, DeleteFromTableResponse, + DeleteTableTagRequest, DeleteTableTagResponse, DeregisterTableRequest, DeregisterTableResponse, + DescribeNamespaceRequest, DescribeNamespaceResponse, DescribeTableIndexStatsRequest, + DescribeTableIndexStatsResponse, DescribeTableRequest, DescribeTableResponse, + DescribeTransactionRequest, DescribeTransactionResponse, DropNamespaceRequest, + DropNamespaceResponse, DropTableIndexRequest, DropTableIndexResponse, DropTableRequest, + DropTableResponse, 
ExplainTableQueryPlanRequest, GetTableStatsRequest, GetTableStatsResponse, GetTableTagVersionRequest, GetTableTagVersionResponse, InsertIntoTableRequest, InsertIntoTableResponse, ListNamespacesRequest, ListNamespacesResponse, ListTableIndicesRequest, ListTableIndicesResponse, ListTableTagsRequest, ListTableTagsResponse, @@ -39,9 +39,26 @@ use lance_namespace_reqwest_client::models::{ /// This trait defines the interface that all Lance namespace implementations /// must provide. Each method corresponds to a specific operation on namespaces /// or tables. +/// +/// # Error Handling +/// +/// All operations may return the following common errors (via [`crate::NamespaceError`]): +/// +/// - [`crate::ErrorCode::Unsupported`] - Operation not supported by this backend +/// - [`crate::ErrorCode::InvalidInput`] - Invalid request parameters +/// - [`crate::ErrorCode::PermissionDenied`] - Insufficient permissions +/// - [`crate::ErrorCode::Unauthenticated`] - Invalid credentials +/// - [`crate::ErrorCode::ServiceUnavailable`] - Service temporarily unavailable +/// - [`crate::ErrorCode::Internal`] - Unexpected internal error +/// +/// See individual method documentation for operation-specific errors. #[async_trait] pub trait LanceNamespace: Send + Sync + std::fmt::Debug { /// List namespaces. + /// + /// # Errors + /// + /// Returns [`crate::ErrorCode::NamespaceNotFound`] if the parent namespace does not exist. async fn list_namespaces( &self, _request: ListNamespacesRequest, @@ -53,6 +70,10 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { } /// Describe a namespace. + /// + /// # Errors + /// + /// Returns [`crate::ErrorCode::NamespaceNotFound`] if the namespace does not exist. async fn describe_namespace( &self, _request: DescribeNamespaceRequest, @@ -64,6 +85,10 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { } /// Create a new namespace. 
+ /// + /// # Errors + /// + /// Returns [`crate::ErrorCode::NamespaceAlreadyExists`] if a namespace with the same name already exists. async fn create_namespace( &self, _request: CreateNamespaceRequest, @@ -75,6 +100,11 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { } /// Drop a namespace. + /// + /// # Errors + /// + /// - [`crate::ErrorCode::NamespaceNotFound`] if the namespace does not exist. + /// - [`crate::ErrorCode::NamespaceNotEmpty`] if the namespace contains tables or child namespaces. async fn drop_namespace( &self, _request: DropNamespaceRequest, @@ -86,6 +116,10 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { } /// Check if a namespace exists. + /// + /// # Errors + /// + /// Returns [`crate::ErrorCode::NamespaceNotFound`] if the namespace does not exist. async fn namespace_exists(&self, _request: NamespaceExistsRequest) -> Result<()> { Err(Error::NotSupported { source: "namespace_exists not implemented".into(), @@ -170,7 +204,23 @@ pub trait LanceNamespace: Send + Sync + std::fmt::Debug { }) } + /// Declare a table (metadata only operation). + async fn declare_table(&self, _request: DeclareTableRequest) -> Result { + Err(Error::NotSupported { + source: "declare_table not implemented".into(), + location: Location::new(file!(), line!(), column!()), + }) + } + /// Create an empty table (metadata only operation). + /// + /// # Deprecated + /// + /// Use [`declare_table`](Self::declare_table) instead. Support will be removed in 3.0.0. + #[deprecated( + since = "2.0.0", + note = "Use declare_table instead. Support will be removed in 3.0.0." 
+ )] async fn create_empty_table( &self, _request: CreateEmptyTableRequest, diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 1079a72d600..7249594783d 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -110,7 +110,9 @@ pub use blob::BlobFile; use hash_joiner::HashJoiner; use lance_core::box_error; pub use lance_core::ROW_ID; -use lance_namespace::models::{CreateEmptyTableRequest, DescribeTableRequest}; +use lance_namespace::models::{ + CreateEmptyTableRequest, DeclareTableRequest, DeclareTableResponse, DescribeTableRequest, +}; use lance_table::feature_flags::{apply_feature_flags, can_read_dataset}; pub use schema_evolution::{ BatchInfo, BatchUDF, ColumnAlteration, NewColumnTransform, UDFCheckpointStore, @@ -825,23 +827,45 @@ impl Dataset { match write_params.mode { WriteMode::Create => { - let request = CreateEmptyTableRequest { + let declare_request = DeclareTableRequest { id: Some(table_id.clone()), - location: None, - properties: None, + ..Default::default() }; - let response = - namespace - .create_empty_table(request) - .await - .map_err(|e| Error::Namespace { + // Try declare_table first, fall back to deprecated create_empty_table + // for backward compatibility with older namespace implementations. + // create_empty_table support will be removed in 3.0.0. + #[allow(deprecated)] + let response = match namespace.declare_table(declare_request).await { + Ok(resp) => resp, + Err(Error::NotSupported { .. 
}) => { + let fallback_request = CreateEmptyTableRequest { + id: Some(table_id.clone()), + ..Default::default() + }; + let fallback_resp = namespace + .create_empty_table(fallback_request) + .await + .map_err(|e| Error::Namespace { + source: Box::new(e), + location: location!(), + })?; + DeclareTableResponse { + transaction_id: fallback_resp.transaction_id, + location: fallback_resp.location, + storage_options: fallback_resp.storage_options, + } + } + Err(e) => { + return Err(Error::Namespace { source: Box::new(e), location: location!(), - })?; + }); + } + }; let uri = response.location.ok_or_else(|| Error::Namespace { source: Box::new(std::io::Error::other( - "Table location not found in create_empty_table response", + "Table location not found in declare_table response", )), location: location!(), })?; @@ -875,8 +899,7 @@ impl Dataset { WriteMode::Append | WriteMode::Overwrite => { let request = DescribeTableRequest { id: Some(table_id.clone()), - version: None, - with_table_uri: None, + ..Default::default() }; let response = namespace diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 332ba504cf9..6bac1b553a3 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -136,8 +136,7 @@ impl DatasetBuilder { ) -> Result { let request = DescribeTableRequest { id: Some(table_id.clone()), - version: None, - with_table_uri: None, + ..Default::default() }; let response = namespace From 5b559ce1b17af1cf2b7834efe39a9010e7ff5c27 Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 20 Jan 2026 21:11:17 -0800 Subject: [PATCH 7/8] feat: support dynamic context for lance namespace (#5710) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This feature is similar to lancedb HeaderProvider, but implemented in a more generic way. 
In lance-namespace 0.4.5, we introduced per-request context which is a free-form map that can be passed in, and processed by different implementations differently. Based on that, we add a `DynamicContextProvider`. Then specifically for `RestNamespace`, we define that any context starting with `headers.` will be translated to a request level header. Because of the requirement to dynamically inject per-request headers, we also moved to use raw reqwest client now for all HTTP requests in RestNamespace. Adapted for release branch by using Python::with_gil instead of Python::attach (pyo3 0.25 vs 0.26). 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- java/lance-jni/src/namespace.rs | 175 ++- .../lance/namespace/DirectoryNamespace.java | 113 +- .../namespace/DynamicContextProvider.java | 77 + .../org/lance/namespace/RestNamespace.java | 117 +- .../namespace/DynamicContextProviderTest.java | 307 ++++ .../lance/namespace/TestContextProvider.java | 36 + python/python/lance/namespace.py | 202 ++- python/python/tests/test_namespace_rest.py | 63 + python/src/namespace.rs | 110 +- rust/lance-namespace-impls/Cargo.toml | 2 +- rust/lance-namespace-impls/src/connect.rs | 83 +- rust/lance-namespace-impls/src/context.rs | 161 +++ rust/lance-namespace-impls/src/dir.rs | 44 +- rust/lance-namespace-impls/src/lib.rs | 2 + rust/lance-namespace-impls/src/rest.rs | 1234 ++++++++++++----- .../lance-namespace-impls/src/rest_adapter.rs | 126 ++ 16 files changed, 2473 insertions(+), 379 deletions(-) create mode 100644 java/src/main/java/org/lance/namespace/DynamicContextProvider.java create mode 100644 java/src/test/java/org/lance/namespace/DynamicContextProviderTest.java create mode 100644 java/src/test/java/org/lance/namespace/TestContextProvider.java create mode 100644 rust/lance-namespace-impls/src/context.rs diff --git a/java/lance-jni/src/namespace.rs b/java/lance-jni/src/namespace.rs index 4b1d5a82d21..b9db171c064 100644 --- 
a/java/lance-jni/src/namespace.rs +++ b/java/lance-jni/src/namespace.rs @@ -1,23 +1,121 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use std::collections::HashMap; +use std::sync::Arc; + use bytes::Bytes; -use jni::objects::{JByteArray, JMap, JObject, JString}; +use jni::objects::{GlobalRef, JByteArray, JMap, JObject, JString, JValue}; use jni::sys::{jbyteArray, jlong, jstring}; use jni::JNIEnv; use lance_namespace::models::*; use lance_namespace::LanceNamespace as LanceNamespaceTrait; use lance_namespace_impls::{ - ConnectBuilder, DirectoryNamespace, DirectoryNamespaceBuilder, RestAdapter, RestAdapterConfig, - RestNamespace, RestNamespaceBuilder, + ConnectBuilder, DirectoryNamespace, DirectoryNamespaceBuilder, DynamicContextProvider, + OperationInfo, RestAdapter, RestAdapterConfig, RestNamespace, RestNamespaceBuilder, }; use serde::{Deserialize, Serialize}; -use std::sync::Arc; use crate::error::{Error, Result}; use crate::utils::to_rust_map; use crate::RT; +/// Java-implemented dynamic context provider. +/// +/// Wraps a Java object that implements the DynamicContextProvider interface. +pub struct JavaDynamicContextProvider { + java_provider: GlobalRef, + jvm: Arc, +} + +impl JavaDynamicContextProvider { + /// Create a new Java context provider wrapper. 
+ pub fn new(env: &mut JNIEnv, java_provider: &JObject) -> Result { + let java_provider = env.new_global_ref(java_provider)?; + let jvm = Arc::new(env.get_java_vm()?); + Ok(Self { java_provider, jvm }) + } +} + +impl std::fmt::Debug for JavaDynamicContextProvider { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "JavaDynamicContextProvider") + } +} + +impl DynamicContextProvider for JavaDynamicContextProvider { + fn provide_context(&self, info: &OperationInfo) -> HashMap { + // Attach to JVM + let mut env = match self.jvm.attach_current_thread() { + Ok(env) => env, + Err(e) => { + log::error!("Failed to attach to JVM: {}", e); + return HashMap::new(); + } + }; + + // Create Java strings for parameters + let operation = match env.new_string(&info.operation) { + Ok(s) => s, + Err(e) => { + log::error!("Failed to create operation string: {}", e); + return HashMap::new(); + } + }; + + let object_id = match env.new_string(&info.object_id) { + Ok(s) => s, + Err(e) => { + log::error!("Failed to create object_id string: {}", e); + return HashMap::new(); + } + }; + + // Call provideContext(String, String) -> Map + let result = env.call_method( + &self.java_provider, + "provideContext", + "(Ljava/lang/String;Ljava/lang/String;)Ljava/util/Map;", + &[JValue::Object(&operation), JValue::Object(&object_id)], + ); + + match result { + Ok(jvalue) => match jvalue.l() { + Ok(obj) if !obj.is_null() => { + // Convert Java Map to Rust HashMap + convert_java_map_to_hashmap(&mut env, &obj).unwrap_or_default() + } + Ok(_) => HashMap::new(), + Err(e) => { + log::error!("provideContext did not return object: {}", e); + HashMap::new() + } + }, + Err(e) => { + log::error!("Failed to call provideContext: {}", e); + HashMap::new() + } + } + } +} + +fn convert_java_map_to_hashmap( + env: &mut JNIEnv, + map_obj: &JObject, +) -> Result> { + let jmap = JMap::from_env(env, map_obj)?; + let mut result = HashMap::new(); + + let mut iter = jmap.iter(env)?; + while let 
Some((key, value)) = iter.next(env)? { + let key_str: String = env.get_string(&JString::from(key))?.into(); + let value_str: String = env.get_string(&JString::from(value))?.into(); + result.insert(key_str, value_str); + } + + Ok(result) +} + /// Blocking wrapper for DirectoryNamespace pub struct BlockingDirectoryNamespace { pub(crate) inner: DirectoryNamespace, @@ -40,20 +138,47 @@ pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_createNative( ) -> jlong { ok_or_throw_with_return!( env, - create_directory_namespace_internal(&mut env, properties_map), + create_directory_namespace_internal(&mut env, properties_map, None), 0 ) } -fn create_directory_namespace_internal(env: &mut JNIEnv, properties_map: JObject) -> Result { +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_DirectoryNamespace_createNativeWithProvider( + mut env: JNIEnv, + _obj: JObject, + properties_map: JObject, + context_provider: JObject, +) -> jlong { + ok_or_throw_with_return!( + env, + create_directory_namespace_internal(&mut env, properties_map, Some(context_provider)), + 0 + ) +} + +fn create_directory_namespace_internal( + env: &mut JNIEnv, + properties_map: JObject, + context_provider: Option, +) -> Result { // Convert Java HashMap to Rust HashMap let jmap = JMap::from_env(env, &properties_map)?; let properties = to_rust_map(env, &jmap)?; // Build DirectoryNamespace using builder - let builder = DirectoryNamespaceBuilder::from_properties(properties, None).map_err(|e| { - Error::runtime_error(format!("Failed to create DirectoryNamespaceBuilder: {}", e)) - })?; + let mut builder = + DirectoryNamespaceBuilder::from_properties(properties, None).map_err(|e| { + Error::runtime_error(format!("Failed to create DirectoryNamespaceBuilder: {}", e)) + })?; + + // Add context provider if provided + if let Some(provider_obj) = context_provider { + if !provider_obj.is_null() { + let java_provider = JavaDynamicContextProvider::new(env, &provider_obj)?; + builder = 
builder.context_provider(Arc::new(java_provider)); + } + } let namespace = RT .block_on(builder.build()) @@ -537,21 +662,47 @@ pub extern "system" fn Java_org_lance_namespace_RestNamespace_createNative( ) -> jlong { ok_or_throw_with_return!( env, - create_rest_namespace_internal(&mut env, properties_map), + create_rest_namespace_internal(&mut env, properties_map, None), 0 ) } -fn create_rest_namespace_internal(env: &mut JNIEnv, properties_map: JObject) -> Result { +#[no_mangle] +pub extern "system" fn Java_org_lance_namespace_RestNamespace_createNativeWithProvider( + mut env: JNIEnv, + _obj: JObject, + properties_map: JObject, + context_provider: JObject, +) -> jlong { + ok_or_throw_with_return!( + env, + create_rest_namespace_internal(&mut env, properties_map, Some(context_provider)), + 0 + ) +} + +fn create_rest_namespace_internal( + env: &mut JNIEnv, + properties_map: JObject, + context_provider: Option, +) -> Result { // Convert Java HashMap to Rust HashMap let jmap = JMap::from_env(env, &properties_map)?; let properties = to_rust_map(env, &jmap)?; // Build RestNamespace using builder - let builder = RestNamespaceBuilder::from_properties(properties).map_err(|e| { + let mut builder = RestNamespaceBuilder::from_properties(properties).map_err(|e| { Error::runtime_error(format!("Failed to create RestNamespaceBuilder: {}", e)) })?; + // Add context provider if provided + if let Some(provider_obj) = context_provider { + if !provider_obj.is_null() { + let java_provider = JavaDynamicContextProvider::new(env, &provider_obj)?; + builder = builder.context_provider(Arc::new(java_provider)); + } + } + let namespace = builder.build(); let blocking_namespace = BlockingRestNamespace { inner: namespace }; diff --git a/java/src/main/java/org/lance/namespace/DirectoryNamespace.java b/java/src/main/java/org/lance/namespace/DirectoryNamespace.java index a0796739a3c..3ffe2b82f01 100644 --- a/java/src/main/java/org/lance/namespace/DirectoryNamespace.java +++ 
b/java/src/main/java/org/lance/namespace/DirectoryNamespace.java @@ -21,7 +21,10 @@ import org.apache.arrow.memory.BufferAllocator; import java.io.Closeable; +import java.lang.reflect.Constructor; +import java.util.HashMap; import java.util.Map; +import java.util.Optional; /** * DirectoryNamespace implementation that provides Lance namespace functionality for directory-based @@ -149,11 +152,43 @@ public DirectoryNamespace() {} @Override public void initialize(Map configProperties, BufferAllocator allocator) { + initialize(configProperties, allocator, null); + } + + /** + * Initialize with a dynamic context provider. + * + *

If contextProvider is null and the properties contain {@code dynamic_context_provider.impl}, + * the provider will be loaded from the class path. The class must implement {@link + * DynamicContextProvider} and have a constructor accepting {@code Map}. + * + * @param configProperties Configuration properties for the namespace + * @param allocator Arrow buffer allocator + * @param contextProvider Optional provider for per-request context (e.g., dynamic auth headers) + */ + public void initialize( + Map configProperties, + BufferAllocator allocator, + DynamicContextProvider contextProvider) { if (this.nativeDirectoryNamespaceHandle != 0) { throw new IllegalStateException("DirectoryNamespace already initialized"); } this.allocator = allocator; - this.nativeDirectoryNamespaceHandle = createNative(configProperties); + + // If no explicit provider, try to create from properties + DynamicContextProvider provider = contextProvider; + if (provider == null) { + provider = createProviderFromProperties(configProperties).orElse(null); + } + + // Filter out provider properties before passing to native layer + Map filteredProperties = filterProviderProperties(configProperties); + + if (provider != null) { + this.nativeDirectoryNamespaceHandle = createNativeWithProvider(filteredProperties, provider); + } else { + this.nativeDirectoryNamespaceHandle = createNative(filteredProperties); + } } @Override @@ -399,6 +434,9 @@ private static T fromJson(String json, Class clazz) { // Native methods private native long createNative(Map properties); + private native long createNativeWithProvider( + Map properties, DynamicContextProvider contextProvider); + private native void releaseNative(long handle); private native String namespaceIdNative(long handle); @@ -453,4 +491,77 @@ private native String mergeInsertIntoTableNative( private native String describeTransactionNative(long handle, String requestJson); private native String alterTransactionNative(long handle, String requestJson); + + // 
========================================================================== + // Provider loading helpers + // ========================================================================== + + private static final String PROVIDER_PREFIX = "dynamic_context_provider."; + private static final String IMPL_KEY = "dynamic_context_provider.impl"; + + /** + * Create a context provider from properties if configured. + * + *

Loads the class specified by {@code dynamic_context_provider.impl} from the class path and + * instantiates it with the extracted provider properties. + */ + private static Optional createProviderFromProperties( + Map properties) { + String className = properties.get(IMPL_KEY); + if (className == null || className.isEmpty()) { + return Optional.empty(); + } + + // Extract provider-specific properties (strip prefix, exclude impl key) + Map providerProps = new HashMap<>(); + for (Map.Entry entry : properties.entrySet()) { + String key = entry.getKey(); + if (key.startsWith(PROVIDER_PREFIX) && !key.equals(IMPL_KEY)) { + String propName = key.substring(PROVIDER_PREFIX.length()); + providerProps.put(propName, entry.getValue()); + } + } + + try { + Class providerClass = Class.forName(className); + if (!DynamicContextProvider.class.isAssignableFrom(providerClass)) { + throw new IllegalArgumentException( + String.format( + "Class '%s' does not implement DynamicContextProvider interface", className)); + } + + @SuppressWarnings("unchecked") + Class typedClass = + (Class) providerClass; + + Constructor constructor = + typedClass.getConstructor(Map.class); + return Optional.of(constructor.newInstance(providerProps)); + + } catch (ClassNotFoundException e) { + throw new IllegalArgumentException( + String.format("Failed to load context provider class '%s': %s", className, e), e); + } catch (NoSuchMethodException e) { + throw new IllegalArgumentException( + String.format( + "Context provider class '%s' must have a public constructor " + + "that accepts Map", + className), + e); + } catch (ReflectiveOperationException e) { + throw new IllegalArgumentException( + String.format("Failed to instantiate context provider '%s': %s", className, e), e); + } + } + + /** Filter out dynamic_context_provider.* properties from the map. 
*/ + private static Map filterProviderProperties(Map properties) { + Map filtered = new HashMap<>(); + for (Map.Entry entry : properties.entrySet()) { + if (!entry.getKey().startsWith(PROVIDER_PREFIX)) { + filtered.put(entry.getKey(), entry.getValue()); + } + } + return filtered; + } } diff --git a/java/src/main/java/org/lance/namespace/DynamicContextProvider.java b/java/src/main/java/org/lance/namespace/DynamicContextProvider.java new file mode 100644 index 00000000000..77b10c892a4 --- /dev/null +++ b/java/src/main/java/org/lance/namespace/DynamicContextProvider.java @@ -0,0 +1,77 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.lance.namespace; + +import java.util.Map; + +/** + * Interface for providing dynamic per-request context to namespace operations. + * + *

Implementations can generate per-request context (e.g., authentication headers) based on the + * operation being performed. The provider is called synchronously before each namespace operation. + * + *

For RestNamespace, context keys that start with {@code headers.} are converted to HTTP headers + * by stripping the prefix. For example, {@code {"headers.Authorization": "Bearer abc123"}} becomes + * the {@code Authorization: Bearer abc123} header. Keys without the {@code headers.} prefix are + * ignored for HTTP headers but may be used for other purposes. + * + *

Example implementation: + * + *

+ * public class MyContextProvider implements DynamicContextProvider {
+ *   @Override
+ *   public Map<String, String> provideContext(String operation, String objectId) {
+ *     Map<String, String> context = new HashMap<>();
+ *     context.put("headers.Authorization", "Bearer " + getAuthToken());
+ *     context.put("headers.X-Request-Id", UUID.randomUUID().toString());
+ *     return context;
+ *   }
+ * }
+ * 
+ * + *

Usage with DirectoryNamespace: + * + *

+ * DynamicContextProvider provider = new MyContextProvider();
+ * Map<String, String> properties = Map.of("root", "/path/to/data");
+ * DirectoryNamespace namespace = new DirectoryNamespace();
+ * namespace.initialize(properties, allocator, provider);
+ * 
+ * + *

Usage with RestNamespace: + * + *

+ * DynamicContextProvider provider = new MyContextProvider();
+ * Map<String, String> properties = Map.of("uri", "https://api.example.com");
+ * RestNamespace namespace = new RestNamespace();
+ * namespace.initialize(properties, provider);
+ * 
+ */ +public interface DynamicContextProvider { + + /** + * Provide context for a namespace operation. + * + *

This method is called synchronously before each namespace operation. Implementations should + * be thread-safe as multiple operations may be performed concurrently. + * + * @param operation The operation name (e.g., "list_tables", "describe_table", "create_namespace") + * @param objectId The object identifier (namespace or table ID in delimited form, e.g., + * "workspace$table_name") + * @return Map of context key-value pairs. For HTTP headers, use keys with the "headers." prefix + * (e.g., "headers.Authorization"). Return an empty map if no additional context is needed. + * Must not return null. + */ + Map provideContext(String operation, String objectId); +} diff --git a/java/src/main/java/org/lance/namespace/RestNamespace.java b/java/src/main/java/org/lance/namespace/RestNamespace.java index b55eeb2f200..840e9f3d690 100644 --- a/java/src/main/java/org/lance/namespace/RestNamespace.java +++ b/java/src/main/java/org/lance/namespace/RestNamespace.java @@ -21,7 +21,10 @@ import org.apache.arrow.memory.BufferAllocator; import java.io.Closeable; +import java.lang.reflect.Constructor; +import java.util.HashMap; import java.util.Map; +import java.util.Optional; /** * RestNamespace implementation that provides Lance namespace functionality via REST API endpoints. @@ -74,11 +77,47 @@ public RestNamespace() {} @Override public void initialize(Map configProperties, BufferAllocator allocator) { + initialize(configProperties, allocator, null); + } + + /** + * Initialize with a dynamic context provider. + * + *

The context provider is called before each namespace operation and can return per-request + * context (e.g., authentication headers). Context keys that start with {@code headers.} are + * converted to HTTP headers by stripping the prefix. + * + *

If contextProvider is null and the properties contain {@code dynamic_context_provider.impl}, + * the provider will be loaded from the class path. The class must implement {@link + * DynamicContextProvider} and have a constructor accepting {@code Map}. + * + * @param configProperties Configuration properties for the namespace + * @param allocator Arrow buffer allocator + * @param contextProvider Optional provider for per-request context (e.g., dynamic auth headers) + */ + public void initialize( + Map configProperties, + BufferAllocator allocator, + DynamicContextProvider contextProvider) { if (this.nativeRestNamespaceHandle != 0) { throw new IllegalStateException("RestNamespace already initialized"); } this.allocator = allocator; - this.nativeRestNamespaceHandle = createNative(configProperties); + + // If no explicit provider, try to create from properties + DynamicContextProvider provider = contextProvider; + if (provider == null) { + provider = createProviderFromProperties(configProperties).orElse(null); + } + + // Filter out provider properties before passing to native layer + Map filteredProperties = filterProviderProperties(configProperties); + + if (provider != null) { + this.nativeRestNamespaceHandle = createNativeWithProvider(filteredProperties, provider); + } else { + this.nativeRestNamespaceHandle = createNative(filteredProperties); + } } @Override @@ -321,6 +360,9 @@ private static T fromJson(String json, Class clazz) { // Native methods private native long createNative(Map properties); + private native long createNativeWithProvider( + Map properties, DynamicContextProvider contextProvider); + private native void releaseNative(long handle); private native String namespaceIdNative(long handle); @@ -375,4 +417,77 @@ private native String mergeInsertIntoTableNative( private native String describeTransactionNative(long handle, String requestJson); private native String alterTransactionNative(long handle, String requestJson); + + // 
========================================================================== + // Provider loading helpers + // ========================================================================== + + private static final String PROVIDER_PREFIX = "dynamic_context_provider."; + private static final String IMPL_KEY = "dynamic_context_provider.impl"; + + /** + * Create a context provider from properties if configured. + * + *

Loads the class specified by {@code dynamic_context_provider.impl} from the class path and
+   * instantiates it with the extracted provider properties.
+   */
+  private static Optional<DynamicContextProvider> createProviderFromProperties(
+      Map<String, String> properties) {
+    String className = properties.get(IMPL_KEY);
+    if (className == null || className.isEmpty()) {
+      return Optional.empty();
+    }
+
+    // Extract provider-specific properties (strip prefix, exclude impl key)
+    Map<String, String> providerProps = new HashMap<>();
+    for (Map.Entry<String, String> entry : properties.entrySet()) {
+      String key = entry.getKey();
+      if (key.startsWith(PROVIDER_PREFIX) && !key.equals(IMPL_KEY)) {
+        String propName = key.substring(PROVIDER_PREFIX.length());
+        providerProps.put(propName, entry.getValue());
+      }
+    }
+
+    try {
+      Class<?> providerClass = Class.forName(className);
+      if (!DynamicContextProvider.class.isAssignableFrom(providerClass)) {
+        throw new IllegalArgumentException(
+            String.format(
+                "Class '%s' does not implement DynamicContextProvider interface", className));
+      }
+
+      @SuppressWarnings("unchecked")
+      Class<? extends DynamicContextProvider> typedClass =
+          (Class<? extends DynamicContextProvider>) providerClass;
+
+      Constructor<? extends DynamicContextProvider> constructor =
+          typedClass.getConstructor(Map.class);
+      return Optional.of(constructor.newInstance(providerProps));
+
+    } catch (ClassNotFoundException e) {
+      throw new IllegalArgumentException(
+          String.format("Failed to load context provider class '%s': %s", className, e), e);
+    } catch (NoSuchMethodException e) {
+      throw new IllegalArgumentException(
+          String.format(
+              "Context provider class '%s' must have a public constructor "
+                  + "that accepts Map<String, String>",
+              className),
+          e);
+    } catch (ReflectiveOperationException e) {
+      throw new IllegalArgumentException(
+          String.format("Failed to instantiate context provider '%s': %s", className, e), e);
+    }
+  }
+
+  /** Filter out dynamic_context_provider.* properties from the map. 
*/
+  private static Map<String, String> filterProviderProperties(Map<String, String> properties) {
+    Map<String, String> filtered = new HashMap<>();
+    for (Map.Entry<String, String> entry : properties.entrySet()) {
+      if (!entry.getKey().startsWith(PROVIDER_PREFIX)) {
+        filtered.put(entry.getKey(), entry.getValue());
+      }
+    }
+    return filtered;
+  }
 }
diff --git a/java/src/test/java/org/lance/namespace/DynamicContextProviderTest.java b/java/src/test/java/org/lance/namespace/DynamicContextProviderTest.java
new file mode 100644
index 00000000000..7959eb9be58
--- /dev/null
+++ b/java/src/test/java/org/lance/namespace/DynamicContextProviderTest.java
@@ -0,0 +1,307 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.lance.namespace;
+
+import org.lance.namespace.model.*;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.memory.RootAllocator;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/** Tests for DynamicContextProvider interface. 
*/ +public class DynamicContextProviderTest { + @TempDir Path tempDir; + + private BufferAllocator allocator; + + @BeforeEach + void setUp() { + allocator = new RootAllocator(Long.MAX_VALUE); + } + + @AfterEach + void tearDown() { + if (allocator != null) { + allocator.close(); + } + } + + @Test + void testDirectoryNamespaceWithContextProvider() { + AtomicInteger callCount = new AtomicInteger(0); + + DynamicContextProvider provider = + (operation, objectId) -> { + callCount.incrementAndGet(); + Map context = new HashMap<>(); + context.put("headers.Authorization", "Bearer test-token-123"); + context.put("headers.X-Request-Id", "req-" + operation); + return context; + }; + + try (DirectoryNamespace namespace = new DirectoryNamespace()) { + Map config = new HashMap<>(); + config.put("root", tempDir.toString()); + namespace.initialize(config, allocator, provider); + + // Perform operations to verify the provider is called + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + ListNamespacesRequest listReq = new ListNamespacesRequest(); + namespace.listNamespaces(listReq); + + // The provider should have been called for each operation + // Note: DirectoryNamespace stores the provider but may not actively use context + // until the underlying Rust code is updated to use it for credential vending + assertNotNull(namespace.namespaceId()); + } + } + + @Test + void testDirectoryNamespaceWithNullProvider() { + try (DirectoryNamespace namespace = new DirectoryNamespace()) { + Map config = new HashMap<>(); + config.put("root", tempDir.toString()); + + // Should work with null provider (backward compatibility) + namespace.initialize(config, allocator, null); + + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + ListNamespacesRequest listReq = new ListNamespacesRequest(); + ListNamespacesResponse 
listResp = namespace.listNamespaces(listReq); + + assertNotNull(listResp); + assertTrue(listResp.getNamespaces().contains("workspace")); + } + } + + @Test + void testContextProviderReturnsEmptyMap() { + DynamicContextProvider provider = (operation, objectId) -> new HashMap<>(); + + try (DirectoryNamespace namespace = new DirectoryNamespace()) { + Map config = new HashMap<>(); + config.put("root", tempDir.toString()); + namespace.initialize(config, allocator, provider); + + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + CreateNamespaceResponse resp = namespace.createNamespace(createReq); + + assertNotNull(resp); + } + } + + @Test + void testRestNamespaceWithContextProviderIntegration() { + AtomicInteger callCount = new AtomicInteger(0); + + DynamicContextProvider provider = + (operation, objectId) -> { + callCount.incrementAndGet(); + Map context = new HashMap<>(); + context.put("headers.Authorization", "Bearer xyz-token"); + context.put("headers.X-Trace-Id", "trace-" + System.currentTimeMillis()); + return context; + }; + + // Start a test REST server with DirectoryNamespace backend + Map backendConfig = new HashMap<>(); + backendConfig.put("root", tempDir.toString()); + + try (RestAdapter adapter = new RestAdapter("dir", backendConfig, "127.0.0.1", null)) { + adapter.start(); + int port = adapter.getPort(); + + // Create RestNamespace client with context provider + try (RestNamespace namespace = new RestNamespace()) { + Map clientConfig = new HashMap<>(); + clientConfig.put("uri", "http://127.0.0.1:" + port); + namespace.initialize(clientConfig, allocator, provider); + + // Perform operations - context provider should be called + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + ListNamespacesRequest listReq = new ListNamespacesRequest(); + ListNamespacesResponse listResp = namespace.listNamespaces(listReq); + + // Verify 
provider was called for REST operations + assertTrue(callCount.get() >= 2, "Context provider should be called for each operation"); + assertNotNull(listResp); + assertTrue(listResp.getNamespaces().contains("workspace")); + } + } + } + + @Test + void testContextProviderReceivesCorrectOperationInfo() { + Map capturedOperations = new HashMap<>(); + + DynamicContextProvider provider = + (operation, objectId) -> { + capturedOperations.put(operation, objectId); + return new HashMap<>(); + }; + + Map backendConfig = new HashMap<>(); + backendConfig.put("root", tempDir.toString()); + + try (RestAdapter adapter = new RestAdapter("dir", backendConfig, "127.0.0.1", null)) { + adapter.start(); + int port = adapter.getPort(); + + try (RestNamespace namespace = new RestNamespace()) { + Map clientConfig = new HashMap<>(); + clientConfig.put("uri", "http://127.0.0.1:" + port); + namespace.initialize(clientConfig, allocator, provider); + + // Create namespace + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + // List namespaces + ListNamespacesRequest listReq = new ListNamespacesRequest(); + namespace.listNamespaces(listReq); + + // Verify operations were captured + assertTrue(capturedOperations.containsKey("create_namespace")); + assertTrue(capturedOperations.containsKey("list_namespaces")); + } + } + } + + // ========================================================================== + // Class path based provider tests + // ========================================================================== + + @Test + void testDirectoryNamespaceWithClassPathProvider() { + try (DirectoryNamespace namespace = new DirectoryNamespace()) { + Map config = new HashMap<>(); + config.put("root", tempDir.toString()); + config.put("dynamic_context_provider.impl", "org.lance.namespace.TestContextProvider"); + config.put("dynamic_context_provider.token", "my-secret-token"); + 
config.put("dynamic_context_provider.prefix", "Token"); + + namespace.initialize(config, allocator); + + // Verify namespace works + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + ListNamespacesRequest listReq = new ListNamespacesRequest(); + ListNamespacesResponse listResp = namespace.listNamespaces(listReq); + + assertNotNull(listResp); + assertTrue(listResp.getNamespaces().contains("workspace")); + } + } + + @Test + void testRestNamespaceWithClassPathProvider() { + Map backendConfig = new HashMap<>(); + backendConfig.put("root", tempDir.toString()); + + try (RestAdapter adapter = new RestAdapter("dir", backendConfig, "127.0.0.1", null)) { + adapter.start(); + int port = adapter.getPort(); + + try (RestNamespace namespace = new RestNamespace()) { + Map clientConfig = new HashMap<>(); + clientConfig.put("uri", "http://127.0.0.1:" + port); + clientConfig.put( + "dynamic_context_provider.impl", "org.lance.namespace.TestContextProvider"); + clientConfig.put("dynamic_context_provider.token", "secret-api-key"); + + namespace.initialize(clientConfig, allocator); + + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + ListNamespacesRequest listReq = new ListNamespacesRequest(); + ListNamespacesResponse listResp = namespace.listNamespaces(listReq); + + assertNotNull(listResp); + assertTrue(listResp.getNamespaces().contains("workspace")); + } + } + } + + @Test + void testUnknownProviderClassThrowsException() { + try (DirectoryNamespace namespace = new DirectoryNamespace()) { + Map config = new HashMap<>(); + config.put("root", tempDir.toString()); + config.put("dynamic_context_provider.impl", "com.nonexistent.NonExistentProvider"); + + assertThrows( + IllegalArgumentException.class, + () -> namespace.initialize(config, allocator), + "Failed to load context provider class"); + } + } + + @Test 
+ void testExplicitProviderTakesPrecedence() { + AtomicInteger explicitCallCount = new AtomicInteger(0); + + DynamicContextProvider explicitProvider = + (operation, objectId) -> { + explicitCallCount.incrementAndGet(); + Map ctx = new HashMap<>(); + ctx.put("headers.Authorization", "Bearer explicit"); + return ctx; + }; + + try (DirectoryNamespace namespace = new DirectoryNamespace()) { + Map config = new HashMap<>(); + config.put("root", tempDir.toString()); + // Even though we specify a class path, explicit provider should take precedence + config.put("dynamic_context_provider.impl", "org.lance.namespace.TestContextProvider"); + config.put("dynamic_context_provider.token", "ignored"); + + // Pass explicit provider - should take precedence over properties + namespace.initialize(config, allocator, explicitProvider); + + // Verify namespace works + CreateNamespaceRequest createReq = + new CreateNamespaceRequest().id(Arrays.asList("workspace")); + namespace.createNamespace(createReq); + + // Namespace should work + assertNotNull(namespace.namespaceId()); + } + } +} diff --git a/java/src/test/java/org/lance/namespace/TestContextProvider.java b/java/src/test/java/org/lance/namespace/TestContextProvider.java new file mode 100644 index 00000000000..4eea30c88c3 --- /dev/null +++ b/java/src/test/java/org/lance/namespace/TestContextProvider.java @@ -0,0 +1,36 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.lance.namespace; + +import java.util.HashMap; +import java.util.Map; + +/** Test implementation of DynamicContextProvider for testing class path loading. */ +public class TestContextProvider implements DynamicContextProvider { + private final String token; + private final String prefix; + + public TestContextProvider(Map properties) { + this.token = properties.get("token"); + this.prefix = properties.getOrDefault("prefix", "Bearer"); + } + + @Override + public Map provideContext(String operation, String objectId) { + Map context = new HashMap<>(); + context.put("headers.Authorization", prefix + " " + token); + context.put("headers.X-Operation", operation); + return context; + } +} diff --git a/python/python/lance/namespace.py b/python/python/lance/namespace.py index d879ddcb99f..8796414daca 100644 --- a/python/python/lance/namespace.py +++ b/python/python/lance/namespace.py @@ -7,11 +7,13 @@ 1. Native Rust-backed namespace implementations (DirectoryNamespace, RestNamespace) 2. Storage options integration with LanceNamespace for automatic credential refresh 3. Plugin registry for external namespace implementations +4. Dynamic context provider registry for per-request context injection The LanceNamespace ABC interface is provided by the lance_namespace package. """ -from typing import Dict, List +from abc import ABC, abstractmethod +from typing import Dict, List, Optional from lance_namespace import ( CreateEmptyTableRequest, @@ -61,9 +63,148 @@ "RestNamespace", "RestAdapter", "LanceNamespaceStorageOptionsProvider", + "DynamicContextProvider", ] +# ============================================================================= +# Dynamic Context Provider +# ============================================================================= + + +class DynamicContextProvider(ABC): + """Abstract base class for dynamic context providers. + + Implementations provide per-request context (e.g., authentication headers) + based on the operation being performed. 
The provider is called synchronously + before each namespace operation. + + For RestNamespace, context keys that start with `headers.` are converted to + HTTP headers by stripping the prefix. For example, `{"headers.Authorization": + "Bearer token"}` becomes the `Authorization: Bearer token` header. + + Example + ------- + >>> # Define a provider class + >>> class MyProvider(DynamicContextProvider): + ... def __init__(self, api_key: str): + ... self.api_key = api_key + ... + ... def provide_context(self, info: dict) -> dict: + ... return { + ... "headers.Authorization": f"Bearer {self.api_key}", + ... } + ... + >>> # Create provider instance and use directly + >>> provider = MyProvider(api_key="secret") + >>> provider.provide_context({"operation": "list_tables", "object_id": "ns"}) + {'headers.Authorization': 'Bearer secret'} + """ + + @abstractmethod + def provide_context(self, info: Dict[str, str]) -> Dict[str, str]: + """Provide context for a namespace operation. + + Parameters + ---------- + info : dict + Information about the operation: + - operation: The operation name (e.g., "list_tables", "describe_table") + - object_id: The object identifier (namespace or table ID) + + Returns + ------- + dict + Context key-value pairs. For HTTP headers, use keys with the + "headers." prefix (e.g., "headers.Authorization"). + """ + pass + + +def _create_context_provider_from_properties( + properties: Dict[str, str], +) -> Optional[DynamicContextProvider]: + """Create a context provider instance from properties. + + Extracts `dynamic_context_provider.*` properties and creates a provider + instance by dynamically loading the class from the given class path. + + Parameters + ---------- + properties : dict + The full properties dict that may contain dynamic_context_provider.* keys. + + Returns + ------- + DynamicContextProvider or None + The created provider instance, or None if no provider is configured. 
+ + Raises + ------ + ValueError + If dynamic_context_provider.impl is set but the class cannot be loaded. + """ + import importlib + + prefix = "dynamic_context_provider." + impl_key = "dynamic_context_provider.impl" + + impl_path = properties.get(impl_key) + if not impl_path: + return None + + # Parse the class path (e.g., "my_module.submodule.MyClass") + if "." not in impl_path: + raise ValueError( + f"Invalid context provider class path '{impl_path}'. " + f"Expected format: 'module.ClassName' (e.g., 'my_module.MyProvider')" + ) + + module_path, class_name = impl_path.rsplit(".", 1) + + try: + module = importlib.import_module(module_path) + provider_class = getattr(module, class_name) + except ModuleNotFoundError as e: + raise ValueError( + f"Failed to import module '{module_path}' for context provider: {e}" + ) from e + except AttributeError as e: + raise ValueError( + f"Class '{class_name}' not found in module '{module_path}': {e}" + ) from e + + # Extract provider-specific properties (strip prefix, exclude impl key) + provider_props = {} + for key, value in properties.items(): + if key.startswith(prefix) and key != impl_key: + prop_name = key[len(prefix) :] + provider_props[prop_name] = value + + # Create the provider instance + return provider_class(**provider_props) + + +def _filter_context_provider_properties(properties: Dict[str, str]) -> Dict[str, str]: + """Remove dynamic_context_provider.* properties from the dict. + + These properties are handled at the Python level and should not be + passed to the Rust layer. + + Parameters + ---------- + properties : dict + The full properties dict. + + Returns + ------- + dict + Properties with dynamic_context_provider.* keys removed. + """ + prefix = "dynamic_context_provider." + return {k: v for k, v in properties.items() if not k.startswith(prefix)} + + class DirectoryNamespace(LanceNamespace): """Directory-based Lance Namespace implementation backed by Rust. 
@@ -140,14 +281,40 @@ class DirectoryNamespace(LanceNamespace): ... "credential_vendor.aws_role_arn": "arn:aws:iam::123456789012:role/MyRole", ... "credential_vendor.aws_duration_millis": "3600000", ... }) + + With dynamic context provider: + + >>> import tempfile + >>> class MyProvider(DynamicContextProvider): + ... def __init__(self, token: str): + ... self.token = token + ... def provide_context(self, info: dict) -> dict: + ... return {"headers.Authorization": f"Bearer {self.token}"} + ... + >>> provider = MyProvider(token="secret-token") + >>> with tempfile.TemporaryDirectory() as tmpdir: + ... ns = lance.namespace.DirectoryNamespace( + ... root=tmpdir, + ... context_provider=provider, + ... ) + ... _ = ns.namespace_id() # verify it works """ - def __init__(self, session=None, **properties): + def __init__(self, session=None, context_provider=None, **properties): # Convert all values to strings as expected by Rust from_properties str_properties = {str(k): str(v) for k, v in properties.items()} + # Create context provider from properties if configured + if context_provider is None: + context_provider = _create_context_provider_from_properties(str_properties) + + # Filter out dynamic_context_provider.* properties before passing to Rust + filtered_properties = _filter_context_provider_properties(str_properties) + # Create the underlying Rust namespace - self._inner = PyDirectoryNamespace(session=session, **str_properties) + self._inner = PyDirectoryNamespace( + session=session, context_provider=context_provider, **filtered_properties + ) def namespace_id(self) -> str: """Return a human-readable unique identifier for this namespace instance.""" @@ -254,9 +421,25 @@ class RestNamespace(LanceNamespace): >>> # Using the connect() factory function from lance_namespace >>> import lance_namespace >>> ns = lance_namespace.connect("rest", {"uri": "http://localhost:4099"}) + + With dynamic context provider: + + >>> class AuthProvider(DynamicContextProvider): + ... 
def __init__(self, api_key: str): + ... self.api_key = api_key + ... def provide_context(self, info: dict) -> dict: + ... return {"headers.Authorization": f"Bearer {self.api_key}"} + ... + >>> provider = AuthProvider(api_key="my-secret-key") + >>> ns = lance.namespace.RestNamespace( + ... uri="http://localhost:4099", + ... context_provider=provider, + ... ) + >>> ns.namespace_id() # verify it works + 'RestNamespace { endpoint: "http://localhost:4099", delimiter: "$" }' """ - def __init__(self, **properties): + def __init__(self, context_provider=None, **properties): if PyRestNamespace is None: raise RuntimeError( "RestNamespace is not available. " @@ -266,8 +449,17 @@ def __init__(self, **properties): # Convert all values to strings as expected by Rust from_properties str_properties = {str(k): str(v) for k, v in properties.items()} + # Create context provider from properties if configured + if context_provider is None: + context_provider = _create_context_provider_from_properties(str_properties) + + # Filter out dynamic_context_provider.* properties before passing to Rust + filtered_properties = _filter_context_provider_properties(str_properties) + # Create the underlying Rust namespace - self._inner = PyRestNamespace(**str_properties) + self._inner = PyRestNamespace( + context_provider=context_provider, **filtered_properties + ) def namespace_id(self) -> str: """Return a human-readable unique identifier for this namespace instance.""" diff --git a/python/python/tests/test_namespace_rest.py b/python/python/tests/test_namespace_rest.py index 7fa3a65c5f1..de1a57ace8d 100644 --- a/python/python/tests/test_namespace_rest.py +++ b/python/python/tests/test_namespace_rest.py @@ -680,3 +680,66 @@ def test_connect_with_custom_delimiter(self): ipc_data = table_to_ipc_bytes(table_data) response = ns.create_table(create_req, ipc_data) assert response is not None + + +class TestDynamicContextProvider: + """Tests for DynamicContextProvider with RestNamespace.""" + + def 
test_rest_namespace_with_explicit_provider(self): + """Test RestNamespace with an explicit context provider.""" + call_count = {"count": 0} + + class TestProvider(lance.namespace.DynamicContextProvider): + def provide_context(self, info): + call_count["count"] += 1 + return { + "headers.Authorization": "Bearer test-token", + "headers.X-Request-Id": f"req-{info.get('operation', 'unknown')}", + } + + with tempfile.TemporaryDirectory() as tmpdir: + backend_config = {"root": tmpdir} + + with lance.namespace.RestAdapter("dir", backend_config, port=0) as adapter: + ns = lance.namespace.RestNamespace( + uri=f"http://127.0.0.1:{adapter.port}", + context_provider=TestProvider(), + ) + + # Perform operations + create_req = CreateNamespaceRequest(id=["workspace"]) + ns.create_namespace(create_req) + + list_req = ListNamespacesRequest(id=[]) + ns.list_namespaces(list_req) + + # Context provider should have been called + assert call_count["count"] >= 2 + + def test_explicit_provider_takes_precedence(self): + """Test that explicit provider takes precedence over class path.""" + explicit_called = {"called": False} + + class ExplicitProvider(lance.namespace.DynamicContextProvider): + def provide_context(self, info): + explicit_called["called"] = True + return {"headers.Authorization": "Bearer explicit"} + + with tempfile.TemporaryDirectory() as tmpdir: + backend_config = {"root": tmpdir} + + with lance.namespace.RestAdapter("dir", backend_config, port=0) as adapter: + # Pass both explicit provider and class path - explicit should win + ns = lance.namespace.RestNamespace( + context_provider=ExplicitProvider(), + **{ + "uri": f"http://127.0.0.1:{adapter.port}", + "dynamic_context_provider.impl": "nonexistent.Provider", + }, + ) + + create_req = CreateNamespaceRequest(id=["workspace"]) + ns.create_namespace(create_req) + + # Explicit provider should have been used + assert explicit_called["called"] diff --git a/python/src/namespace.rs b/python/src/namespace.rs index 
cc579248943..53d180f9cc6 100644 --- a/python/src/namespace.rs +++ b/python/src/namespace.rs @@ -7,11 +7,11 @@ use std::collections::HashMap; use std::sync::Arc; use bytes::Bytes; -use lance_namespace_impls::DirectoryNamespaceBuilder; #[cfg(feature = "rest")] use lance_namespace_impls::RestNamespaceBuilder; #[cfg(feature = "rest-adapter")] use lance_namespace_impls::{ConnectBuilder, RestAdapter, RestAdapterConfig, RestAdapterHandle}; +use lance_namespace_impls::{DirectoryNamespaceBuilder, DynamicContextProvider, OperationInfo}; use pyo3::prelude::*; use pyo3::types::{PyBytes, PyDict}; use pythonize::{depythonize, pythonize}; @@ -19,6 +19,73 @@ use pythonize::{depythonize, pythonize}; use crate::error::PythonErrorExt; use crate::session::Session; +/// Python-implemented dynamic context provider. +/// +/// Wraps a Python object that has a `provide_context(info: dict) -> dict` method. +/// For RestNamespace, context keys that start with `headers.` are converted to +/// HTTP headers by stripping the prefix. +pub struct PyDynamicContextProvider { + provider: Py, +} + +impl Clone for PyDynamicContextProvider { + fn clone(&self) -> Self { + Python::with_gil(|py| Self { + provider: self.provider.clone_ref(py), + }) + } +} + +impl PyDynamicContextProvider { + /// Create a new Python context provider wrapper. 
+    pub fn new(provider: Py<PyAny>) -> Self {
+        Self { provider }
+    }
+}
+
+impl std::fmt::Debug for PyDynamicContextProvider {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "PyDynamicContextProvider")
+    }
+}
+
+impl DynamicContextProvider for PyDynamicContextProvider {
+    fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String> {
+        Python::with_gil(|py| {
+            // Create Python dict for operation info
+            let py_info = PyDict::new(py);
+            if py_info.set_item("operation", &info.operation).is_err() {
+                return HashMap::new();
+            }
+            if py_info.set_item("object_id", &info.object_id).is_err() {
+                return HashMap::new();
+            }
+
+            // Call the provider's provide_context method
+            let result = self
+                .provider
+                .call_method1(py, "provide_context", (py_info,));
+
+            match result {
+                Ok(headers_py) => {
+                    // Convert Python dict to Rust HashMap
+                    let bound_headers = headers_py.bind(py);
+                    if let Ok(dict) = bound_headers.downcast::<PyDict>() {
+                        dict_to_hashmap(dict).unwrap_or_default()
+                    } else {
+                        log::warn!("Context provider did not return a dict");
+                        HashMap::new()
+                    }
+                }
+                Err(e) => {
+                    log::error!("Failed to call context provider: {}", e);
+                    HashMap::new()
+                }
+            }
+        })
+    }
+}
+
 /// Convert Python dict to HashMap
 fn dict_to_hashmap(dict: &Bound<'_, PyDict>) -> PyResult<HashMap<String, String>> {
     let mut map = HashMap::new();
@@ -39,10 +106,18 @@ pub struct PyDirectoryNamespace {
 #[pymethods]
 impl PyDirectoryNamespace {
     /// Create a new DirectoryNamespace from properties
+    ///
+    /// # Arguments
+    ///
+    /// * `session` - Optional Lance session for sharing storage connections
+    /// * `context_provider` - Optional object with `provide_context(info: dict) -> dict` method
+    ///   for providing dynamic per-request context
+    /// * `**properties` - Namespace configuration properties
     #[new]
-    #[pyo3(signature = (session = None, **properties))]
+    #[pyo3(signature = (session = None, context_provider = None, **properties))]
     fn new(
         session: Option<&Bound<'_, Session>>,
+        context_provider: 
Option<&Bound<'_, PyAny>>, properties: Option<&Bound<'_, PyDict>>, ) -> PyResult { let mut props = HashMap::new(); @@ -53,7 +128,7 @@ impl PyDirectoryNamespace { let session_arc = session.map(|s| s.borrow().inner.clone()); - let builder = + let mut builder = DirectoryNamespaceBuilder::from_properties(props, session_arc).map_err(|e| { pyo3::exceptions::PyValueError::new_err(format!( "Failed to create DirectoryNamespace: {}", @@ -61,6 +136,12 @@ impl PyDirectoryNamespace { )) })?; + // Add context provider if provided + if let Some(provider) = context_provider { + let py_provider = PyDynamicContextProvider::new(provider.clone().unbind()); + builder = builder.context_provider(Arc::new(py_provider)); + } + let namespace = crate::rt().block_on(None, builder.build())?.infer_error()?; Ok(Self { @@ -212,22 +293,39 @@ pub struct PyRestNamespace { #[pymethods] impl PyRestNamespace { /// Create a new RestNamespace from properties + /// + /// # Arguments + /// + /// * `context_provider` - Optional object with `provide_context(info: dict) -> dict` method + /// for providing dynamic per-request context. Context keys that start with `headers.` + /// are converted to HTTP headers by stripping the prefix. For example, + /// `{"headers.Authorization": "Bearer token"}` becomes the `Authorization` header. + /// * `**properties` - Namespace configuration properties (uri, delimiter, header.*, etc.) 
#[new] - #[pyo3(signature = (**properties))] - fn new(properties: Option<&Bound<'_, PyDict>>) -> PyResult { + #[pyo3(signature = (context_provider = None, **properties))] + fn new( + context_provider: Option<&Bound<'_, PyAny>>, + properties: Option<&Bound<'_, PyDict>>, + ) -> PyResult { let mut props = HashMap::new(); if let Some(dict) = properties { props = dict_to_hashmap(dict)?; } - let builder = RestNamespaceBuilder::from_properties(props).map_err(|e| { + let mut builder = RestNamespaceBuilder::from_properties(props).map_err(|e| { pyo3::exceptions::PyValueError::new_err(format!( "Failed to create RestNamespace: {}", e )) })?; + // Add context provider if provided + if let Some(provider) = context_provider { + let py_provider = PyDynamicContextProvider::new(provider.clone().unbind()); + builder = builder.context_provider(Arc::new(py_provider)); + } + let namespace = builder.build(); Ok(Self { diff --git a/rust/lance-namespace-impls/Cargo.toml b/rust/lance-namespace-impls/Cargo.toml index 85ee4a6989f..b41e7f44e01 100644 --- a/rust/lance-namespace-impls/Cargo.toml +++ b/rust/lance-namespace-impls/Cargo.toml @@ -13,7 +13,7 @@ rust-version.workspace = true [features] default = ["dir-aws", "dir-azure", "dir-gcp", "dir-oss", "dir-huggingface"] -rest = ["dep:reqwest"] +rest = ["dep:reqwest", "dep:serde"] rest-adapter = ["dep:axum", "dep:tower", "dep:tower-http", "dep:serde"] # Cloud storage features for directory implementation - align with lance-io dir-gcp = ["lance-io/gcp", "lance/gcp"] diff --git a/rust/lance-namespace-impls/src/connect.rs b/rust/lance-namespace-impls/src/connect.rs index aa84e2fd6c1..ba26fda3643 100644 --- a/rust/lance-namespace-impls/src/connect.rs +++ b/rust/lance-namespace-impls/src/connect.rs @@ -10,6 +10,8 @@ use lance::session::Session; use lance_core::{Error, Result}; use lance_namespace::LanceNamespace; +use crate::context::DynamicContextProvider; + /// Builder for creating Lance namespace connections. 
/// /// This builder provides a fluent API for configuring and establishing @@ -46,11 +48,53 @@ use lance_namespace::LanceNamespace; /// # Ok(()) /// # } /// ``` -#[derive(Debug, Clone)] +/// +/// ## With Dynamic Context Provider +/// +/// ```no_run +/// # use lance_namespace_impls::{ConnectBuilder, DynamicContextProvider, OperationInfo}; +/// # use std::collections::HashMap; +/// # use std::sync::Arc; +/// # async fn example() -> Result<(), Box> { +/// #[derive(Debug)] +/// struct MyProvider; +/// +/// impl DynamicContextProvider for MyProvider { +/// fn provide_context(&self, info: &OperationInfo) -> HashMap { +/// let mut ctx = HashMap::new(); +/// ctx.insert("headers.Authorization".to_string(), "Bearer token".to_string()); +/// ctx +/// } +/// } +/// +/// let namespace = ConnectBuilder::new("rest") +/// .property("uri", "https://api.example.com") +/// .context_provider(Arc::new(MyProvider)) +/// .connect() +/// .await?; +/// # Ok(()) +/// # } +/// ``` +#[derive(Clone)] pub struct ConnectBuilder { impl_name: String, properties: HashMap, session: Option>, + context_provider: Option>, +} + +impl std::fmt::Debug for ConnectBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("ConnectBuilder") + .field("impl_name", &self.impl_name) + .field("properties", &self.properties) + .field("session", &self.session) + .field( + "context_provider", + &self.context_provider.as_ref().map(|_| "Some(...)"), + ) + .finish() + } } impl ConnectBuilder { @@ -64,6 +108,7 @@ impl ConnectBuilder { impl_name: impl_name.into(), properties: HashMap::new(), session: None, + context_provider: None, } } @@ -102,6 +147,20 @@ impl ConnectBuilder { self } + /// Set a dynamic context provider for per-request context. + /// + /// The provider will be called before each operation to generate + /// additional context. For RestNamespace, context keys that start with + /// `headers.` are converted to HTTP headers by stripping the prefix. 
+ /// + /// # Arguments + /// + /// * `provider` - The context provider implementation + pub fn context_provider(mut self, provider: Arc) -> Self { + self.context_provider = Some(provider); + self + } + /// Build and establish the connection to the namespace. /// /// # Returns @@ -119,8 +178,12 @@ impl ConnectBuilder { #[cfg(feature = "rest")] "rest" => { // Create REST implementation (REST doesn't use session) - crate::rest::RestNamespaceBuilder::from_properties(self.properties) - .map(|builder| Arc::new(builder.build()) as Arc) + let mut builder = + crate::rest::RestNamespaceBuilder::from_properties(self.properties)?; + if let Some(provider) = self.context_provider { + builder = builder.context_provider(provider); + } + Ok(Arc::new(builder.build()) as Arc) } #[cfg(not(feature = "rest"))] "rest" => Err(Error::Namespace { @@ -130,13 +193,17 @@ impl ConnectBuilder { }), "dir" => { // Create directory implementation (always available) - crate::dir::DirectoryNamespaceBuilder::from_properties( + let mut builder = crate::dir::DirectoryNamespaceBuilder::from_properties( self.properties, self.session, - )? - .build() - .await - .map(|ns| Arc::new(ns) as Arc) + )?; + if let Some(provider) = self.context_provider { + builder = builder.context_provider(provider); + } + builder + .build() + .await + .map(|ns| Arc::new(ns) as Arc) } _ => Err(Error::Namespace { source: format!( diff --git a/rust/lance-namespace-impls/src/context.rs b/rust/lance-namespace-impls/src/context.rs new file mode 100644 index 00000000000..028eb342bac --- /dev/null +++ b/rust/lance-namespace-impls/src/context.rs @@ -0,0 +1,161 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Dynamic context provider for per-request context overrides. +//! +//! This module provides the [`DynamicContextProvider`] trait that enables +//! per-request context injection (e.g., dynamic authentication headers). +//! +//! ## Usage +//! +//! 
Implement the trait and pass to namespace builders:
+//!
+//! ```ignore
+//! use lance_namespace_impls::{RestNamespaceBuilder, DynamicContextProvider, OperationInfo};
+//! use std::collections::HashMap;
+//! use std::sync::Arc;
+//!
+//! #[derive(Debug)]
+//! struct MyProvider;
+//!
+//! impl DynamicContextProvider for MyProvider {
+//!     fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String> {
+//!         let mut context = HashMap::new();
+//!         context.insert("headers.Authorization".to_string(), format!("Bearer {}", get_current_token()));
+//!         context.insert("headers.X-Request-Id".to_string(), generate_request_id());
+//!         context
+//!     }
+//! }
+//!
+//! let namespace = RestNamespaceBuilder::new("https://api.example.com")
+//!     .context_provider(Arc::new(MyProvider))
+//!     .build();
+//! ```
+//!
+//! For RestNamespace, context keys that start with `headers.` are converted to HTTP headers
+//! by stripping the prefix. For example, `{"headers.Authorization": "Bearer abc123"}`
+//! becomes the `Authorization: Bearer abc123` header. Keys without the `headers.` prefix
+//! are ignored for HTTP headers but may be used for other purposes.
+
+use std::collections::HashMap;
+
+/// Information about the namespace operation being executed.
+///
+/// This is passed to the [`DynamicContextProvider`] to allow it to make
+/// context decisions based on the operation.
+#[derive(Debug, Clone)]
+pub struct OperationInfo {
+    /// The operation name (e.g., "list_tables", "describe_table", "create_namespace")
+    pub operation: String,
+    /// The object ID for the operation (namespace or table identifier).
+    /// This is the delimited string form, e.g., "workspace$table_name".
+    pub object_id: String,
+}
+
+impl OperationInfo {
+    /// Create a new OperationInfo.
+    pub fn new(operation: impl Into<String>, object_id: impl Into<String>) -> Self {
+        Self {
+            operation: operation.into(),
+            object_id: object_id.into(),
+        }
+    }
+}
+
+/// Trait for providing dynamic request context.
+/// +/// Implementations can generate per-request context (e.g., authentication headers) +/// based on the operation being performed. The provider is called synchronously +/// before each namespace operation. +/// +/// For RestNamespace, context keys that start with `headers.` are converted to +/// HTTP headers by stripping the prefix. For example, `{"headers.Authorization": "Bearer token"}` +/// becomes the `Authorization: Bearer token` header. +/// +/// ## Thread Safety +/// +/// Implementations must be `Send + Sync` as the provider may be called from +/// multiple threads concurrently. +/// +/// ## Error Handling +/// +/// If the provider needs to signal an error, it should return an empty HashMap +/// and log the error. The namespace operation will proceed without the +/// additional context. +pub trait DynamicContextProvider: Send + Sync + std::fmt::Debug { + /// Provide context for a namespace operation. + /// + /// # Arguments + /// + /// * `info` - Information about the operation being performed + /// + /// # Returns + /// + /// Returns a HashMap of context key-value pairs. For HTTP headers, use keys + /// with the `headers.` prefix (e.g., `headers.Authorization`). + /// Returns an empty HashMap if no additional context is needed. 
+    fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String>;
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[derive(Debug)]
+    struct MockContextProvider {
+        prefix: String,
+    }
+
+    impl DynamicContextProvider for MockContextProvider {
+        fn provide_context(&self, info: &OperationInfo) -> HashMap<String, String> {
+            let mut context = HashMap::new();
+            context.insert(
+                "test-header".to_string(),
+                format!("{}-{}", self.prefix, info.operation),
+            );
+            context.insert("object-id".to_string(), info.object_id.clone());
+            context
+        }
+    }
+
+    #[test]
+    fn test_operation_info_creation() {
+        let info = OperationInfo::new("describe_table", "workspace$my_table");
+        assert_eq!(info.operation, "describe_table");
+        assert_eq!(info.object_id, "workspace$my_table");
+    }
+
+    #[test]
+    fn test_context_provider_basic() {
+        let provider = MockContextProvider {
+            prefix: "test".to_string(),
+        };
+
+        let info = OperationInfo::new("list_tables", "workspace$ns");
+
+        let context = provider.provide_context(&info);
+        assert_eq!(
+            context.get("test-header"),
+            Some(&"test-list_tables".to_string())
+        );
+        assert_eq!(context.get("object-id"), Some(&"workspace$ns".to_string()));
+    }
+
+    #[test]
+    fn test_empty_context() {
+        #[derive(Debug)]
+        struct EmptyProvider;
+
+        impl DynamicContextProvider for EmptyProvider {
+            fn provide_context(&self, _info: &OperationInfo) -> HashMap<String, String> {
+                HashMap::new()
+            }
+        }
+
+        let provider = EmptyProvider;
+        let info = OperationInfo::new("list_tables", "ns");
+
+        let context = provider.provide_context(&info);
+        assert!(context.is_empty());
+    }
+}
diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs
index 2168324a308..4d6a88419ee 100644
--- a/rust/lance-namespace-impls/src/dir.rs
+++ b/rust/lance-namespace-impls/src/dir.rs
@@ -21,6 +21,7 @@ use std::collections::HashMap;
 use std::io::Cursor;
 use std::sync::Arc;
+use crate::context::DynamicContextProvider;
 use lance_namespace::models::{
     CreateEmptyTableRequest, CreateEmptyTableResponse,
CreateNamespaceRequest, CreateNamespaceResponse, CreateTableRequest, CreateTableResponse, DeclareTableRequest, @@ -85,7 +86,7 @@ pub(crate) struct TableStatus { /// # Ok(()) /// # } /// ``` -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct DirectoryNamespaceBuilder { root: String, storage_options: Option>, @@ -94,6 +95,26 @@ pub struct DirectoryNamespaceBuilder { dir_listing_enabled: bool, inline_optimization_enabled: bool, credential_vendor_properties: HashMap, + context_provider: Option>, +} + +impl std::fmt::Debug for DirectoryNamespaceBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("DirectoryNamespaceBuilder") + .field("root", &self.root) + .field("storage_options", &self.storage_options) + .field("manifest_enabled", &self.manifest_enabled) + .field("dir_listing_enabled", &self.dir_listing_enabled) + .field( + "inline_optimization_enabled", + &self.inline_optimization_enabled, + ) + .field( + "context_provider", + &self.context_provider.as_ref().map(|_| "Some(...)"), + ) + .finish() + } } impl DirectoryNamespaceBuilder { @@ -111,6 +132,7 @@ impl DirectoryNamespaceBuilder { dir_listing_enabled: true, // Default to enabled for backwards compatibility inline_optimization_enabled: true, credential_vendor_properties: HashMap::new(), + context_provider: None, } } @@ -271,6 +293,7 @@ impl DirectoryNamespaceBuilder { dir_listing_enabled, inline_optimization_enabled, credential_vendor_properties, + context_provider: None, }) } @@ -362,6 +385,20 @@ impl DirectoryNamespaceBuilder { self } + /// Set a dynamic context provider for per-request context. + /// + /// The provider can be used to generate additional context for operations. + /// For DirectoryNamespace, the context is stored but not directly used + /// in operations (unlike RestNamespace where it's converted to HTTP headers). 
+ /// + /// # Arguments + /// + /// * `provider` - The context provider implementation + pub fn context_provider(mut self, provider: Arc) -> Self { + self.context_provider = Some(provider); + self + } + /// Build the DirectoryNamespace. /// /// # Returns @@ -423,6 +460,7 @@ impl DirectoryNamespaceBuilder { manifest_ns, dir_listing_enabled: self.dir_listing_enabled, credential_vendor, + context_provider: self.context_provider, }) } @@ -492,6 +530,10 @@ pub struct DirectoryNamespace { /// Credential vendor created once during initialization. /// Used to vend temporary credentials for table access. credential_vendor: Option>, + /// Dynamic context provider for per-request context. + /// Stored but not directly used in operations (available for future extensions). + #[allow(dead_code)] + context_provider: Option>, } impl std::fmt::Debug for DirectoryNamespace { diff --git a/rust/lance-namespace-impls/src/lib.rs b/rust/lance-namespace-impls/src/lib.rs index 88248841bcb..83fb93ddc0e 100644 --- a/rust/lance-namespace-impls/src/lib.rs +++ b/rust/lance-namespace-impls/src/lib.rs @@ -69,6 +69,7 @@ //! ``` pub mod connect; +pub mod context; pub mod credentials; pub mod dir; @@ -80,6 +81,7 @@ pub mod rest_adapter; // Re-export connect builder pub use connect::ConnectBuilder; +pub use context::{DynamicContextProvider, OperationInfo}; pub use dir::{manifest::ManifestNamespace, DirectoryNamespace, DirectoryNamespaceBuilder}; // Re-export credential vending diff --git a/rust/lance-namespace-impls/src/rest.rs b/rust/lance-namespace-impls/src/rest.rs index 020746487a4..0eae07e4ce2 100644 --- a/rust/lance-namespace-impls/src/rest.rs +++ b/rust/lance-namespace-impls/src/rest.rs @@ -4,13 +4,16 @@ //! 
REST implementation of Lance Namespace use std::collections::HashMap; +use std::str::FromStr; +use std::sync::Arc; use async_trait::async_trait; use bytes::Bytes; +use reqwest::header::{HeaderName, HeaderValue}; -use lance_namespace::apis::{ - configuration::Configuration, namespace_api, table_api, tag_api, transaction_api, -}; +use crate::context::{DynamicContextProvider, OperationInfo}; + +use lance_namespace::apis::urlencode; use lance_namespace::models::{ AlterTableAddColumnsRequest, AlterTableAddColumnsResponse, AlterTableAlterColumnsRequest, AlterTableAlterColumnsResponse, AlterTableDropColumnsRequest, AlterTableDropColumnsResponse, @@ -36,11 +39,102 @@ use lance_namespace::models::{ UpdateTableRequest, UpdateTableResponse, UpdateTableSchemaMetadataRequest, UpdateTableSchemaMetadataResponse, UpdateTableTagRequest, UpdateTableTagResponse, }; +use serde::{de::DeserializeOwned, Serialize}; use lance_core::{box_error, Error, Result}; use lance_namespace::LanceNamespace; +/// HTTP client wrapper that supports per-request header injection. +/// +/// This client wraps a single `reqwest::Client` and applies dynamic headers +/// to each request without recreating the client. This is more efficient than +/// creating a new client per request when using a `DynamicContextProvider`. +/// +/// The design follows lancedb's `RestfulLanceDbClient` pattern where headers +/// are applied to the built request using `headers_mut()` before execution. 
+#[derive(Clone)] +struct RestClient { + client: reqwest::Client, + base_path: String, + base_headers: HashMap, + context_provider: Option>, +} + +impl std::fmt::Debug for RestClient { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RestClient") + .field("base_path", &self.base_path) + .field("base_headers", &self.base_headers) + .field( + "context_provider", + &self.context_provider.as_ref().map(|_| "Some(...)"), + ) + .finish() + } +} + +impl RestClient { + /// Apply base headers and dynamic context headers to a request. + /// + /// This method mutates the request's headers directly, which is more efficient + /// than creating a new client with default_headers for each request. + fn apply_headers(&self, request: &mut reqwest::Request, operation: &str, object_id: &str) { + let request_headers = request.headers_mut(); + + // First apply base headers + for (key, value) in &self.base_headers { + if let (Ok(header_name), Ok(header_value)) = + (HeaderName::from_str(key), HeaderValue::from_str(value)) + { + request_headers.insert(header_name, header_value); + } + } + + // Then apply context headers (override base headers if conflict) + if let Some(provider) = &self.context_provider { + let info = OperationInfo::new(operation, object_id); + let context = provider.provide_context(&info); + + const HEADERS_PREFIX: &str = "headers."; + for (key, value) in context { + if let Some(header_name) = key.strip_prefix(HEADERS_PREFIX) { + if let (Ok(header_name), Ok(header_value)) = ( + HeaderName::from_str(header_name), + HeaderValue::from_str(&value), + ) { + request_headers.insert(header_name, header_value); + } + } + } + } + } + + /// Execute a request with dynamic headers applied. + /// + /// This method builds the request, applies headers, and executes it. 
+ async fn execute( + &self, + req_builder: reqwest::RequestBuilder, + operation: &str, + object_id: &str, + ) -> std::result::Result { + let mut request = req_builder.build()?; + self.apply_headers(&mut request, operation, object_id); + self.client.execute(request).await + } + + /// Get the base path URL + fn base_path(&self) -> &str { + &self.base_path + } + + /// Get a reference to the underlying reqwest client + fn client(&self) -> &reqwest::Client { + &self.client + } +} + /// Builder for creating a RestNamespace. /// /// This builder provides a fluent API for configuring and establishing @@ -59,7 +153,7 @@ use lance_namespace::LanceNamespace; /// # Ok(()) /// # } /// ``` -#[derive(Debug, Clone)] +#[derive(Clone)] pub struct RestNamespaceBuilder { uri: String, delimiter: String, @@ -68,6 +162,25 @@ pub struct RestNamespaceBuilder { key_file: Option, ssl_ca_cert: Option, assert_hostname: bool, + context_provider: Option>, +} + +impl std::fmt::Debug for RestNamespaceBuilder { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("RestNamespaceBuilder") + .field("uri", &self.uri) + .field("delimiter", &self.delimiter) + .field("headers", &self.headers) + .field("cert_file", &self.cert_file) + .field("key_file", &self.key_file) + .field("ssl_ca_cert", &self.ssl_ca_cert) + .field("assert_hostname", &self.assert_hostname) + .field( + "context_provider", + &self.context_provider.as_ref().map(|_| "Some(...)"), + ) + .finish() + } } impl RestNamespaceBuilder { @@ -88,6 +201,7 @@ impl RestNamespaceBuilder { key_file: None, ssl_ca_cert: None, assert_hostname: true, + context_provider: None, } } @@ -172,6 +286,7 @@ impl RestNamespaceBuilder { key_file, ssl_ca_cert, assert_hostname, + context_provider: None, }) } @@ -246,6 +361,44 @@ impl RestNamespaceBuilder { self } + /// Set a dynamic context provider for per-request context. + /// + /// The provider will be called before each HTTP request to generate + /// additional context. 
Context keys that start with `headers.` are converted + /// to HTTP headers by stripping the prefix. For example, `headers.Authorization` + /// becomes the `Authorization` header. Keys without the `headers.` prefix are ignored. + /// + /// # Arguments + /// + /// * `provider` - The context provider implementation + /// + /// # Examples + /// + /// ```ignore + /// use lance_namespace_impls::{RestNamespaceBuilder, DynamicContextProvider, OperationInfo}; + /// use std::collections::HashMap; + /// use std::sync::Arc; + /// + /// #[derive(Debug)] + /// struct MyProvider; + /// + /// impl DynamicContextProvider for MyProvider { + /// fn provide_context(&self, info: &OperationInfo) -> HashMap { + /// let mut ctx = HashMap::new(); + /// ctx.insert("auth-token".to_string(), "my-token".to_string()); + /// ctx + /// } + /// } + /// + /// let namespace = RestNamespaceBuilder::new("http://localhost:8080") + /// .context_provider(Arc::new(MyProvider)) + /// .build(); + /// ``` + pub fn context_provider(mut self, provider: Arc) -> Self { + self.context_provider = Some(provider); + self + } + /// Build the RestNamespace. 
/// /// # Returns @@ -268,29 +421,6 @@ fn object_id_str(id: &Option>, delimiter: &str) -> Result { } } -/// Convert API error to lance core error -fn convert_api_error(err: lance_namespace::apis::Error) -> Error { - use lance_namespace::apis::Error as ApiError; - match err { - ApiError::Reqwest(e) => Error::IO { - source: box_error(e), - location: snafu::location!(), - }, - ApiError::Serde(e) => Error::Namespace { - source: format!("Serialization error: {}", e).into(), - location: snafu::location!(), - }, - ApiError::Io(e) => Error::IO { - source: box_error(e), - location: snafu::location!(), - }, - ApiError::ResponseError(e) => Error::Namespace { - source: format!("Response error: {:?}", e).into(), - location: snafu::location!(), - }, - } -} - /// REST implementation of Lance Namespace /// /// # Examples @@ -307,7 +437,8 @@ fn convert_api_error(err: lance_namespace::apis::Error) - #[derive(Clone)] pub struct RestNamespace { delimiter: String, - reqwest_config: Configuration, + /// REST client that handles per-request header injection efficiently. 
+ rest_client: RestClient, } impl std::fmt::Debug for RestNamespace { @@ -325,23 +456,9 @@ impl std::fmt::Display for RestNamespace { impl RestNamespace { /// Create a new REST namespace from builder pub(crate) fn from_builder(builder: RestNamespaceBuilder) -> Self { - // Build reqwest client with custom headers if provided + // Build reqwest client WITHOUT default headers - we'll apply headers per-request let mut client_builder = reqwest::Client::builder(); - // Add custom headers to the client - if !builder.headers.is_empty() { - let mut headers = reqwest::header::HeaderMap::new(); - for (key, value) in &builder.headers { - if let (Ok(header_name), Ok(header_value)) = ( - reqwest::header::HeaderName::from_bytes(key.as_bytes()), - reqwest::header::HeaderValue::from_str(value), - ) { - headers.insert(header_name, header_value); - } - } - client_builder = client_builder.default_headers(headers); - } - // Configure mTLS if certificate and key files are provided if let (Some(cert_file), Some(key_file)) = (&builder.cert_file, &builder.key_file) { if let (Ok(cert), Ok(key)) = (std::fs::read(cert_file), std::fs::read(key_file)) { @@ -367,28 +484,218 @@ impl RestNamespace { .build() .unwrap_or_else(|_| reqwest::Client::new()); - let mut reqwest_config = Configuration::new(); - reqwest_config.client = client; - reqwest_config.base_path = builder.uri; + // Create the RestClient that handles per-request header injection + let rest_client = RestClient { + client, + base_path: builder.uri, + base_headers: builder.headers, + context_provider: builder.context_provider, + }; Self { delimiter: builder.delimiter, - reqwest_config, + rest_client, } } - /// Create a new REST namespace with custom configuration (for testing) - #[cfg(test)] - pub fn with_configuration(delimiter: String, reqwest_config: Configuration) -> Self { - Self { - delimiter, - reqwest_config, + /// Execute a GET request and parse JSON response. 
+ async fn get_json( + &self, + path: &str, + query: &[(&str, &str)], + operation: &str, + object_id: &str, + ) -> Result { + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self.rest_client.client().get(&url).query(query); + + let resp = self + .rest_client + .execute(req_builder, operation, object_id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + let status = resp.status(); + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + if status.is_success() { + serde_json::from_str(&content).map_err(|e| Error::Namespace { + source: format!("Failed to parse response: {}", e).into(), + location: snafu::location!(), + }) + } else { + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) + } + } + + /// Execute a POST request with JSON body and parse JSON response. 
+ async fn post_json( + &self, + path: &str, + query: &[(&str, &str)], + body: &T, + operation: &str, + object_id: &str, + ) -> Result { + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self.rest_client.client().post(&url).query(query).json(body); + + let resp = self + .rest_client + .execute(req_builder, operation, object_id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + let status = resp.status(); + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + if status.is_success() { + serde_json::from_str(&content).map_err(|e| Error::Namespace { + source: format!("Failed to parse response: {}", e).into(), + location: snafu::location!(), + }) + } else { + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) + } + } + + /// Execute a POST request that returns nothing (204 No Content expected). + async fn post_json_no_content( + &self, + path: &str, + query: &[(&str, &str)], + body: &T, + operation: &str, + object_id: &str, + ) -> Result<()> { + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self.rest_client.client().post(&url).query(query).json(body); + + let resp = self + .rest_client + .execute(req_builder, operation, object_id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + let status = resp.status(); + if status.is_success() { + Ok(()) + } else { + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) + } + } + + /// Execute a POST request with binary body and parse JSON response. 
+ async fn post_binary_json( + &self, + path: &str, + query: &[(&str, &str)], + body: Vec, + operation: &str, + object_id: &str, + ) -> Result { + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self.rest_client.client().post(&url).query(query).body(body); + + let resp = self + .rest_client + .execute(req_builder, operation, object_id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + let status = resp.status(); + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + if status.is_success() { + serde_json::from_str(&content).map_err(|e| Error::Namespace { + source: format!("Failed to parse response: {}", e).into(), + location: snafu::location!(), + }) + } else { + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) + } + } + + /// Execute a POST request with JSON body and get binary response. 
+ #[allow(dead_code)] + async fn post_json_binary( + &self, + path: &str, + query: &[(&str, &str)], + body: &T, + operation: &str, + object_id: &str, + ) -> Result { + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self.rest_client.client().post(&url).query(query).json(body); + + let resp = self + .rest_client + .execute(req_builder, operation, object_id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + + let status = resp.status(); + if status.is_success() { + resp.bytes().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + }) + } else { + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) } } /// Get the base endpoint URL for this namespace pub fn endpoint(&self) -> &str { - &self.reqwest_config.base_path + self.rest_client.base_path() } } @@ -399,16 +706,20 @@ impl LanceNamespace for RestNamespace { request: ListNamespacesRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - namespace_api::list_namespaces( - &self.reqwest_config, - &id, - Some(&self.delimiter), - request.page_token.as_deref(), - request.limit, - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/list", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.get_json(&path, &query, "list_namespaces", &id).await } async fn describe_namespace( @@ -416,10 
+727,11 @@ impl LanceNamespace for RestNamespace { request: DescribeNamespaceRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - namespace_api::describe_namespace(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/describe", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "describe_namespace", &id) .await - .map_err(convert_api_error) } async fn create_namespace( @@ -427,79 +739,93 @@ impl LanceNamespace for RestNamespace { request: CreateNamespaceRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - namespace_api::create_namespace(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/create", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_namespace", &id) .await - .map_err(convert_api_error) } async fn drop_namespace(&self, request: DropNamespaceRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - namespace_api::drop_namespace(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/drop", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "drop_namespace", &id) .await - .map_err(convert_api_error) } async fn namespace_exists(&self, request: NamespaceExistsRequest) -> Result<()> { let id = object_id_str(&request.id, &self.delimiter)?; - - namespace_api::namespace_exists(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/exists", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json_no_content(&path, &query, &request, "namespace_exists", &id) 
.await - .map_err(convert_api_error) } async fn list_tables(&self, request: ListTablesRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::list_tables( - &self.reqwest_config, - &id, - Some(&self.delimiter), - request.page_token.as_deref(), - request.limit, - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/namespace/{}/table/list", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.get_json(&path, &query, "list_tables", &id).await } async fn describe_table(&self, request: DescribeTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::describe_table( - &self.reqwest_config, - &id, - request.clone(), - Some(&self.delimiter), - request.with_table_uri, - request.load_detailed_metadata, - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/describe", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let with_uri_str; + if let Some(with_uri) = request.with_table_uri { + with_uri_str = with_uri.to_string(); + query.push(("with_table_uri", with_uri_str.as_str())); + } + let detailed_str; + if let Some(detailed) = request.load_detailed_metadata { + detailed_str = detailed.to_string(); + query.push(("load_detailed_metadata", detailed_str.as_str())); + } + self.post_json(&path, &query, &request, "describe_table", &id) + .await } async fn register_table(&self, request: RegisterTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::register_table(&self.reqwest_config, &id, request, 
Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/register", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "register_table", &id) .await - .map_err(convert_api_error) } async fn table_exists(&self, request: TableExistsRequest) -> Result<()> { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::table_exists(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/exists", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json_no_content(&path, &query, &request, "table_exists", &id) .await - .map_err(convert_api_error) } async fn drop_table(&self, request: DropTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::drop_table(&self.reqwest_config, &id, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/drop", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "drop_table", &id) .await - .map_err(convert_api_error) } async fn deregister_table( @@ -507,18 +833,19 @@ impl LanceNamespace for RestNamespace { request: DeregisterTableRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::deregister_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/deregister", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "deregister_table", &id) .await - .map_err(convert_api_error) } async fn count_table_rows(&self, request: CountTableRowsRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::count_table_rows(&self.reqwest_config, &id, request, Some(&self.delimiter)) - .await - .map_err(convert_api_error) 
+ let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/count_rows", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.get_json(&path, &query, "count_table_rows", &id).await } async fn create_table( @@ -527,16 +854,16 @@ impl LanceNamespace for RestNamespace { request_data: Bytes, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::create_table( - &self.reqwest_config, - &id, - request_data.to_vec(), - Some(&self.delimiter), - request.mode.as_deref(), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/create", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let mode_str; + if let Some(ref mode) = request.mode { + mode_str = mode.clone(); + query.push(("mode", mode_str.as_str())); + } + self.post_binary_json(&path, &query, request_data.to_vec(), "create_table", &id) + .await } async fn create_empty_table( @@ -544,18 +871,20 @@ impl LanceNamespace for RestNamespace { request: CreateEmptyTableRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::create_empty_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/create-empty", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_empty_table", &id) .await - .map_err(convert_api_error) } async fn declare_table(&self, request: DeclareTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::declare_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/declare", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "declare_table", &id) .await - .map_err(convert_api_error) } async fn insert_into_table( @@ 
-564,16 +893,22 @@ impl LanceNamespace for RestNamespace { request_data: Bytes, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::insert_into_table( - &self.reqwest_config, - &id, + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/insert", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let mode_str; + if let Some(ref mode) = request.mode { + mode_str = mode.clone(); + query.push(("mode", mode_str.as_str())); + } + self.post_binary_json( + &path, + &query, request_data.to_vec(), - Some(&self.delimiter), - request.mode.as_deref(), + "insert_into_table", + &id, ) .await - .map_err(convert_api_error) } async fn merge_insert_into_table( @@ -582,36 +917,72 @@ impl LanceNamespace for RestNamespace { request_data: Bytes, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); let on = request.on.as_deref().ok_or_else(|| Error::Namespace { source: "'on' field is required for merge insert".into(), location: snafu::location!(), })?; - table_api::merge_insert_into_table( - &self.reqwest_config, - &id, - on, + let path = format!("/v1/table/{}/merge_insert", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str()), ("on", on)]; + + let when_matched_update_all_str; + if let Some(v) = request.when_matched_update_all { + when_matched_update_all_str = v.to_string(); + query.push(( + "when_matched_update_all", + when_matched_update_all_str.as_str(), + )); + } + if let Some(ref v) = request.when_matched_update_all_filt { + query.push(("when_matched_update_all_filt", v.as_str())); + } + let when_not_matched_insert_all_str; + if let Some(v) = request.when_not_matched_insert_all { + when_not_matched_insert_all_str = v.to_string(); + query.push(( + "when_not_matched_insert_all", + when_not_matched_insert_all_str.as_str(), + )); + } + let when_not_matched_by_source_delete_str; + if let Some(v) = request.when_not_matched_by_source_delete { + 
when_not_matched_by_source_delete_str = v.to_string(); + query.push(( + "when_not_matched_by_source_delete", + when_not_matched_by_source_delete_str.as_str(), + )); + } + if let Some(ref v) = request.when_not_matched_by_source_delete_filt { + query.push(("when_not_matched_by_source_delete_filt", v.as_str())); + } + if let Some(ref v) = request.timeout { + query.push(("timeout", v.as_str())); + } + let use_index_str; + if let Some(v) = request.use_index { + use_index_str = v.to_string(); + query.push(("use_index", use_index_str.as_str())); + } + + self.post_binary_json( + &path, + &query, request_data.to_vec(), - Some(&self.delimiter), - request.when_matched_update_all, - request.when_matched_update_all_filt.as_deref(), - request.when_not_matched_insert_all, - request.when_not_matched_by_source_delete, - request.when_not_matched_by_source_delete_filt.as_deref(), - request.timeout.as_deref(), - request.use_index, + "merge_insert_into_table", + &id, ) .await - .map_err(convert_api_error) } async fn update_table(&self, request: UpdateTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::update_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/update", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "update_table", &id) .await - .map_err(convert_api_error) } async fn delete_from_table( @@ -619,27 +990,52 @@ impl LanceNamespace for RestNamespace { request: DeleteFromTableRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::delete_from_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/delete", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "delete_from_table", &id) .await - 
.map_err(convert_api_error) } async fn query_table(&self, request: QueryTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/query", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + + let url = format!("{}{}", self.rest_client.base_path(), path); + let req_builder = self + .rest_client + .client() + .post(&url) + .query(&query) + .json(&request); + + let resp = self + .rest_client + .execute(req_builder, "query_table", &id) + .await + .map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; - let response = - table_api::query_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) - .await - .map_err(convert_api_error)?; - - // Convert response to bytes - let bytes = response.bytes().await.map_err(|e| Error::IO { - source: box_error(e), - location: snafu::location!(), - })?; - - Ok(bytes) + let status = resp.status(); + if status.is_success() { + resp.bytes().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + }) + } else { + let content = resp.text().await.map_err(|e| Error::IO { + source: box_error(e), + location: snafu::location!(), + })?; + Err(Error::Namespace { + source: format!("Response error: status={}, content={}", status, content).into(), + location: snafu::location!(), + }) + } } async fn create_table_index( @@ -647,10 +1043,11 @@ impl LanceNamespace for RestNamespace { request: CreateTableIndexRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::create_table_index(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/create_index", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_table_index", &id) .await - .map_err(convert_api_error) } async fn list_table_indices( @@ -658,10 
+1055,11 @@ impl LanceNamespace for RestNamespace { request: ListTableIndicesRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::list_table_indices(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/index/list", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "list_table_indices", &id) .await - .map_err(convert_api_error) } async fn describe_table_index_stats( @@ -669,20 +1067,16 @@ impl LanceNamespace for RestNamespace { request: DescribeTableIndexStatsRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - // Note: The index_name parameter seems to be missing from the request structure - // This might need to be adjusted based on the actual API - let index_name = ""; // This should come from somewhere in the request - - table_api::describe_table_index_stats( - &self.reqwest_config, - &id, - index_name, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let index_name = request.index_name.as_deref().unwrap_or(""); + let path = format!( + "/v1/table/{}/index/{}/stats", + encoded_id, + urlencode(index_name) + ); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "describe_table_index_stats", &id) + .await } async fn describe_transaction( @@ -690,15 +1084,11 @@ impl LanceNamespace for RestNamespace { request: DescribeTransactionRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - transaction_api::describe_transaction( - &self.reqwest_config, - &id, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/transaction/{}/describe", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, 
"describe_transaction", &id) + .await } async fn alter_transaction( @@ -706,15 +1096,11 @@ impl LanceNamespace for RestNamespace { request: AlterTransactionRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - transaction_api::alter_transaction( - &self.reqwest_config, - &id, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/transaction/{}/alter", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "alter_transaction", &id) + .await } async fn create_table_scalar_index( @@ -722,15 +1108,11 @@ impl LanceNamespace for RestNamespace { request: CreateTableIndexRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::create_table_scalar_index( - &self.reqwest_config, - &id, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/create_scalar_index", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_table_scalar_index", &id) + .await } async fn drop_table_index( @@ -738,39 +1120,50 @@ impl LanceNamespace for RestNamespace { request: DropTableIndexRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - + let encoded_id = urlencode(&id); let index_name = request.index_name.as_deref().unwrap_or(""); - - table_api::drop_table_index(&self.reqwest_config, &id, index_name, Some(&self.delimiter)) + let path = format!( + "/v1/table/{}/index/{}/drop", + encoded_id, + urlencode(index_name) + ); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "drop_table_index", &id) .await - .map_err(convert_api_error) } async fn list_all_tables(&self, request: ListTablesRequest) -> Result { - table_api::list_all_tables( - &self.reqwest_config, - 
Some(&self.delimiter), - request.page_token.as_deref(), - request.limit, - ) - .await - .map_err(convert_api_error) + let path = "/v1/table"; + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.get_json(path, &query, "list_all_tables", "").await } async fn restore_table(&self, request: RestoreTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::restore_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/restore", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "restore_table", &id) .await - .map_err(convert_api_error) } async fn rename_table(&self, request: RenameTableRequest) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::rename_table(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/rename", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "rename_table", &id) .await - .map_err(convert_api_error) } async fn list_table_versions( @@ -778,16 +1171,21 @@ impl LanceNamespace for RestNamespace { request: ListTableVersionsRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::list_table_versions( - &self.reqwest_config, - &id, - Some(&self.delimiter), - request.page_token.as_deref(), - request.limit, - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/version/list", encoded_id); + let mut query = 
vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.get_json(&path, &query, "list_table_versions", &id) + .await } async fn update_table_schema_metadata( @@ -795,18 +1193,19 @@ impl LanceNamespace for RestNamespace { request: UpdateTableSchemaMetadataRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/schema_metadata/update", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; let metadata = request.metadata.unwrap_or_default(); - - let result = table_api::update_table_schema_metadata( - &self.reqwest_config, - &id, - metadata, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error)?; - + let result: HashMap = self + .post_json( + &path, + &query, + &metadata, + "update_table_schema_metadata", + &id, + ) + .await?; Ok(UpdateTableSchemaMetadataResponse { metadata: Some(result), ..Default::default() @@ -818,10 +1217,11 @@ impl LanceNamespace for RestNamespace { request: GetTableStatsRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::get_table_stats(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/stats", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "get_table_stats", &id) .await - .map_err(convert_api_error) } async fn explain_table_query_plan( @@ -829,15 +1229,11 @@ impl LanceNamespace for RestNamespace { request: ExplainTableQueryPlanRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::explain_table_query_plan( - 
&self.reqwest_config, - &id, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/explain_plan", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "explain_table_query_plan", &id) + .await } async fn analyze_table_query_plan( @@ -845,15 +1241,11 @@ impl LanceNamespace for RestNamespace { request: AnalyzeTableQueryPlanRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::analyze_table_query_plan( - &self.reqwest_config, - &id, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/analyze_plan", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "analyze_table_query_plan", &id) + .await } async fn alter_table_add_columns( @@ -861,15 +1253,11 @@ impl LanceNamespace for RestNamespace { request: AlterTableAddColumnsRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::alter_table_add_columns( - &self.reqwest_config, - &id, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/add_columns", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "alter_table_add_columns", &id) + .await } async fn alter_table_alter_columns( @@ -877,15 +1265,11 @@ impl LanceNamespace for RestNamespace { request: AlterTableAlterColumnsRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::alter_table_alter_columns( - &self.reqwest_config, - &id, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/alter_columns", encoded_id); + 
let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "alter_table_alter_columns", &id) + .await } async fn alter_table_drop_columns( @@ -893,15 +1277,11 @@ impl LanceNamespace for RestNamespace { request: AlterTableDropColumnsRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - table_api::alter_table_drop_columns( - &self.reqwest_config, - &id, - request, - Some(&self.delimiter), - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/drop_columns", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "alter_table_drop_columns", &id) + .await } async fn list_table_tags( @@ -909,16 +1289,20 @@ impl LanceNamespace for RestNamespace { request: ListTableTagsRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - tag_api::list_table_tags( - &self.reqwest_config, - &id, - Some(&self.delimiter), - request.page_token.as_deref(), - request.limit, - ) - .await - .map_err(convert_api_error) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/tags/list", encoded_id); + let mut query = vec![("delimiter", self.delimiter.as_str())]; + let page_token_str; + if let Some(ref pt) = request.page_token { + page_token_str = pt.clone(); + query.push(("page_token", page_token_str.as_str())); + } + let limit_str; + if let Some(limit) = request.limit { + limit_str = limit.to_string(); + query.push(("limit", limit_str.as_str())); + } + self.get_json(&path, &query, "list_table_tags", &id).await } async fn get_table_tag_version( @@ -926,10 +1310,11 @@ impl LanceNamespace for RestNamespace { request: GetTableTagVersionRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - tag_api::get_table_tag_version(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = 
format!("/v1/table/{}/tags/version", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "get_table_tag_version", &id) .await - .map_err(convert_api_error) } async fn create_table_tag( @@ -937,10 +1322,11 @@ impl LanceNamespace for RestNamespace { request: CreateTableTagRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - tag_api::create_table_tag(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/tags/create", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "create_table_tag", &id) .await - .map_err(convert_api_error) } async fn delete_table_tag( @@ -948,10 +1334,11 @@ impl LanceNamespace for RestNamespace { request: DeleteTableTagRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - tag_api::delete_table_tag(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/tags/delete", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "delete_table_tag", &id) .await - .map_err(convert_api_error) } async fn update_table_tag( @@ -959,16 +1346,18 @@ impl LanceNamespace for RestNamespace { request: UpdateTableTagRequest, ) -> Result { let id = object_id_str(&request.id, &self.delimiter)?; - - tag_api::update_table_tag(&self.reqwest_config, &id, request, Some(&self.delimiter)) + let encoded_id = urlencode(&id); + let path = format!("/v1/table/{}/tags/update", encoded_id); + let query = [("delimiter", self.delimiter.as_str())]; + self.post_json(&path, &query, &request, "update_table_tag", &id) .await - .map_err(convert_api_error) } fn namespace_id(&self) -> String { format!( "RestNamespace {{ endpoint: {:?}, delimiter: {:?} }}", - self.reqwest_config.base_path, self.delimiter + 
self.rest_client.base_path(), + self.delimiter ) } } @@ -1153,10 +1542,7 @@ mod tests { .await; // Create namespace with mock server URL - let mut reqwest_config = Configuration::new(); - reqwest_config.base_path = mock_server.uri(); - - let namespace = RestNamespace::with_configuration("$".to_string(), reqwest_config); + let namespace = RestNamespaceBuilder::new(mock_server.uri()).build(); let request = ListNamespacesRequest { id: Some(vec!["test".to_string()]), @@ -1192,10 +1578,7 @@ mod tests { .await; // Create namespace with mock server URL - let mut reqwest_config = Configuration::new(); - reqwest_config.base_path = mock_server.uri(); - - let namespace = RestNamespace::with_configuration("$".to_string(), reqwest_config); + let namespace = RestNamespaceBuilder::new(mock_server.uri()).build(); let request = ListNamespacesRequest { id: Some(vec!["test".to_string()]), @@ -1228,10 +1611,7 @@ mod tests { .await; // Create namespace with mock server URL - let mut reqwest_config = Configuration::new(); - reqwest_config.base_path = mock_server.uri(); - - let namespace = RestNamespace::with_configuration("$".to_string(), reqwest_config); + let namespace = RestNamespaceBuilder::new(mock_server.uri()).build(); let request = CreateNamespaceRequest { id: Some(vec!["test".to_string(), "newnamespace".to_string()]), @@ -1264,10 +1644,7 @@ mod tests { .await; // Create namespace with mock server URL - let mut reqwest_config = Configuration::new(); - reqwest_config.base_path = mock_server.uri(); - - let namespace = RestNamespace::with_configuration("$".to_string(), reqwest_config); + let namespace = RestNamespaceBuilder::new(mock_server.uri()).build(); let request = CreateTableRequest { id: Some(vec![ @@ -1302,10 +1679,7 @@ mod tests { .await; // Create namespace with mock server URL - let mut reqwest_config = Configuration::new(); - reqwest_config.base_path = mock_server.uri(); - - let namespace = RestNamespace::with_configuration("$".to_string(), reqwest_config); + let 
namespace = RestNamespaceBuilder::new(mock_server.uri()).build(); let request = InsertIntoTableRequest { id: Some(vec![ @@ -1325,4 +1699,176 @@ mod tests { let response = result.unwrap(); assert_eq!(response.transaction_id, Some("txn-123".to_string())); } + + // Integration tests for DynamicContextProvider + + #[derive(Debug)] + struct TestContextProvider { + headers: HashMap, + } + + impl DynamicContextProvider for TestContextProvider { + fn provide_context(&self, _info: &OperationInfo) -> HashMap { + self.headers.clone() + } + } + + #[tokio::test] + async fn test_context_provider_headers_sent() { + let mock_server = MockServer::start().await; + + // Mock expects the context header + Mock::given(method("GET")) + .and(path("/v1/namespace/test/list")) + .and(wiremock::matchers::header( + "X-Context-Token", + "dynamic-token", + )) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "namespaces": [] + }))) + .mount(&mock_server) + .await; + + // Create context provider + let mut context_headers = HashMap::new(); + context_headers.insert( + "headers.X-Context-Token".to_string(), + "dynamic-token".to_string(), + ); + let provider = Arc::new(TestContextProvider { + headers: context_headers, + }); + + let namespace = RestNamespaceBuilder::new(mock_server.uri()) + .context_provider(provider) + .build(); + + let request = ListNamespacesRequest { + id: Some(vec!["test".to_string()]), + ..Default::default() + }; + + let result = namespace.list_namespaces(request).await; + assert!(result.is_ok(), "Failed: {:?}", result.err()); + } + + #[tokio::test] + async fn test_base_headers_merged_with_context_headers() { + let mock_server = MockServer::start().await; + + // Mock expects BOTH base header AND context header + Mock::given(method("GET")) + .and(path("/v1/namespace/test/list")) + .and(wiremock::matchers::header( + "Authorization", + "Bearer base-token", + )) + .and(wiremock::matchers::header( + "X-Context-Token", + "dynamic-token", + )) + 
.respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "namespaces": [] + }))) + .mount(&mock_server) + .await; + + // Create context provider + let mut context_headers = HashMap::new(); + context_headers.insert( + "headers.X-Context-Token".to_string(), + "dynamic-token".to_string(), + ); + let provider = Arc::new(TestContextProvider { + headers: context_headers, + }); + + // Create namespace with base header AND context provider + let namespace = RestNamespaceBuilder::new(mock_server.uri()) + .header("Authorization", "Bearer base-token") + .context_provider(provider) + .build(); + + let request = ListNamespacesRequest { + id: Some(vec!["test".to_string()]), + ..Default::default() + }; + + let result = namespace.list_namespaces(request).await; + assert!(result.is_ok(), "Failed: {:?}", result.err()); + } + + #[tokio::test] + async fn test_context_headers_override_base_headers() { + let mock_server = MockServer::start().await; + + // Mock expects the CONTEXT header value (not base) + Mock::given(method("GET")) + .and(path("/v1/namespace/test/list")) + .and(wiremock::matchers::header( + "Authorization", + "Bearer context-override-token", + )) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "namespaces": [] + }))) + .mount(&mock_server) + .await; + + // Context provider that overrides Authorization header + let mut context_headers = HashMap::new(); + context_headers.insert( + "headers.Authorization".to_string(), + "Bearer context-override-token".to_string(), + ); + let provider = Arc::new(TestContextProvider { + headers: context_headers, + }); + + // Create namespace with base header that will be overridden + let namespace = RestNamespaceBuilder::new(mock_server.uri()) + .header("Authorization", "Bearer base-token") + .context_provider(provider) + .build(); + + let request = ListNamespacesRequest { + id: Some(vec!["test".to_string()]), + ..Default::default() + }; + + let result = 
namespace.list_namespaces(request).await; + assert!(result.is_ok(), "Failed: {:?}", result.err()); + } + + #[tokio::test] + async fn test_no_context_provider_uses_base_headers_only() { + let mock_server = MockServer::start().await; + + // Mock expects only the base header + Mock::given(method("GET")) + .and(path("/v1/namespace/test/list")) + .and(wiremock::matchers::header( + "Authorization", + "Bearer base-only", + )) + .respond_with(ResponseTemplate::new(200).set_body_json(serde_json::json!({ + "namespaces": [] + }))) + .mount(&mock_server) + .await; + + // Create namespace WITHOUT context provider, only base headers + let namespace = RestNamespaceBuilder::new(mock_server.uri()) + .header("Authorization", "Bearer base-only") + .build(); + + let request = ListNamespacesRequest { + id: Some(vec!["test".to_string()]), + ..Default::default() + }; + + let result = namespace.list_namespaces(request).await; + assert!(result.is_ok(), "Failed: {:?}", result.err()); + } } diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index 4a12b92838a..899863793ff 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -2776,5 +2776,131 @@ mod tests { .unwrap(); assert_eq!(a_col.values(), &[100, 200]); } + + // ============================================================================ + // DynamicContextProvider Integration Test + // ============================================================================ + + use crate::context::{DynamicContextProvider, OperationInfo}; + use std::collections::HashMap; + + /// Test context provider that adds custom headers to every request. 
+ #[derive(Debug)] + struct TestDynamicContextProvider { + headers: HashMap, + } + + impl DynamicContextProvider for TestDynamicContextProvider { + fn provide_context(&self, _info: &OperationInfo) -> HashMap { + self.headers.clone() + } + } + + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] + async fn test_rest_namespace_with_context_provider() { + let temp_dir = TempDir::new().unwrap(); + let temp_path = temp_dir.path().to_str().unwrap().to_string(); + + // Create DirectoryNamespace backend with manifest enabled + let backend = DirectoryNamespaceBuilder::new(&temp_path) + .manifest_enabled(true) + .build() + .await + .unwrap(); + let backend = Arc::new(backend); + + // Start REST server + let config = RestAdapterConfig { + port: 0, + ..Default::default() + }; + + let server = RestAdapter::new(backend.clone(), config); + let server_handle = server.start().await.unwrap(); + let actual_port = server_handle.port(); + + // Create context provider that adds custom headers + let mut context_headers = HashMap::new(); + context_headers.insert( + "headers.X-Custom-Auth".to_string(), + "test-auth-token".to_string(), + ); + context_headers.insert( + "headers.X-Request-Source".to_string(), + "integration-test".to_string(), + ); + + let provider = Arc::new(TestDynamicContextProvider { + headers: context_headers, + }); + + // Create RestNamespace client with context provider and base headers + let server_url = format!("http://127.0.0.1:{}", actual_port); + let namespace = RestNamespaceBuilder::new(&server_url) + .delimiter("$") + .header("X-Base-Header", "base-value") + .context_provider(provider) + .build(); + + // Create a namespace - should work with context provider + let create_req = CreateNamespaceRequest { + id: Some(vec!["context_test_ns".to_string()]), + properties: None, + mode: None, + identity: None, + context: None, + }; + let result = namespace.create_namespace(create_req).await; + assert!(result.is_ok(), "Failed to create namespace: {:?}", result); + 
+ // List namespaces - should also work + let list_req = ListNamespacesRequest { + id: Some(vec![]), + limit: Some(10), + page_token: None, + identity: None, + context: None, + }; + let result = namespace.list_namespaces(list_req).await; + assert!(result.is_ok(), "Failed to list namespaces: {:?}", result); + let response = result.unwrap(); + assert!( + response.namespaces.contains(&"context_test_ns".to_string()), + "Namespace not found in list" + ); + + // Create a table - should work with context provider + let table_data = create_test_arrow_data(); + let create_table_req = CreateTableRequest { + id: Some(vec![ + "context_test_ns".to_string(), + "test_table".to_string(), + ]), + mode: Some("create".to_string()), + identity: None, + context: None, + }; + let result = namespace.create_table(create_table_req, table_data).await; + assert!(result.is_ok(), "Failed to create table: {:?}", result); + + // Describe the table - should work with context provider + let describe_req = DescribeTableRequest { + id: Some(vec![ + "context_test_ns".to_string(), + "test_table".to_string(), + ]), + with_table_uri: None, + load_detailed_metadata: None, + vend_credentials: None, + version: None, + identity: None, + context: None, + }; + let result = namespace.describe_table(describe_req).await; + assert!(result.is_ok(), "Failed to describe table: {:?}", result); + + // Cleanup + server_handle.shutdown(); + } } } From 0a2eef7a924500d624edb7b91ebf4b98082134ff Mon Sep 17 00:00:00 2001 From: Jack Ye Date: Tue, 20 Jan 2026 23:08:42 -0800 Subject: [PATCH 8/8] feat: introduce storage options accessor --- java/lance-jni/src/blocking_dataset.rs | 191 +++--- java/lance-jni/src/file_reader.rs | 4 +- java/lance-jni/src/file_writer.rs | 4 +- java/lance-jni/src/fragment.rs | 10 - java/lance-jni/src/traits.rs | 22 + java/lance-jni/src/transaction.rs | 52 +- java/lance-jni/src/utils.rs | 42 +- java/src/main/java/org/lance/Dataset.java | 57 +- java/src/main/java/org/lance/Fragment.java | 13 +- 
.../java/org/lance/OpenDatasetBuilder.java | 20 +- java/src/main/java/org/lance/ReadOptions.java | 23 - java/src/main/java/org/lance/Transaction.java | 25 +- .../java/org/lance/WriteDatasetBuilder.java | 20 - .../java/org/lance/WriteFragmentBuilder.java | 16 - java/src/main/java/org/lance/WriteParams.java | 18 +- .../org/lance/NamespaceIntegrationTest.java | 21 +- python/python/lance/__init__.py | 26 +- python/python/lance/dataset.py | 83 ++- python/python/lance/file.py | 15 - python/python/lance/lance/__init__.pyi | 3 - .../tests/test_namespace_integration.py | 11 +- python/src/dataset.rs | 141 +++-- python/src/file.rs | 72 +-- python/src/lib.rs | 1 + python/src/storage_options.rs | 130 +++- rust/lance-io/src/object_store.rs | 82 ++- rust/lance-io/src/object_store/providers.rs | 2 +- .../src/object_store/providers/aws.rs | 575 +++++++++-------- .../src/object_store/providers/azure.rs | 31 +- .../src/object_store/providers/gcp.rs | 21 +- .../src/object_store/providers/huggingface.rs | 16 +- .../src/object_store/providers/local.rs | 4 +- .../src/object_store/providers/memory.rs | 6 +- .../src/object_store/providers/oss.rs | 3 +- .../src/object_store/storage_options.rs | 582 +++++++++++++++++- rust/lance-namespace-impls/src/dir.rs | 24 +- .../lance-namespace-impls/src/dir/manifest.rs | 6 +- .../lance-namespace-impls/src/rest_adapter.rs | 15 +- rust/lance-table/src/io/commit.rs | 14 +- rust/lance/src/dataset.rs | 151 +++-- rust/lance/src/dataset/builder.rs | 145 ++++- rust/lance/src/dataset/fragment/write.rs | 8 +- rust/lance/src/io.rs | 5 +- rust/lance/src/io/commit/s3_test.rs | 10 +- 44 files changed, 1744 insertions(+), 976 deletions(-) diff --git a/java/lance-jni/src/blocking_dataset.rs b/java/lance-jni/src/blocking_dataset.rs index b15132ad00b..ad14d02d6da 100644 --- a/java/lance-jni/src/blocking_dataset.rs +++ b/java/lance-jni/src/blocking_dataset.rs @@ -59,16 +59,32 @@ pub struct BlockingDataset { } impl BlockingDataset { - /// Get the storage options provider 
that was used when opening this dataset - pub fn get_storage_options_provider(&self) -> Option> { - self.inner.storage_options_provider() + /// Get the initial storage options used to open this dataset. + /// + /// Returns the options that were provided when the dataset was opened, + /// without any refresh from the provider. Returns None if no storage options + /// were provided. + pub fn initial_storage_options(&self) -> Option> { + self.inner.initial_storage_options().cloned() + } + + /// Get the latest storage options, potentially refreshed from the provider. + /// + /// If a storage options provider was configured and credentials are expiring, + /// this will refresh them. + pub fn latest_storage_options(&self) -> Result>> { + RT.block_on(async { self.inner.latest_storage_options().await }) + .map(|opt| opt.map(|opts| opts.0)) + .map_err(|e| Error::io_error(e.to_string())) } pub fn drop(uri: &str, storage_options: HashMap) -> Result<()> { RT.block_on(async move { let registry = Arc::new(ObjectStoreRegistry::default()); let object_store_params = ObjectStoreParams { - storage_options: Some(storage_options.clone()), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (object_store, path) = @@ -100,20 +116,29 @@ impl BlockingDataset { storage_options: HashMap, serialized_manifest: Option<&[u8]>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, ) -> Result { - let mut store_params = ObjectStoreParams { + // Create storage options accessor from storage_options and provider + let accessor = match (storage_options.is_empty(), storage_options_provider) { + (false, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider( + storage_options, + provider, + ), + )), + (false, None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), + (true, 
Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_provider(provider), + )), + (true, None) => None, + }; + + let store_params = ObjectStoreParams { block_size: block_size.map(|size| size as usize), - storage_options: Some(storage_options.clone()), + storage_options_accessor: accessor, ..Default::default() }; - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } - if let Some(provider) = storage_options_provider.clone() { - store_params.storage_options_provider = Some(provider); - } let params = ReadParams { index_cache_size_bytes: index_cache_size_bytes as usize, metadata_cache_size_bytes: metadata_cache_size_bytes as usize, @@ -126,14 +151,6 @@ impl BlockingDataset { if let Some(ver) = version { builder = builder.with_version(ver as u64); } - builder = builder.with_storage_options(storage_options); - if let Some(provider) = storage_options_provider.clone() { - builder = builder.with_storage_options_provider(provider) - } - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - builder = builder - .with_s3_credentials_refresh_offset(std::time::Duration::from_secs(offset_seconds)); - } if let Some(serialized_manifest) = serialized_manifest { builder = builder.with_serialized_manifest(serialized_manifest)?; @@ -149,12 +166,19 @@ impl BlockingDataset { read_version: Option, storage_options: HashMap, ) -> Result { + let accessor = if storage_options.is_empty() { + None + } else { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )) + }; let inner = RT.block_on(Dataset::commit( uri, operation, read_version, Some(ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: accessor, ..Default::default() }), None, @@ -336,7 +360,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiSchema<'local>( enable_stable_row_ids: JObject, // 
Optional data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -351,7 +374,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiSchema<'local>( enable_stable_row_ids, data_storage_version, storage_options_obj, - s3_credentials_refresh_offset_seconds_obj ) ) } @@ -368,7 +390,6 @@ fn inner_create_with_ffi_schema<'local>( enable_stable_row_ids: JObject, // Optional data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; let c_schema = unsafe { FFI_ArrowSchema::from_raw(c_schema_ptr) }; @@ -386,7 +407,6 @@ fn inner_create_with_ffi_schema<'local>( data_storage_version, storage_options_obj, JObject::null(), // No provider for schema-only creation - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -418,7 +438,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStream<'local>( enable_stable_row_ids: JObject, // Optional data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -434,7 +453,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStream<'local>( data_storage_version, storage_options_obj, JObject::null(), - s3_credentials_refresh_offset_seconds_obj ) ) } @@ -453,7 +471,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStreamAndProvider<'lo data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -469,7 +486,6 @@ pub extern "system" fn Java_org_lance_Dataset_createWithFfiStreamAndProvider<'lo 
data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ) ) } @@ -487,7 +503,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -502,7 +517,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -519,7 +533,6 @@ fn create_dataset<'local>( data_storage_version: JObject, storage_options_obj: JObject, storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, reader: impl RecordBatchReader + Send + 'static, ) -> Result> { let path_str = path.extract(env)?; @@ -534,7 +547,6 @@ fn create_dataset<'local>( &data_storage_version, &storage_options_obj, &storage_options_provider_obj, - &s3_credentials_refresh_offset_seconds_obj, )?; let dataset = BlockingDataset::write(reader, &path_str, Some(write_params))?; @@ -929,7 +941,6 @@ pub extern "system" fn Java_org_lance_Dataset_openNative<'local>( storage_options_obj: JObject, // Map serialized_manifest: JObject, // Optional storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw!( env, @@ -943,7 +954,6 @@ pub extern "system" fn Java_org_lance_Dataset_openNative<'local>( storage_options_obj, serialized_manifest, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ) ) } @@ -959,7 +969,6 @@ fn inner_open_native<'local>( storage_options_obj: JObject, // Map serialized_manifest: JObject, // Optional storage_options_provider_obj: JObject, // 
Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let path_str: String = path.extract(env)?; let version = env.get_int_opt(&version_obj)?; @@ -994,35 +1003,6 @@ fn inner_open_native<'local>( let storage_options_provider_arc = storage_options_provider.map(|v| Arc::new(v) as Arc); - // Extract s3_credentials_refresh_offset_seconds - let s3_credentials_refresh_offset_seconds = - if !s3_credentials_refresh_offset_seconds_obj.is_null() { - let is_present = env - .call_method( - &s3_credentials_refresh_offset_seconds_obj, - "isPresent", - "()Z", - &[], - )? - .z()?; - if is_present { - let value = env - .call_method( - &s3_credentials_refresh_offset_seconds_obj, - "get", - "()Ljava/lang/Object;", - &[], - )? - .l()?; - let long_value = env.call_method(&value, "longValue", "()J", &[])?.j()?; - Some(long_value as u64) - } else { - None - } - } else { - None - }; - let serialized_manifest = env.get_bytes_opt(&serialized_manifest)?; let dataset = BlockingDataset::open( &path_str, @@ -1033,7 +1013,6 @@ fn inner_open_native<'local>( storage_options, serialized_manifest, storage_options_provider_arc, - s3_credentials_refresh_offset_seconds, )?; dataset.into_java(env) } @@ -1229,6 +1208,58 @@ fn inner_latest_version_id(env: &mut JNIEnv, java_dataset: JObject) -> Result( + mut env: JNIEnv<'local>, + java_dataset: JObject, +) -> JObject<'local> { + ok_or_throw!( + env, + inner_get_initial_storage_options(&mut env, java_dataset) + ) +} + +fn inner_get_initial_storage_options<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject, +) -> Result> { + let storage_options = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + dataset_guard.initial_storage_options() + }; + match storage_options { + Some(opts) => opts.into_java(env), + None => Ok(JObject::null()), + } +} + +#[no_mangle] +pub extern "system" fn Java_org_lance_Dataset_nativeGetLatestStorageOptions<'local>( + mut env: 
JNIEnv<'local>, + java_dataset: JObject, +) -> JObject<'local> { + ok_or_throw!( + env, + inner_get_latest_storage_options(&mut env, java_dataset) + ) +} + +fn inner_get_latest_storage_options<'local>( + env: &mut JNIEnv<'local>, + java_dataset: JObject, +) -> Result> { + let storage_options = { + let dataset_guard = + unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; + dataset_guard.latest_storage_options()? + }; + match storage_options { + Some(opts) => opts.into_java(env), + None => Ok(JObject::null()), + } +} + #[no_mangle] pub extern "system" fn Java_org_lance_Dataset_nativeCheckoutLatest( mut env: JNIEnv, @@ -1358,20 +1389,16 @@ fn inner_shallow_clone<'local>( let new_ds = { let mut dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(java_dataset, NATIVE_DATASET) }?; - RT.block_on( - dataset_guard.inner.shallow_clone( - &target_path_str, - reference, - storage_options - .map(|options| { - Some(ObjectStoreParams { - storage_options: Some(options), - ..Default::default() - }) - }) - .unwrap_or(None), - ), - )? + RT.block_on(dataset_guard.inner.shallow_clone( + &target_path_str, + reference, + storage_options.map(|options| ObjectStoreParams { + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(options), + )), + ..Default::default() + }), + ))? 
}; BlockingDataset { inner: new_ds }.into_java(env) diff --git a/java/lance-jni/src/file_reader.rs b/java/lance-jni/src/file_reader.rs index 11591b3acea..ccaac121579 100644 --- a/java/lance-jni/src/file_reader.rs +++ b/java/lance-jni/src/file_reader.rs @@ -112,7 +112,9 @@ fn inner_open<'local>( let storage_options = to_rust_map(env, &jmap)?; let reader = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params( diff --git a/java/lance-jni/src/file_writer.rs b/java/lance-jni/src/file_writer.rs index 600d7de2845..ebc5b1c328b 100644 --- a/java/lance-jni/src/file_writer.rs +++ b/java/lance-jni/src/file_writer.rs @@ -94,7 +94,9 @@ fn inner_open<'local>( let writer = RT.block_on(async move { let object_params = ObjectStoreParams { - storage_options: Some(storage_options), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), ..Default::default() }; let (obj_store, path) = ObjectStore::from_uri_and_params( diff --git a/java/lance-jni/src/fragment.rs b/java/lance-jni/src/fragment.rs index 775ad0d906d..72377413c26 100644 --- a/java/lance-jni/src/fragment.rs +++ b/java/lance-jni/src/fragment.rs @@ -91,7 +91,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'local> { ok_or_throw_with_return!( env, @@ -108,7 +107,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiArray<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ), 
JObject::default() ) @@ -128,7 +126,6 @@ fn inner_create_with_ffi_array<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let c_array_ptr = arrow_array_addr as *mut FFI_ArrowArray; let c_schema_ptr = arrow_schema_addr as *mut FFI_ArrowSchema; @@ -154,7 +151,6 @@ fn inner_create_with_ffi_array<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -173,7 +169,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> JObject<'a> { ok_or_throw_with_return!( env, @@ -189,7 +184,6 @@ pub extern "system" fn Java_org_lance_Fragment_createWithFfiStream<'a>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj ), JObject::null() ) @@ -208,7 +202,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: JObject, // Optional ) -> Result> { let stream_ptr = arrow_array_stream_addr as *mut FFI_ArrowArrayStream; let reader = unsafe { ArrowArrayStreamReader::from_raw(stream_ptr) }?; @@ -224,7 +217,6 @@ fn inner_create_with_ffi_stream<'local>( data_storage_version, storage_options_obj, storage_options_provider_obj, - s3_credentials_refresh_offset_seconds_obj, reader, ) } @@ -241,7 +233,6 @@ fn create_fragment<'a>( data_storage_version: JObject, // Optional storage_options_obj: JObject, // Map storage_options_provider_obj: JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: 
JObject, // Optional source: impl StreamingWriteSource, ) -> Result> { let path_str = dataset_uri.extract(env)?; @@ -256,7 +247,6 @@ fn create_fragment<'a>( &data_storage_version, &storage_options_obj, &storage_options_provider_obj, - &s3_credentials_refresh_offset_seconds_obj, )?; let fragments = RT.block_on(FileFragment::create_fragments( diff --git a/java/lance-jni/src/traits.rs b/java/lance-jni/src/traits.rs index 7da64d453c2..ebc53b1679a 100644 --- a/java/lance-jni/src/traits.rs +++ b/java/lance-jni/src/traits.rs @@ -1,6 +1,8 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors +use std::collections::HashMap; + use jni::objects::{JIntArray, JLongArray, JMap, JObject, JString, JValue, JValueGen}; use jni::JNIEnv; @@ -224,6 +226,26 @@ impl IntoJava for &String { } } +impl IntoJava for HashMap { + fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> Result> { + let hash_map = env.new_object("java/util/HashMap", "()V", &[])?; + for (key, value) in self { + let java_key = env.new_string(&key)?; + let java_value = env.new_string(&value)?; + env.call_method( + &hash_map, + "put", + "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;", + &[ + JValueGen::Object(&java_key.into()), + JValueGen::Object(&java_value.into()), + ], + )?; + } + Ok(hash_map) + } +} + impl IntoJava for JLance> { fn into_java<'a>(self, env: &mut JNIEnv<'a>) -> Result> { let obj = match self.0 { diff --git a/java/lance-jni/src/transaction.rs b/java/lance-jni/src/transaction.rs index 32ffe3c99e0..d28ee7865a1 100644 --- a/java/lance-jni/src/transaction.rs +++ b/java/lance-jni/src/transaction.rs @@ -674,27 +674,49 @@ fn inner_commit_transaction<'local>( .call_method(&java_transaction, "writeParams", "()Ljava/util/Map;", &[])? 
.l()?; let write_param_jmap = JMap::from_env(env, &write_param_jobj)?; - let mut write_param = to_rust_map(env, &write_param_jmap)?; + let write_param = to_rust_map(env, &write_param_jmap)?; - // Extract s3_credentials_refresh_offset_seconds from write_param - let s3_credentials_refresh_offset = write_param - .remove("s3_credentials_refresh_offset_seconds") - .and_then(|v| v.parse::().ok()) - .map(std::time::Duration::from_secs) - .unwrap_or_else(|| std::time::Duration::from_secs(10)); - - // Get the Dataset's storage_options_provider - let storage_options_provider = { + // Get the Dataset's storage_options_accessor and merge with write_param + let storage_options_accessor = { let dataset_guard = unsafe { env.get_rust_field::<_, _, BlockingDataset>(&java_dataset, NATIVE_DATASET) }?; - dataset_guard.get_storage_options_provider() + let existing_accessor = dataset_guard.inner.storage_options_accessor(); + + // Merge write_param with existing accessor's initial options + match existing_accessor { + Some(accessor) => { + let mut merged = accessor + .initial_storage_options() + .cloned() + .unwrap_or_default(); + merged.extend(write_param); + if let Some(provider) = accessor.provider().cloned() { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider( + merged, provider, + ), + )) + } else { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(merged), + )) + } + } + None => { + if !write_param.is_empty() { + Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(write_param), + )) + } else { + None + } + } + } }; - // Build ObjectStoreParams using write_param for storage_options and provider from Dataset + // Build ObjectStoreParams using the merged accessor let store_params = ObjectStoreParams { - storage_options: Some(write_param), - storage_options_provider, - s3_credentials_refresh_offset, + storage_options_accessor, ..Default::default() }; diff --git a/java/lance-jni/src/utils.rs 
b/java/lance-jni/src/utils.rs index dc6f1e6e60f..8a266791f10 100644 --- a/java/lance-jni/src/utils.rs +++ b/java/lance-jni/src/utils.rs @@ -48,7 +48,6 @@ pub fn extract_write_params( data_storage_version: &JObject, storage_options_obj: &JObject, storage_options_provider_obj: &JObject, // Optional - s3_credentials_refresh_offset_seconds_obj: &JObject, // Optional ) -> Result { let mut write_params = WriteParams::default(); @@ -76,26 +75,31 @@ pub fn extract_write_params( extract_storage_options(env, storage_options_obj)?; // Extract storage options provider if present - let storage_options_provider = env.get_optional(storage_options_provider_obj, |env, obj| { - let provider_obj = env - .call_method(obj, "get", "()Ljava/lang/Object;", &[])? - .l()?; - JavaStorageOptionsProvider::new(env, provider_obj) - })?; - - let storage_options_provider_arc: Option> = - storage_options_provider.map(|v| Arc::new(v) as Arc); - - // Extract s3_credentials_refresh_offset_seconds if present - let s3_credentials_refresh_offset = env - .get_long_opt(s3_credentials_refresh_offset_seconds_obj)? - .map(|v| std::time::Duration::from_secs(v as u64)) - .unwrap_or_else(|| std::time::Duration::from_secs(10)); + let storage_options_provider: Option> = env + .get_optional(storage_options_provider_obj, |env, optional_obj| { + let provider_obj = env + .call_method(optional_obj, "get", "()Ljava/lang/Object;", &[])? + .l()?; + JavaStorageOptionsProvider::new(env, provider_obj) + })? 
+ .map(|p| Arc::new(p) as Arc); + + // Create storage options accessor from storage_options and provider + let accessor = match (storage_options.is_empty(), storage_options_provider) { + (false, Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider(storage_options, provider), + )), + (false, None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(storage_options), + )), + (true, Some(provider)) => Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( + provider, + ))), + (true, None) => None, + }; write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(storage_options), - storage_options_provider: storage_options_provider_arc, - s3_credentials_refresh_offset, + storage_options_accessor: accessor, ..Default::default() }); Ok(write_params) diff --git a/java/src/main/java/org/lance/Dataset.java b/java/src/main/java/org/lance/Dataset.java index 21572214eda..0249732eb00 100644 --- a/java/src/main/java/org/lance/Dataset.java +++ b/java/src/main/java/org/lance/Dataset.java @@ -139,8 +139,7 @@ public static Dataset create( params.getMode(), params.getEnableStableRowIds(), params.getDataStorageVersion(), - params.getStorageOptions(), - params.getS3CredentialsRefreshOffsetSeconds()); + params.getStorageOptions()); dataset.allocator = allocator; return dataset; } @@ -198,8 +197,7 @@ static Dataset create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds()); + Optional.ofNullable(storageOptionsProvider)); dataset.allocator = allocator; return dataset; } @@ -213,8 +211,7 @@ private static native Dataset createWithFfiSchema( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds); + Map storageOptions); private static native Dataset createWithFfiStream( long 
arrowStreamMemoryAddress, @@ -225,8 +222,7 @@ private static native Dataset createWithFfiStream( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds); + Map storageOptions); private static native Dataset createWithFfiStreamAndProvider( long arrowStreamMemoryAddress, @@ -238,8 +234,7 @@ private static native Dataset createWithFfiStreamAndProvider( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); /** * Open a dataset from the specified path. @@ -317,8 +312,7 @@ static Dataset open( options.getMetadataCacheSizeBytes(), options.getStorageOptions(), options.getSerializedManifest(), - options.getStorageOptionsProvider(), - options.getS3CredentialsRefreshOffsetSeconds()); + options.getStorageOptionsProvider()); dataset.allocator = allocator; dataset.selfManagedAllocator = selfManagedAllocator; return dataset; @@ -332,8 +326,7 @@ private static native Dataset openNative( long metadataCacheSizeBytes, Map storageOptions, Optional serializedManifest, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); /** * Creates a builder for opening a dataset. @@ -686,6 +679,42 @@ public long latestVersion() { private native long nativeGetLatestVersionId(); + /** + * Get the initial storage options used to open this dataset. + * + *

This returns the options that were provided when the dataset was opened, without any refresh + * from the provider. Returns null if no storage options were provided. + * + * @return the initial storage options, or null if none were provided + */ + public Map getInitialStorageOptions() { + try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { + Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); + return nativeGetInitialStorageOptions(); + } + } + + private native Map nativeGetInitialStorageOptions(); + + /** + * Get the latest storage options, potentially refreshed from the provider. + * + *

If a storage options provider was configured and credentials are expiring, this will refresh + * them. + * + * @return the latest storage options (static or refreshed from provider), or null if no storage + * options were configured for this dataset + * @throws RuntimeException if an error occurs while fetching/refreshing options from the provider + */ + public Map getLatestStorageOptions() { + try (LockManager.ReadLock readLock = lockManager.acquireReadLock()) { + Preconditions.checkArgument(nativeDatasetHandle != 0, "Dataset is closed"); + return nativeGetLatestStorageOptions(); + } + } + + private native Map nativeGetLatestStorageOptions(); + /** Checkout the dataset to the latest version. */ public void checkoutLatest() { try (LockManager.WriteLock writeLock = lockManager.acquireWriteLock()) { diff --git a/java/src/main/java/org/lance/Fragment.java b/java/src/main/java/org/lance/Fragment.java index 812fb49548c..1454fea5476 100644 --- a/java/src/main/java/org/lance/Fragment.java +++ b/java/src/main/java/org/lance/Fragment.java @@ -209,7 +209,6 @@ private native FragmentUpdateResult nativeUpdateColumns( * .allocator(allocator) * .data(vectorSchemaRoot) * .storageOptions(storageOptions) - * .s3CredentialsRefreshOffsetSeconds(10) * .execute(); * } * @@ -275,8 +274,7 @@ public static List create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds()); + Optional.ofNullable(storageOptionsProvider)); } } @@ -328,8 +326,7 @@ public static List create( params.getEnableStableRowIds(), params.getDataStorageVersion(), params.getStorageOptions(), - Optional.ofNullable(storageOptionsProvider), - params.getS3CredentialsRefreshOffsetSeconds()); + Optional.ofNullable(storageOptionsProvider)); } /** @@ -348,8 +345,7 @@ private static native List createWithFfiArray( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - 
Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); /** * Create a fragment from the given arrow stream. @@ -366,6 +362,5 @@ private static native List createWithFfiStream( Optional enableStableRowIds, Optional dataStorageVersion, Map storageOptions, - Optional storageOptionsProvider, - Optional s3CredentialsRefreshOffsetSeconds); + Optional storageOptionsProvider); } diff --git a/java/src/main/java/org/lance/OpenDatasetBuilder.java b/java/src/main/java/org/lance/OpenDatasetBuilder.java index ae082e14ceb..fc350fb0fcf 100644 --- a/java/src/main/java/org/lance/OpenDatasetBuilder.java +++ b/java/src/main/java/org/lance/OpenDatasetBuilder.java @@ -58,7 +58,6 @@ public class OpenDatasetBuilder { private LanceNamespace namespace; private List tableId; private ReadOptions options = new ReadOptions.Builder().build(); - private boolean ignoreNamespaceTableStorageOptions = false; /** Creates a new builder instance. Package-private, use Dataset.open() instead. */ OpenDatasetBuilder() {} @@ -128,19 +127,6 @@ public OpenDatasetBuilder readOptions(ReadOptions options) { return this; } - /** - * Sets whether to ignore storage options from the namespace's describeTable(). - * - * @param ignoreNamespaceTableStorageOptions If true, storage options returned from - * describeTable() will be ignored (treated as null) - * @return this builder instance - */ - public OpenDatasetBuilder ignoreNamespaceTableStorageOptions( - boolean ignoreNamespaceTableStorageOptions) { - this.ignoreNamespaceTableStorageOptions = ignoreNamespaceTableStorageOptions; - return this; - } - /** * Opens the dataset with the configured parameters. * @@ -204,8 +190,7 @@ private Dataset buildFromNamespace() { throw new IllegalArgumentException("Namespace did not return a table location"); } - Map namespaceStorageOptions = - ignoreNamespaceTableStorageOptions ? 
null : response.getStorageOptions(); + Map namespaceStorageOptions = response.getStorageOptions(); ReadOptions.Builder optionsBuilder = new ReadOptions.Builder() @@ -221,9 +206,6 @@ private Dataset buildFromNamespace() { options.getVersion().ifPresent(optionsBuilder::setVersion); options.getBlockSize().ifPresent(optionsBuilder::setBlockSize); options.getSerializedManifest().ifPresent(optionsBuilder::setSerializedManifest); - options - .getS3CredentialsRefreshOffsetSeconds() - .ifPresent(optionsBuilder::setS3CredentialsRefreshOffsetSeconds); Map storageOptions = new HashMap<>(options.getStorageOptions()); if (namespaceStorageOptions != null) { diff --git a/java/src/main/java/org/lance/ReadOptions.java b/java/src/main/java/org/lance/ReadOptions.java index 9d08c834008..0a7a0343a79 100644 --- a/java/src/main/java/org/lance/ReadOptions.java +++ b/java/src/main/java/org/lance/ReadOptions.java @@ -32,7 +32,6 @@ public class ReadOptions { private final Optional serializedManifest; private final Map storageOptions; private final Optional storageOptionsProvider; - private final Optional s3CredentialsRefreshOffsetSeconds; private ReadOptions(Builder builder) { this.version = builder.version; @@ -42,7 +41,6 @@ private ReadOptions(Builder builder) { this.storageOptions = builder.storageOptions; this.serializedManifest = builder.serializedManifest; this.storageOptionsProvider = builder.storageOptionsProvider; - this.s3CredentialsRefreshOffsetSeconds = builder.s3CredentialsRefreshOffsetSeconds; } public Optional getVersion() { @@ -73,10 +71,6 @@ public Optional getStorageOptionsProvider() { return storageOptionsProvider; } - public Optional getS3CredentialsRefreshOffsetSeconds() { - return s3CredentialsRefreshOffsetSeconds; - } - @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -100,7 +94,6 @@ public static class Builder { private Map storageOptions = new HashMap<>(); private Optional serializedManifest = Optional.empty(); private Optional 
storageOptionsProvider = Optional.empty(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); /** * Set the version of the dataset to read. If not set, read from latest version. @@ -221,22 +214,6 @@ public Builder setStorageOptionsProvider(StorageOptionsProvider storageOptionsPr return this; } - /** - * Set the number of seconds before credential expiration to trigger a refresh. - * - *

Default is 60 seconds. Only applicable when using AWS S3 with temporary credentials. For - * example, if set to 60, credentials will be refreshed when they have less than 60 seconds - * remaining before expiration. This should be set shorter than the credential lifetime to avoid - * using expired credentials. - * - * @param s3CredentialsRefreshOffsetSeconds the refresh offset in seconds - * @return this builder - */ - public Builder setS3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public ReadOptions build() { return new ReadOptions(this); } diff --git a/java/src/main/java/org/lance/Transaction.java b/java/src/main/java/org/lance/Transaction.java index 67bc5f8d93d..2d565c73258 100644 --- a/java/src/main/java/org/lance/Transaction.java +++ b/java/src/main/java/org/lance/Transaction.java @@ -118,7 +118,6 @@ public static class Builder { private Operation operation; private Map writeParams; private Map transactionProperties; - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); public Builder(Dataset dataset) { this.dataset = dataset; @@ -140,21 +139,6 @@ public Builder writeParams(Map writeParams) { return this; } - /** - * Sets the S3 credentials refresh offset in seconds. - * - *

This parameter controls how long before credential expiration to refresh them. For - * example, if credentials expire at T+60s and this is set to 10, credentials will be refreshed - * at T+50s. - * - * @param s3CredentialsRefreshOffsetSeconds Refresh offset in seconds - * @return this builder instance - */ - public Builder s3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public Builder operation(Operation operation) { validateState(); this.operation = operation; @@ -171,15 +155,8 @@ private void validateState() { public Transaction build() { Preconditions.checkState(operation != null, "TransactionBuilder has no operations"); - // Merge s3_credentials_refresh_offset_seconds into writeParams if present - Map finalWriteParams = - writeParams != null ? new HashMap<>(writeParams) : new HashMap<>(); - s3CredentialsRefreshOffsetSeconds.ifPresent( - value -> - finalWriteParams.put("s3_credentials_refresh_offset_seconds", String.valueOf(value))); - return new Transaction( - dataset, readVersion, uuid, operation, finalWriteParams, transactionProperties); + dataset, readVersion, uuid, operation, writeParams, transactionProperties); } } } diff --git a/java/src/main/java/org/lance/WriteDatasetBuilder.java b/java/src/main/java/org/lance/WriteDatasetBuilder.java index dc90b425291..01f6fcbb80b 100644 --- a/java/src/main/java/org/lance/WriteDatasetBuilder.java +++ b/java/src/main/java/org/lance/WriteDatasetBuilder.java @@ -80,7 +80,6 @@ public class WriteDatasetBuilder { private Optional maxBytesPerFile = Optional.empty(); private Optional enableStableRowIds = Optional.empty(); private Optional dataStorageVersion = Optional.empty(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); /** Creates a new builder instance. Package-private, use Dataset.write() instead. 
*/ WriteDatasetBuilder() { @@ -274,21 +273,6 @@ public WriteDatasetBuilder dataStorageVersion(WriteParams.LanceFileVersion dataS return this; } - /** - * Sets the S3 credentials refresh offset in seconds. - * - *

This parameter controls how long before credential expiration to refresh them. For example, - * if credentials expire at T+60s and this is set to 10, credentials will be refreshed at T+50s. - * - * @param s3CredentialsRefreshOffsetSeconds Refresh offset in seconds - * @return this builder instance - */ - public WriteDatasetBuilder s3CredentialsRefreshOffsetSeconds( - long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - /** * Executes the write operation and returns the created dataset. * @@ -412,8 +396,6 @@ private Dataset executeWithNamespace() { maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); - s3CredentialsRefreshOffsetSeconds.ifPresent( - paramsBuilder::withS3CredentialsRefreshOffsetSeconds); WriteParams params = paramsBuilder.build(); @@ -436,8 +418,6 @@ private Dataset executeWithUri() { maxBytesPerFile.ifPresent(paramsBuilder::withMaxBytesPerFile); enableStableRowIds.ifPresent(paramsBuilder::withEnableStableRowIds); dataStorageVersion.ifPresent(paramsBuilder::withDataStorageVersion); - s3CredentialsRefreshOffsetSeconds.ifPresent( - paramsBuilder::withS3CredentialsRefreshOffsetSeconds); WriteParams params = paramsBuilder.build(); diff --git a/java/src/main/java/org/lance/WriteFragmentBuilder.java b/java/src/main/java/org/lance/WriteFragmentBuilder.java index 76882b14a29..56ce06a7b0a 100644 --- a/java/src/main/java/org/lance/WriteFragmentBuilder.java +++ b/java/src/main/java/org/lance/WriteFragmentBuilder.java @@ -37,7 +37,6 @@ * .allocator(allocator) * .data(vectorSchemaRoot) * .storageOptions(storageOptions) - * .s3CredentialsRefreshOffsetSeconds(10) * .execute(); * } */ @@ -134,21 +133,6 @@ public WriteFragmentBuilder storageOptionsProvider(StorageOptionsProvider provid return this; } - /** - * Set the S3 
credentials refresh offset in seconds. - * - *

This parameter controls how long before credential expiration to refresh them. For example, - * if credentials expire at T+60s and this is set to 10, credentials will be refreshed at T+50s. - * - * @param seconds refresh offset in seconds - * @return this builder - */ - public WriteFragmentBuilder s3CredentialsRefreshOffsetSeconds(long seconds) { - ensureWriteParamsBuilder(); - this.writeParamsBuilder.withS3CredentialsRefreshOffsetSeconds(seconds); - return this; - } - /** * Set the maximum number of rows per file. * diff --git a/java/src/main/java/org/lance/WriteParams.java b/java/src/main/java/org/lance/WriteParams.java index a0ce1c8c375..1b5a2dceeb9 100644 --- a/java/src/main/java/org/lance/WriteParams.java +++ b/java/src/main/java/org/lance/WriteParams.java @@ -56,7 +56,6 @@ public String getVersionString() { private final Optional enableStableRowIds; private final Optional dataStorageVersion; private Map storageOptions = new HashMap<>(); - private final Optional s3CredentialsRefreshOffsetSeconds; private WriteParams( Optional maxRowsPerFile, @@ -65,8 +64,7 @@ private WriteParams( Optional mode, Optional enableStableRowIds, Optional dataStorageVersion, - Map storageOptions, - Optional s3CredentialsRefreshOffsetSeconds) { + Map storageOptions) { this.maxRowsPerFile = maxRowsPerFile; this.maxRowsPerGroup = maxRowsPerGroup; this.maxBytesPerFile = maxBytesPerFile; @@ -74,7 +72,6 @@ private WriteParams( this.enableStableRowIds = enableStableRowIds; this.dataStorageVersion = dataStorageVersion; this.storageOptions = storageOptions; - this.s3CredentialsRefreshOffsetSeconds = s3CredentialsRefreshOffsetSeconds; } public Optional getMaxRowsPerFile() { @@ -110,10 +107,6 @@ public Map getStorageOptions() { return storageOptions; } - public Optional getS3CredentialsRefreshOffsetSeconds() { - return s3CredentialsRefreshOffsetSeconds; - } - @Override public String toString() { return MoreObjects.toStringHelper(this) @@ -134,7 +127,6 @@ public static class Builder { private 
Optional enableStableRowIds = Optional.empty(); private Optional dataStorageVersion = Optional.empty(); private Map storageOptions = new HashMap<>(); - private Optional s3CredentialsRefreshOffsetSeconds = Optional.empty(); public Builder withMaxRowsPerFile(int maxRowsPerFile) { this.maxRowsPerFile = Optional.of(maxRowsPerFile); @@ -171,11 +163,6 @@ public Builder withEnableStableRowIds(boolean enableStableRowIds) { return this; } - public Builder withS3CredentialsRefreshOffsetSeconds(long s3CredentialsRefreshOffsetSeconds) { - this.s3CredentialsRefreshOffsetSeconds = Optional.of(s3CredentialsRefreshOffsetSeconds); - return this; - } - public WriteParams build() { return new WriteParams( maxRowsPerFile, @@ -184,8 +171,7 @@ public WriteParams build() { mode, enableStableRowIds, dataStorageVersion, - storageOptions, - s3CredentialsRefreshOffsetSeconds); + storageOptions); } } } diff --git a/java/src/test/java/org/lance/NamespaceIntegrationTest.java b/java/src/test/java/org/lance/NamespaceIntegrationTest.java index ad0b55dccdc..2d6f8ab1443 100644 --- a/java/src/test/java/org/lance/NamespaceIntegrationTest.java +++ b/java/src/test/java/org/lance/NamespaceIntegrationTest.java @@ -203,6 +203,8 @@ private Map modifyStorageOptions( long expiresAtMillis = System.currentTimeMillis() + (credentialExpiresInSeconds * 1000L); modified.put("expires_at_millis", String.valueOf(expiresAtMillis)); + // Set refresh offset to 1 second (1000ms) for short-lived credential tests + modified.put("refresh_offset_millis", "1000"); return modified; } @@ -326,11 +328,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); // Open dataset through namespace WITH refresh enabled - // Use 10-second refresh offset, so credentials effectively expire at T+50s - ReadOptions readOptions = - new ReadOptions.Builder() - .setS3CredentialsRefreshOffsetSeconds(10) // Refresh 10s before expiration - .build(); + ReadOptions 
readOptions = new ReadOptions.Builder().build(); int callCountBeforeOpen = namespace.getDescribeCallCount(); try (Dataset dsFromNamespace = @@ -451,7 +449,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.CREATE) - .s3CredentialsRefreshOffsetSeconds(2) // Refresh 2s before expiration .execute()) { assertEquals(2, dataset.countRows()); } @@ -461,11 +458,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { assertEquals(1, namespace.getCreateCallCount(), "createEmptyTable should be called once"); // Open dataset through namespace with refresh enabled - // Use 2-second refresh offset so credentials effectively expire at T+3s (5s - 2s) - ReadOptions readOptions = - new ReadOptions.Builder() - .setS3CredentialsRefreshOffsetSeconds(2) // Refresh 2s before expiration - .build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); int callCountBeforeOpen = namespace.getDescribeCallCount(); try (Dataset dsFromNamespace = @@ -692,7 +685,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { }; // Use the write builder to create a dataset through namespace - // Set a 1-second refresh offset. Credentials expire at T+60s, so refresh at T+59s. // Write completes instantly, so NO describeTable call should happen for refresh. 
try (Dataset dataset = Dataset.write() @@ -701,7 +693,6 @@ public VectorSchemaRoot getVectorSchemaRoot() { .namespace(namespace) .tableId(Arrays.asList(tableName)) .mode(WriteParams.WriteMode.CREATE) - .s3CredentialsRefreshOffsetSeconds(1) .execute()) { // Verify createEmptyTable was called exactly ONCE @@ -732,9 +723,7 @@ public VectorSchemaRoot getVectorSchemaRoot() { "describeTable should still be 0 after close (no refresh needed)"); // Now open the dataset through namespace with long-lived credentials (60s expiration) - // With 1s refresh offset, credentials are valid for 59s - plenty of time for reads - ReadOptions readOptions = - new ReadOptions.Builder().setS3CredentialsRefreshOffsetSeconds(1).build(); + ReadOptions readOptions = new ReadOptions.Builder().build(); try (Dataset dsFromNamespace = Dataset.open() diff --git a/python/python/lance/__init__.py b/python/python/lance/__init__.py index aa05c70286d..83587ffaf72 100644 --- a/python/python/lance/__init__.py +++ b/python/python/lance/__init__.py @@ -95,8 +95,6 @@ def dataset( session: Optional[Session] = None, namespace: Optional[LanceNamespace] = None, table_id: Optional[List[str]] = None, - ignore_namespace_table_storage_options: bool = False, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ) -> LanceDataset: """ Opens the Lance dataset from the address specified. @@ -164,26 +162,13 @@ def dataset( table_id : optional, List[str] The table identifier when using a namespace (e.g., ["my_table"]). Must be provided together with `namespace`. Cannot be used with `uri`. - ignore_namespace_table_storage_options : bool, default False - Only applicable when using `namespace` and `table_id`. If True, storage - options returned from the namespace's describe_table() will be ignored - (treated as None). If False (default), storage options from describe_table() - will be used and a dynamic storage options provider will be created to - automatically refresh credentials before they expire. 
- s3_credentials_refresh_offset_seconds : optional, int - The number of seconds before credential expiration to trigger a refresh. - Default is 60 seconds. Only applicable when using AWS S3 with temporary - credentials. For example, if set to 60, credentials will be refreshed - when they have less than 60 seconds remaining before expiration. This - should be set shorter than the credential lifetime to avoid using - expired credentials. Notes ----- When using `namespace` and `table_id`: - The `uri` parameter is optional and will be fetched from the namespace - - Storage options from describe_table() will be used unless - `ignore_namespace_table_storage_options=True` + - Storage options from describe_table() will be used automatically + - A dynamic storage options provider will be created to refresh credentials - Initial storage options from describe_table() will be merged with any provided `storage_options` """ @@ -216,10 +201,7 @@ def dataset( if uri is None: raise ValueError("Namespace did not return a 'location' for the table") - if ignore_namespace_table_storage_options: - namespace_storage_options = None - else: - namespace_storage_options = response.storage_options + namespace_storage_options = response.storage_options if namespace_storage_options: storage_options_provider = LanceNamespaceStorageOptionsProvider( @@ -247,7 +229,6 @@ def dataset( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) if version is None and asof is not None: ts_cutoff = sanitize_ts(asof) @@ -272,7 +253,6 @@ def dataset( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) else: return ds diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 0707c314574..2b13b9a8beb 100644 --- a/python/python/lance/dataset.py +++ 
b/python/python/lance/dataset.py @@ -422,7 +422,6 @@ def __init__( read_params: Optional[Dict[str, Any]] = None, session: Optional[Session] = None, storage_options_provider: Optional[Any] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): uri = os.fspath(uri) if isinstance(uri, Path) else uri self._uri = uri @@ -454,7 +453,6 @@ def __init__( read_params=read_params, session=session, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) self._default_scan_options = default_scan_options self._read_params = read_params @@ -2224,7 +2222,52 @@ def latest_version(self) -> int: """ return self._ds.latest_version() - def checkout_version(self, version: int | str | Tuple[str, int]) -> "LanceDataset": + @property + def initial_storage_options(self) -> Optional[Dict[str, str]]: + """ + Get the initial storage options used to open this dataset. + + This returns the options that were provided when the dataset was opened, + without any refresh from the provider. Returns None if no storage options + were provided. + """ + return self._ds.initial_storage_options() + + def latest_storage_options(self) -> Optional[Dict[str, str]]: + """ + Get the latest storage options, potentially refreshed from the provider. + + If a storage options provider was configured and credentials are expiring, + this will refresh them. + + Returns + ------- + Optional[Dict[str, str]] + - Storage options dict if configured (static or refreshed from provider) + - None if no storage options were configured for this dataset + + Raises + ------ + IOError + If an error occurs while fetching/refreshing options from the provider + """ + return self._ds.latest_storage_options() + + @property + def storage_options_accessor(self): + """ + Get the storage options accessor for this dataset. + + The accessor bundles static storage options and optional dynamic provider, + handling caching and refresh logic internally. 
+ + Returns None if neither storage options nor a provider were configured. + """ + return self._ds.storage_options_accessor() + + def checkout_version( + self, version: int | str | Tuple[Optional[str], Optional[int]] + ) -> "LanceDataset": """ Load the given version of the dataset. @@ -5352,8 +5395,6 @@ def write_dataset( target_bases: Optional[List[str]] = None, namespace: Optional[LanceNamespace] = None, table_id: Optional[List[str]] = None, - ignore_namespace_table_storage_options: bool = False, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ) -> LanceDataset: """Write a given data_obj to the given uri @@ -5455,29 +5496,16 @@ def write_dataset( table_id : optional, List[str] The table identifier when using a namespace (e.g., ["my_table"]). Must be provided together with `namespace`. Cannot be used with `uri`. - ignore_namespace_table_storage_options : bool, default False - If True, ignore the storage options returned by the namespace and only use - the provided `storage_options` parameter. The storage options provider will - not be created, so credentials will not be automatically refreshed. - This is useful when you want to use your own credentials instead of the - namespace-provided credentials. - s3_credentials_refresh_offset_seconds : optional, int - The number of seconds before credential expiration to trigger a refresh. - Default is 60 seconds. Only applicable when using AWS S3 with temporary - credentials. For example, if set to 60, credentials will be refreshed - when they have less than 60 seconds remaining before expiration. This - should be set shorter than the credential lifetime to avoid using - expired credentials. 
Notes ----- When using `namespace` and `table_id`: - The `uri` parameter is optional and will be fetched from the namespace + - Storage options from describe_table() will be used automatically - A `LanceNamespaceStorageOptionsProvider` will be created automatically for - storage options refresh (unless `ignore_namespace_table_storage_options=True`) + storage options refresh - Initial storage options from describe_table() will be merged with - any provided `storage_options` (unless - `ignore_namespace_table_storage_options=True`) + any provided `storage_options` """ # Validate that user provides either uri OR (namespace + table_id), not both has_uri = uri is not None @@ -5563,11 +5591,8 @@ def write_dataset( f"Namespace did not return a table location in {mode} response" ) - # Check if we should ignore namespace storage options - if ignore_namespace_table_storage_options: - namespace_storage_options = None - else: - namespace_storage_options = response.storage_options + # Use namespace storage options + namespace_storage_options = response.storage_options # Set up storage options and provider if namespace_storage_options: @@ -5630,12 +5655,6 @@ def write_dataset( if storage_options_provider is not None: params["storage_options_provider"] = storage_options_provider - # Add s3_credentials_refresh_offset_seconds if specified - if s3_credentials_refresh_offset_seconds is not None: - params["s3_credentials_refresh_offset_seconds"] = ( - s3_credentials_refresh_offset_seconds - ) - if commit_lock: if not callable(commit_lock): raise TypeError(f"commit_lock must be a function, got {type(commit_lock)}") diff --git a/python/python/lance/file.py b/python/python/lance/file.py index dec4aea00b6..8a20e4aff2f 100644 --- a/python/python/lance/file.py +++ b/python/python/lance/file.py @@ -68,7 +68,6 @@ def __init__( columns: Optional[List[str]] = None, *, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, 
_inner_reader: Optional[_LanceFileReader] = None, ): """ @@ -86,9 +85,6 @@ def __init__( storage_options_provider : optional A provider that can provide storage options dynamically. This is useful for credentials that need to be refreshed or vended on-demand. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. columns: list of str, default None List of column names to be fetched. All columns are fetched if None or unspecified. @@ -102,7 +98,6 @@ def __init__( path, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, columns=columns, ) @@ -219,7 +214,6 @@ def __init__( base_path: str, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): """ Creates a new file session @@ -236,9 +230,6 @@ def __init__( storage_options_provider : optional A provider that can provide storage options dynamically. This is useful for credentials that need to be refreshed or vended on-demand. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. 
""" if isinstance(base_path, Path): base_path = str(base_path) @@ -246,7 +237,6 @@ def __init__( base_path, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, ) def open_reader( @@ -391,7 +381,6 @@ def __init__( version: Optional[str] = None, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, max_page_bytes: Optional[int] = None, _inner_writer: Optional[_LanceFileWriter] = None, **kwargs, @@ -422,9 +411,6 @@ def __init__( A storage options provider that can fetch and refresh storage options dynamically. This is useful for credentials that expire and need to be refreshed automatically. - s3_credentials_refresh_offset_seconds : optional, int - How early (in seconds) before expiration to refresh S3 credentials. - Default is 60 seconds. Only applies when using storage_options_provider. max_page_bytes : optional, int The maximum size of a page in bytes, if a single array would create a page larger than this then it will be split into multiple pages. The @@ -442,7 +428,6 @@ def __init__( version=version, storage_options=storage_options, storage_options_provider=storage_options_provider, - s3_credentials_refresh_offset_seconds=s3_credentials_refresh_offset_seconds, max_page_bytes=max_page_bytes, **kwargs, ) diff --git a/python/python/lance/lance/__init__.pyi b/python/python/lance/lance/__init__.pyi index f0cf1243d61..9ecc271754f 100644 --- a/python/python/lance/lance/__init__.pyi +++ b/python/python/lance/lance/__init__.pyi @@ -95,7 +95,6 @@ class LanceFileWriter: version: Optional[str], storage_options: Optional[Dict[str, str]], storage_options_provider: Optional[StorageOptionsProvider], - s3_credentials_refresh_offset_seconds: Optional[int], keep_original_array: Optional[bool], max_page_bytes: Optional[int], ): ... 
@@ -110,7 +109,6 @@ class LanceFileSession: base_path: str, storage_options: Optional[Dict[str, str]] = None, storage_options_provider: Optional[StorageOptionsProvider] = None, - s3_credentials_refresh_offset_seconds: Optional[int] = None, ): ... def open_reader( self, path: str, columns: Optional[List[str]] = None @@ -135,7 +133,6 @@ class LanceFileReader: path: str, storage_options: Optional[Dict[str, str]], storage_options_provider: Optional[StorageOptionsProvider], - s3_credentials_refresh_offset_seconds: Optional[int], columns: Optional[List[str]] = None, ): ... def read_all( diff --git a/python/python/tests/test_namespace_integration.py b/python/python/tests/test_namespace_integration.py index 3c93dbcb504..30489496e38 100644 --- a/python/python/tests/test_namespace_integration.py +++ b/python/python/tests/test_namespace_integration.py @@ -128,6 +128,8 @@ def _modify_storage_options( (time.time() + self.credential_expires_in_seconds) * 1000 ) modified["expires_at_millis"] = str(expires_at_millis) + # Set refresh offset to 1 second (1000ms) for short-lived credential tests + modified["refresh_offset_millis"] = "1000" return modified @@ -235,7 +237,6 @@ def test_namespace_with_refresh(s3_bucket: str): namespace=namespace, table_id=table_id, mode="create", - s3_credentials_refresh_offset_seconds=1, ) assert ds.count_rows() == 2 assert namespace.get_create_call_count() == 1 @@ -243,7 +244,6 @@ def test_namespace_with_refresh(s3_bucket: str): ds_from_namespace = lance.dataset( namespace=namespace, table_id=table_id, - s3_credentials_refresh_offset_seconds=1, ) initial_call_count = namespace.get_describe_call_count() @@ -574,7 +574,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): schema=schema, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) batch = pa.RecordBatch.from_pydict({"x": [1, 2, 3], "y": [4, 5, 6]}, schema=schema) @@ -593,7 +592,6 @@ def 
test_file_writer_with_storage_options_provider(s3_bucket: str): file_uri, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result = reader.read_all(batch_size=1024) result_table = result.to_table() @@ -613,7 +611,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): schema=schema, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) batch3 = pa.RecordBatch.from_pydict( @@ -629,7 +626,6 @@ def test_file_writer_with_storage_options_provider(s3_bucket: str): file_uri2, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result2 = reader2.read_all(batch_size=1024) result_table2 = result2.to_table() @@ -696,7 +692,6 @@ def test_file_reader_with_storage_options_provider(s3_bucket: str): file_uri, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result = reader.read_all(batch_size=1024) result_table = result.to_table() @@ -727,7 +722,6 @@ def test_file_reader_with_storage_options_provider(s3_bucket: str): file_uri2, storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) result2 = reader2.read_all(batch_size=1024) result_table2 = result2.to_table() @@ -778,7 +772,6 @@ def test_file_session_with_storage_options_provider(s3_bucket: str): f"s3://{s3_bucket}/{table_name}_session", storage_options=namespace_storage_options, storage_options_provider=provider, - s3_credentials_refresh_offset_seconds=1, ) # Test contains method diff --git a/python/src/dataset.rs b/python/src/dataset.rs index ade2b4516ca..37334ad9352 100644 --- a/python/src/dataset.rs +++ b/python/src/dataset.rs @@ -91,6 +91,7 @@ use crate::rt; use crate::scanner::ScanStatistics; use crate::schema::{logical_schema_from_lance, LanceSchema}; use 
crate::session::Session; +use crate::storage_options::PyStorageOptionsAccessor; use crate::utils::PyLance; use crate::{LanceReader, Scanner}; @@ -456,8 +457,9 @@ pub struct Dataset { #[pymethods] impl Dataset { #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[new] - #[pyo3(signature=(uri, version=None, block_size=None, index_cache_size=None, metadata_cache_size=None, commit_handler=None, storage_options=None, manifest=None, metadata_cache_size_bytes=None, index_cache_size_bytes=None, read_params=None, session=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None))] + #[pyo3(signature=(uri, version=None, block_size=None, index_cache_size=None, metadata_cache_size=None, commit_handler=None, storage_options=None, manifest=None, metadata_cache_size_bytes=None, index_cache_size_bytes=None, read_params=None, session=None, storage_options_provider=None))] fn new( py: Python, uri: String, @@ -472,8 +474,7 @@ impl Dataset { index_cache_size_bytes: Option, read_params: Option<&Bound>, session: Option, - storage_options_provider: Option, - s3_credentials_refresh_offset_seconds: Option, + storage_options_provider: Option<&Bound<'_, PyAny>>, ) -> PyResult { let mut params = ReadParams::default(); if let Some(metadata_cache_size_bytes) = metadata_cache_size_bytes { @@ -490,16 +491,12 @@ impl Dataset { let index_cache_size_bytes = index_cache_size * 20 * 1024 * 1024; params.index_cache_size_bytes(index_cache_size_bytes); } - // Set up store options (block size and S3 credentials refresh offset) - let mut store_params = params.store_options.take().unwrap_or_default(); + // Set up store options (block size) if let Some(block_size) = block_size { + let mut store_params = params.store_options.take().unwrap_or_default(); store_params.block_size = Some(block_size); + params.store_options = Some(store_params); } - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - store_params.s3_credentials_refresh_offset = - 
std::time::Duration::from_secs(offset_seconds); - } - params.store_options = Some(store_params); if let Some(commit_handler) = commit_handler { let py_commit_lock = PyCommitLock::new(commit_handler); params.set_commit_lock(Arc::new(py_commit_lock)); @@ -1447,8 +1444,40 @@ impl Dataset { .map_err(|err| PyIOError::new_err(err.to_string())) } - fn checkout_version(&self, py: Python, version: PyObject) -> PyResult { - let reference = self.transform_ref(py, Some(version))?; + /// Get the initial storage options used to open this dataset. + /// + /// This returns the options that were provided when the dataset was opened, + /// without any refresh from the provider. Returns None if no storage options + /// were provided. + fn initial_storage_options(&self) -> Option> { + self.ds.initial_storage_options().cloned() + } + + /// Get the latest storage options, potentially refreshed from the provider. + /// + /// If a storage options provider was configured and credentials are expiring, + /// this will refresh them. Returns the current valid storage options, or None + /// if no storage options accessor is configured. + fn latest_storage_options(self_: PyRef<'_, Self>) -> PyResult>> { + let result = rt() + .block_on(Some(self_.py()), self_.ds.latest_storage_options())? + .map_err(|err| PyIOError::new_err(err.to_string()))?; + Ok(result.map(|opts| opts.0)) + } + + /// Get the storage options accessor for this dataset. + /// + /// The accessor bundles static storage options and optional dynamic provider, + /// handling caching and refresh logic internally. + fn storage_options_accessor(&self) -> Option { + self.ds + .storage_options_accessor() + .map(PyStorageOptionsAccessor::new) + } + + fn checkout_version(&self, version: Bound<'_, PyAny>) -> PyResult { + let reference = + Python::with_gil(|py| self.transform_ref(py, Some(version.clone().unbind())))?; self._checkout_version(reference) } @@ -1465,7 +1494,9 @@ impl Dataset { // `version` can be a version number or a tag name. 
// `storage_options` will be forwarded to the object store params for the new dataset. let store_params = storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts.clone()), + )), ..Default::default() }); @@ -1647,7 +1678,9 @@ impl Dataset { // Build Ref from python object let reference = self.transform_ref(py, reference)?; let store_params = storage_options.map(|opts| ObjectStoreParams { - storage_options: Some(opts), + storage_options_accessor: Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts), + )), ..Default::default() }); let created = rt() @@ -2099,7 +2132,7 @@ impl Dataset { read_version: Option, commit_lock: Option<&Bound<'_, PyAny>>, storage_options: Option>, - storage_options_provider: Option, + storage_options_provider: Option<&Bound<'_, PyAny>>, enable_v2_manifest_paths: Option, detached: Option, max_retries: Option, @@ -2127,6 +2160,7 @@ impl Dataset { } #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[staticmethod] #[pyo3(signature = (dest, transaction, commit_lock = None, storage_options = None, storage_options_provider = None, enable_v2_manifest_paths = None, detached = None, max_retries = None))] fn commit_transaction( @@ -2134,25 +2168,19 @@ impl Dataset { transaction: PyLance, commit_lock: Option<&Bound<'_, PyAny>>, storage_options: Option>, - storage_options_provider: Option, + storage_options_provider: Option<&Bound<'_, PyAny>>, enable_v2_manifest_paths: Option, detached: Option, max_retries: Option, ) -> PyResult { - let provider = storage_options_provider.and_then(|py_obj| { - crate::storage_options::PyStorageOptionsProvider::new(py_obj) - .ok() - .map(|py_provider| { - Arc::new( - crate::storage_options::PyStorageOptionsProviderWrapper::new(py_provider), - ) as Arc - }) - }); + let accessor = crate::storage_options::create_accessor_from_python( + 
storage_options.clone(), + storage_options_provider, + )?; - let object_store_params = if storage_options.is_some() || provider.is_some() { + let object_store_params = if accessor.is_some() { Some(ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider: provider, + storage_options_accessor: accessor, ..Default::default() }) } else { @@ -2196,6 +2224,7 @@ impl Dataset { } #[allow(clippy::too_many_arguments)] + #[allow(deprecated)] #[staticmethod] #[pyo3(signature = (dest, transactions, commit_lock = None, storage_options = None, storage_options_provider = None, enable_v2_manifest_paths = None, detached = None, max_retries = None))] fn commit_batch( @@ -2203,25 +2232,19 @@ impl Dataset { transactions: Vec>, commit_lock: Option<&Bound<'_, PyAny>>, storage_options: Option>, - storage_options_provider: Option, + storage_options_provider: Option<&Bound<'_, PyAny>>, enable_v2_manifest_paths: Option, detached: Option, max_retries: Option, ) -> PyResult<(Self, PyLance)> { - let provider = storage_options_provider.and_then(|py_obj| { - crate::storage_options::PyStorageOptionsProvider::new(py_obj) - .ok() - .map(|py_provider| { - Arc::new( - crate::storage_options::PyStorageOptionsProviderWrapper::new(py_provider), - ) as Arc - }) - }); + let accessor = crate::storage_options::create_accessor_from_python( + storage_options.clone(), + storage_options_provider, + )?; - let object_store_params = if storage_options.is_some() || provider.is_some() { + let object_store_params = if accessor.is_some() { Some(ObjectStoreParams { - storage_options: storage_options.clone(), - storage_options_provider: provider, + storage_options_accessor: accessor, ..Default::default() }) } else { @@ -3006,6 +3029,7 @@ fn get_dict_opt<'a, 'py, D: FromPyObject<'a>>( .transpose() } +#[allow(deprecated)] pub fn get_write_params(options: &Bound<'_, PyDict>) -> PyResult> { let params = if options.is_none() { None @@ -3033,34 +3057,17 @@ pub fn get_write_params(options: 
&Bound<'_, PyDict>) -> PyResult>(options, "storage_options")?; let storage_options_provider = - get_dict_opt::(options, "storage_options_provider")?.and_then(|py_obj| { - crate::storage_options::PyStorageOptionsProvider::new(py_obj) - .ok() - .map(|py_provider| { - Arc::new( - crate::storage_options::PyStorageOptionsProviderWrapper::new( - py_provider, - ), - ) - as Arc - }) - }); - - let s3_credentials_refresh_offset_seconds = - get_dict_opt::(options, "s3_credentials_refresh_offset_seconds")?; - - if storage_options.is_some() - || storage_options_provider.is_some() - || s3_credentials_refresh_offset_seconds.is_some() - { - let s3_credentials_refresh_offset = s3_credentials_refresh_offset_seconds - .map(std::time::Duration::from_secs) - .unwrap_or(std::time::Duration::from_secs(60)); + get_dict_opt::>(options, "storage_options_provider")?; - p.store_params = Some(ObjectStoreParams { + if storage_options.is_some() || storage_options_provider.is_some() { + let accessor = crate::storage_options::create_accessor_from_python( storage_options, - storage_options_provider, - s3_credentials_refresh_offset, + storage_options_provider + .as_ref() + .map(|py_obj| py_obj.bind(options.py())), + )?; + p.store_params = Some(ObjectStoreParams { + storage_options_accessor: accessor, ..Default::default() }); } diff --git a/python/src/file.rs b/python/src/file.rs index 11971e5d5d7..213f3e2f71c 100644 --- a/python/src/file.rs +++ b/python/src/file.rs @@ -37,7 +37,8 @@ use lance_io::{ use object_store::path::Path; use pyo3::{ exceptions::{PyIOError, PyRuntimeError}, - pyclass, pyfunction, pymethods, IntoPyObjectExt, PyErr, PyObject, PyResult, Python, + pyclass, pyfunction, pymethods, Bound, IntoPyObjectExt, PyAny, PyErr, PyObject, PyResult, + Python, }; use serde::Serialize; use std::collections::HashMap; @@ -239,7 +240,6 @@ impl LanceFileWriter { version: Option, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, 
keep_original_array: Option, max_page_bytes: Option, ) -> PyResult { @@ -247,7 +247,6 @@ impl LanceFileWriter { uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Self::open_with_store( @@ -297,7 +296,7 @@ impl LanceFileWriter { #[pymethods] impl LanceFileWriter { #[new] - #[pyo3(signature=(path, schema=None, data_cache_bytes=None, version=None, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None, keep_original_array=None, max_page_bytes=None))] + #[pyo3(signature=(path, schema=None, data_cache_bytes=None, version=None, storage_options=None, storage_options_provider=None, keep_original_array=None, max_page_bytes=None))] #[allow(clippy::too_many_arguments)] pub fn new( path: String, @@ -305,8 +304,7 @@ impl LanceFileWriter { data_cache_bytes: Option, version: Option, storage_options: Option>, - storage_options_provider: Option, - s3_credentials_refresh_offset_seconds: Option, + storage_options_provider: Option<&Bound<'_, PyAny>>, keep_original_array: Option, max_page_bytes: Option, ) -> PyResult { @@ -324,7 +322,6 @@ impl LanceFileWriter { version, storage_options, provider, - s3_credentials_refresh_offset_seconds, keep_original_array, max_page_bytes, ), @@ -381,25 +378,33 @@ pub async fn object_store_from_uri_or_path( uri_or_path: impl AsRef, storage_options: Option>, ) -> PyResult<(Arc, Path)> { - object_store_from_uri_or_path_with_provider(uri_or_path, storage_options, None, None).await + object_store_from_uri_or_path_with_provider(uri_or_path, storage_options, None).await } pub async fn object_store_from_uri_or_path_with_provider( uri_or_path: impl AsRef, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, ) -> PyResult<(Arc, Path)> { let object_store_registry = Arc::new(lance::io::ObjectStoreRegistry::default()); - let mut object_store_params = ObjectStoreParams { - storage_options: 
storage_options.clone(), - storage_options_provider, + + let accessor = match (storage_options, storage_options_provider) { + (Some(opts), Some(provider)) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_initial_and_provider(opts, provider), + )), + (None, Some(provider)) => Some(Arc::new(lance::io::StorageOptionsAccessor::with_provider( + provider, + ))), + (Some(opts), None) => Some(Arc::new( + lance::io::StorageOptionsAccessor::with_static_options(opts), + )), + (None, None) => None, + }; + + let object_store_params = ObjectStoreParams { + storage_options_accessor: accessor, ..Default::default() }; - if let Some(offset_seconds) = s3_credentials_refresh_offset_seconds { - object_store_params.s3_credentials_refresh_offset = - std::time::Duration::from_secs(offset_seconds); - } let (object_store, path) = ObjectStore::from_uri_and_params( object_store_registry, @@ -423,13 +428,11 @@ impl LanceFileSession { uri_or_path: String, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, ) -> PyResult { let (object_store, base_path) = object_store_from_uri_or_path_with_provider( uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Ok(Self { @@ -442,25 +445,16 @@ impl LanceFileSession { #[pymethods] impl LanceFileSession { #[new] - #[pyo3(signature=(uri_or_path, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None))] + #[pyo3(signature=(uri_or_path, storage_options=None, storage_options_provider=None))] pub fn new( uri_or_path: String, storage_options: Option>, - storage_options_provider: Option, - s3_credentials_refresh_offset_seconds: Option, + storage_options_provider: Option<&Bound<'_, PyAny>>, ) -> PyResult { let provider = storage_options_provider .map(crate::storage_options::py_object_to_storage_options_provider) .transpose()?; - rt().block_on( - None, - Self::try_new( - uri_or_path, - storage_options, - 
provider, - s3_credentials_refresh_offset_seconds, - ), - )? + rt().block_on(None, Self::try_new(uri_or_path, storage_options, provider))? } #[pyo3(signature=(path, columns=None))] @@ -642,14 +636,12 @@ impl LanceFileReader { uri_or_path: String, storage_options: Option>, storage_options_provider: Option>, - s3_credentials_refresh_offset_seconds: Option, columns: Option>, ) -> PyResult { let (object_store, path) = object_store_from_uri_or_path_with_provider( uri_or_path, storage_options, storage_options_provider, - s3_credentials_refresh_offset_seconds, ) .await?; Self::open_with_store(object_store, path, columns).await @@ -747,27 +739,17 @@ impl LanceFileReader { #[pymethods] impl LanceFileReader { #[new] - #[pyo3(signature=(path, storage_options=None, storage_options_provider=None, s3_credentials_refresh_offset_seconds=None, columns=None))] + #[pyo3(signature=(path, storage_options=None, storage_options_provider=None, columns=None))] pub fn new( path: String, storage_options: Option>, - storage_options_provider: Option, - s3_credentials_refresh_offset_seconds: Option, + storage_options_provider: Option<&Bound<'_, PyAny>>, columns: Option>, ) -> PyResult { let provider = storage_options_provider .map(crate::storage_options::py_object_to_storage_options_provider) .transpose()?; - rt().block_on( - None, - Self::open( - path, - storage_options, - provider, - s3_credentials_refresh_offset_seconds, - columns, - ), - )? + rt().block_on(None, Self::open(path, storage_options, provider, columns))? 
} pub fn read_all( diff --git a/python/src/lib.rs b/python/src/lib.rs index faf62eb546c..1512a8deef8 100644 --- a/python/src/lib.rs +++ b/python/src/lib.rs @@ -273,6 +273,7 @@ fn lance(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { m.add_class::()?; #[cfg(feature = "rest-adapter")] m.add_class::()?; + m.add_class::()?; m.add_wrapped(wrap_pyfunction!(bfloat16_array))?; m.add_wrapped(wrap_pyfunction!(write_dataset))?; m.add_wrapped(wrap_pyfunction!(write_fragments))?; diff --git a/python/src/storage_options.rs b/python/src/storage_options.rs index 3defd74f267..ba7ec4f4ec4 100644 --- a/python/src/storage_options.rs +++ b/python/src/storage_options.rs @@ -5,7 +5,7 @@ use std::collections::HashMap; use std::sync::Arc; use async_trait::async_trait; -use lance_io::object_store::StorageOptionsProvider; +use lance_io::object_store::{StorageOptionsAccessor, StorageOptionsProvider}; use pyo3::prelude::*; use pyo3::types::PyDict; @@ -162,8 +162,132 @@ impl StorageOptionsProvider for PyStorageOptionsProviderWrapper { /// Convert a Python object to an Arc /// This is the main entry point for converting Python storage options providers to Rust pub fn py_object_to_storage_options_provider( - py_obj: PyObject, + py_obj: &Bound<'_, PyAny>, ) -> PyResult> { - let py_provider = PyStorageOptionsProvider::new(py_obj)?; + let py_provider = PyStorageOptionsProvider::new(py_obj.clone().unbind())?; Ok(Arc::new(PyStorageOptionsProviderWrapper::new(py_provider))) } + +/// Python wrapper for StorageOptionsAccessor +/// +/// This wraps a Rust StorageOptionsAccessor and exposes it to Python. 
+#[pyclass(name = "StorageOptionsAccessor")] +#[derive(Clone)] +pub struct PyStorageOptionsAccessor { + inner: Arc, +} + +impl PyStorageOptionsAccessor { + pub fn new(accessor: Arc) -> Self { + Self { inner: accessor } + } + + pub fn inner(&self) -> Arc { + self.inner.clone() + } +} + +#[pymethods] +impl PyStorageOptionsAccessor { + /// Create an accessor with only static options (no refresh capability) + #[staticmethod] + fn with_static_options(options: HashMap) -> Self { + Self { + inner: Arc::new(StorageOptionsAccessor::with_static_options(options)), + } + } + + /// Create an accessor with a dynamic provider (no initial options) + /// + /// The refresh offset is extracted from storage options using the `refresh_offset_millis` key. + #[staticmethod] + fn with_provider(provider: &Bound<'_, PyAny>) -> PyResult { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Self { + inner: Arc::new(StorageOptionsAccessor::with_provider(rust_provider)), + }) + } + + /// Create an accessor with initial options and a dynamic provider + /// + /// The refresh offset is extracted from initial_options using the `refresh_offset_millis` key. + #[staticmethod] + fn with_initial_and_provider( + initial_options: HashMap, + provider: &Bound<'_, PyAny>, + ) -> PyResult { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Self { + inner: Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, + rust_provider, + )), + }) + } + + /// Get current valid storage options + fn get_storage_options(&self, py: Python<'_>) -> PyResult> { + let accessor = self.inner.clone(); + let options = rt() + .block_on(Some(py), accessor.get_storage_options())? 
+ .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(e.to_string()))?; + Ok(options.0) + } + + /// Get the initial storage options without refresh + fn initial_storage_options(&self) -> Option> { + self.inner.initial_storage_options().cloned() + } + + /// Get the accessor ID for equality/hashing + fn accessor_id(&self) -> String { + self.inner.accessor_id() + } + + /// Check if this accessor has a dynamic provider + fn has_provider(&self) -> bool { + self.inner.has_provider() + } + + /// Get the refresh offset in seconds + fn refresh_offset_secs(&self) -> u64 { + self.inner.refresh_offset().as_secs() + } + + fn __repr__(&self) -> String { + format!( + "StorageOptionsAccessor(id={}, has_provider={})", + self.inner.accessor_id(), + self.inner.has_provider() + ) + } +} + +/// Create a StorageOptionsAccessor from Python parameters +/// +/// This handles the conversion from Python types to Rust StorageOptionsAccessor. +/// The refresh offset is extracted from storage_options using the `refresh_offset_millis` key. 
+#[allow(dead_code)] +pub fn create_accessor_from_python( + storage_options: Option>, + storage_options_provider: Option<&Bound<'_, PyAny>>, +) -> PyResult>> { + match (storage_options, storage_options_provider) { + (Some(opts), Some(provider)) => { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(opts, rust_provider), + ))) + } + (None, Some(provider)) => { + let rust_provider = py_object_to_storage_options_provider(provider)?; + Ok(Some(Arc::new(StorageOptionsAccessor::with_provider( + rust_provider, + )))) + } + (Some(opts), None) => Ok(Some(Arc::new(StorageOptionsAccessor::with_static_options( + opts, + )))), + (None, None) => Ok(None), + } +} diff --git a/rust/lance-io/src/object_store.rs b/rust/lance-io/src/object_store.rs index 4375a950d09..b941a57b4fd 100644 --- a/rust/lance-io/src/object_store.rs +++ b/rust/lance-io/src/object_store.rs @@ -64,7 +64,8 @@ pub const DEFAULT_DOWNLOAD_RETRY_COUNT: usize = 3; pub use providers::{ObjectStoreProvider, ObjectStoreRegistry}; pub use storage_options::{ - LanceNamespaceStorageOptionsProvider, StorageOptionsProvider, EXPIRES_AT_MILLIS_KEY, + LanceNamespaceStorageOptionsProvider, StorageOptionsAccessor, StorageOptionsProvider, + EXPIRES_AT_MILLIS_KEY, REFRESH_OFFSET_MILLIS_KEY, }; #[async_trait] @@ -127,6 +128,10 @@ pub struct ObjectStore { download_retry_count: usize, /// IO tracker for monitoring read/write operations io_tracker: IOTracker, + /// The datastore prefix that uniquely identifies this object store. It encodes information + /// which usually cannot be found in the URL such as Azure account name. The prefix plus the + /// path uniquely identifies any object inside the store. 
+ pub store_prefix: String, } impl DeepSizeOf for ObjectStore { @@ -183,13 +188,18 @@ pub struct ObjectStoreParams { pub block_size: Option, #[deprecated(note = "Implement an ObjectStoreProvider instead")] pub object_store: Option<(Arc, Url)>, + /// Refresh offset for AWS credentials when using the legacy AWS credentials path. + /// For StorageOptionsAccessor, use `refresh_offset_millis` storage option instead. pub s3_credentials_refresh_offset: Duration, #[cfg(feature = "aws")] pub aws_credentials: Option, pub object_store_wrapper: Option>, - pub storage_options: Option>, - /// Dynamic storage options provider for automatic credential refresh - pub storage_options_provider: Option>, + /// Unified storage options accessor with caching and automatic refresh + /// + /// Provides storage options and optionally a dynamic provider for automatic + /// credential refresh. Use `StorageOptionsAccessor::with_static_options()` for static + /// options or `StorageOptionsAccessor::with_initial_and_provider()` for dynamic refresh. + pub storage_options_accessor: Option>, /// Use constant size upload parts for multipart uploads. Only necessary /// for Cloudflare R2, which doesn't support variable size parts. When this /// is false, max upload size is 2.5TB. When this is true, the max size is @@ -208,19 +218,34 @@ impl Default for ObjectStoreParams { #[cfg(feature = "aws")] aws_credentials: None, object_store_wrapper: None, - storage_options: None, - storage_options_provider: None, + storage_options_accessor: None, use_constant_size_upload_parts: false, list_is_lexically_ordered: None, } } } +impl ObjectStoreParams { + /// Get the StorageOptionsAccessor from the params + pub fn get_accessor(&self) -> Option> { + self.storage_options_accessor.clone() + } + + /// Get storage options from the accessor, if any + /// + /// Returns the initial storage options from the accessor without triggering refresh. 
+ pub fn storage_options(&self) -> Option<&HashMap> { + self.storage_options_accessor + .as_ref() + .and_then(|a| a.initial_storage_options()) + } +} + // We implement hash for caching impl std::hash::Hash for ObjectStoreParams { #[allow(deprecated)] fn hash(&self, state: &mut H) { - // For hashing, we use pointer values for ObjectStore, S3 credentials, wrapper, and storage options provider + // For hashing, we use pointer values for ObjectStore, S3 credentials, wrapper self.block_size.hash(state); if let Some((store, url)) = &self.object_store { Arc::as_ptr(store).hash(state); @@ -234,14 +259,8 @@ impl std::hash::Hash for ObjectStoreParams { if let Some(wrapper) = &self.object_store_wrapper { Arc::as_ptr(wrapper).hash(state); } - if let Some(storage_options) = &self.storage_options { - for (key, value) in storage_options { - key.hash(state); - value.hash(state); - } - } - if let Some(provider) = &self.storage_options_provider { - provider.provider_id().hash(state); + if let Some(accessor) = &self.storage_options_accessor { + accessor.accessor_id().hash(state); } self.use_constant_size_upload_parts.hash(state); self.list_is_lexically_ordered.hash(state); @@ -259,7 +278,7 @@ impl PartialEq for ObjectStoreParams { } // For equality, we use pointer comparison for ObjectStore, S3 credentials, wrapper - // For storage_options_provider, we use provider_id() for semantic equality + // For accessor, we use accessor_id() for semantic equality self.block_size == other.block_size && self .object_store @@ -272,15 +291,14 @@ impl PartialEq for ObjectStoreParams { && self.s3_credentials_refresh_offset == other.s3_credentials_refresh_offset && self.object_store_wrapper.as_ref().map(Arc::as_ptr) == other.object_store_wrapper.as_ref().map(Arc::as_ptr) - && self.storage_options == other.storage_options && self - .storage_options_provider + .storage_options_accessor .as_ref() - .map(|p| p.provider_id()) + .map(|a| a.accessor_id()) == other - .storage_options_provider + 
.storage_options_accessor .as_ref() - .map(|p| p.provider_id()) + .map(|a| a.accessor_id()) && self.use_constant_size_upload_parts == other.use_constant_size_upload_parts && self.list_is_lexically_ordered == other.list_is_lexically_ordered } @@ -410,7 +428,7 @@ impl ObjectStore { if let Some((store, path)) = params.object_store.as_ref() { let mut inner = store.clone(); let store_prefix = - registry.calculate_object_store_prefix(uri, params.storage_options.as_ref())?; + registry.calculate_object_store_prefix(uri, params.storage_options())?; if let Some(wrapper) = params.object_store_wrapper.as_ref() { inner = wrapper.wrap(&store_prefix, inner); } @@ -429,6 +447,7 @@ impl ObjectStore { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count: DEFAULT_DOWNLOAD_RETRY_COUNT, io_tracker, + store_prefix: String::new(), // custom object store, no prefix needed }; let path = Path::parse(path.path())?; return Ok((Arc::new(store), path)); @@ -859,13 +878,12 @@ impl ObjectStore { let scheme = location.scheme(); let block_size = block_size.unwrap_or_else(|| infer_block_size(scheme)); + let store_prefix = DEFAULT_OBJECT_STORE_REGISTRY + .calculate_object_store_prefix(location.as_ref(), storage_options) + .unwrap_or_default(); + let store = match wrapper { - Some(wrapper) => { - let store_prefix = DEFAULT_OBJECT_STORE_REGISTRY - .calculate_object_store_prefix(location.as_ref(), storage_options) - .unwrap(); - wrapper.wrap(&store_prefix, store) - } + Some(wrapper) => wrapper.wrap(&store_prefix, store), None => store, }; @@ -883,6 +901,7 @@ impl ObjectStore { io_parallelism, download_retry_count, io_tracker, + store_prefix, } } } @@ -974,8 +993,11 @@ mod tests { ) { // Test the default let registry = Arc::new(ObjectStoreRegistry::default()); + let accessor = storage_options + .clone() + .map(|opts| Arc::new(StorageOptionsAccessor::with_static_options(opts))); let params = ObjectStoreParams { - storage_options: storage_options.clone(), + storage_options_accessor: 
accessor.clone(), ..ObjectStoreParams::default() }; let (store, _) = ObjectStore::from_uri_and_params(registry, uri, ¶ms) @@ -987,7 +1009,7 @@ mod tests { let registry = Arc::new(ObjectStoreRegistry::default()); let params = ObjectStoreParams { block_size: Some(1024), - storage_options: storage_options.clone(), + storage_options_accessor: accessor, ..ObjectStoreParams::default() }; let (store, _) = ObjectStore::from_uri_and_params(registry, uri, ¶ms) diff --git a/rust/lance-io/src/object_store/providers.rs b/rust/lance-io/src/object_store/providers.rs index 17cbb3900d2..032c979c134 100644 --- a/rust/lance-io/src/object_store/providers.rs +++ b/rust/lance-io/src/object_store/providers.rs @@ -172,7 +172,7 @@ impl ObjectStoreRegistry { }; let cache_path = - provider.calculate_object_store_prefix(&base_path, params.storage_options.as_ref())?; + provider.calculate_object_store_prefix(&base_path, params.storage_options())?; let cache_key = (cache_path.clone(), params.clone()); // Check if we have a cached store for this base path and params diff --git a/rust/lance-io/src/object_store/providers/aws.rs b/rust/lance-io/src/object_store/providers/aws.rs index 9bd93bf029a..982470581f2 100644 --- a/rust/lance-io/src/object_store/providers/aws.rs +++ b/rust/lance-io/src/object_store/providers/aws.rs @@ -28,8 +28,9 @@ use tokio::sync::RwLock; use url::Url; use crate::object_store::{ - ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, StorageOptionsProvider, - DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, DEFAULT_MAX_IOP_SIZE, + ObjectStore, ObjectStoreParams, ObjectStoreProvider, StorageOptions, StorageOptionsAccessor, + StorageOptionsProvider, DEFAULT_CLOUD_BLOCK_SIZE, DEFAULT_CLOUD_IO_PARALLELISM, + DEFAULT_MAX_IOP_SIZE, }; use lance_core::error::{Error, Result}; @@ -54,13 +55,16 @@ impl AwsStoreProvider { let mut s3_storage_options = storage_options.as_s3_options(); let region = resolve_s3_region(base_path, &s3_storage_options).await?; + + // Get 
accessor from params + let accessor = params.get_accessor(); + let (aws_creds, region) = build_aws_credential( params.s3_credentials_refresh_offset, params.aws_credentials.clone(), Some(&s3_storage_options), region, - params.storage_options_provider.clone(), - storage_options.expires_at_millis(), + accessor, ) .await?; @@ -132,7 +136,7 @@ impl ObjectStoreProvider for AwsStoreProvider { ) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_s3(); let download_retry_count = storage_options.download_retry_count(); @@ -171,6 +175,8 @@ impl ObjectStoreProvider for AwsStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } } @@ -226,20 +232,17 @@ async fn resolve_s3_region( /// Build AWS credentials /// /// This resolves credentials from the following sources in order: -/// 1. An explicit `storage_options_provider` +/// 1. An explicit `storage_options_accessor` with a provider /// 2. An explicit `credentials` provider /// 3. Explicit credentials in storage_options (as in `aws_access_key_id`, /// `aws_secret_access_key`, `aws_session_token`) /// 4. The default credential provider chain from AWS SDK. /// -/// # Initial Credentials with Storage Options Provider +/// # Storage Options Accessor /// -/// When `storage_options_provider` is provided along with `storage_options` and -/// `expires_at_millis`, these serve as **initial values** to avoid redundant calls to -/// fetch new storage options. The provider will use these initial credentials until they -/// expire (based on `expires_at_millis`), then automatically fetch fresh credentials from -/// the provider. 
Once the initial credentials expire, the passed-in values are no longer -/// used - all future credentials come from the provider's `fetch_storage_options()` method. +/// When `storage_options_accessor` is provided and has a dynamic provider, +/// credentials are fetched and cached by the accessor with automatic refresh +/// before expiration. /// /// `credentials_refresh_offset` is the amount of time before expiry to refresh credentials. pub async fn build_aws_credential( @@ -247,10 +250,8 @@ pub async fn build_aws_credential( credentials: Option, storage_options: Option<&HashMap>, region: Option, - storage_options_provider: Option>, - expires_at_millis: Option, + storage_options_accessor: Option>, ) -> Result<(AwsCredentialProvider, String)> { - // TODO: make this return no credential provider not using AWS use aws_config::meta::region::RegionProviderChain; const DEFAULT_REGION: &str = "us-west-2"; @@ -266,17 +267,24 @@ pub async fn build_aws_credential( }; let storage_options_credentials = storage_options.and_then(extract_static_s3_credentials); - if let Some(storage_options_provider) = storage_options_provider { - let creds = build_aws_credential_with_storage_options_provider( - storage_options_provider, - credentials_refresh_offset, - credentials, - storage_options_credentials, - expires_at_millis, - ) - .await?; - Ok((creds, region)) - } else if let Some(creds) = credentials { + + // If accessor has a provider, use DynamicStorageOptionsCredentialProvider + if let Some(accessor) = storage_options_accessor { + if accessor.has_provider() { + // Explicit aws_credentials takes precedence + if let Some(creds) = credentials { + return Ok((creds, region)); + } + // Use accessor for dynamic credential refresh + return Ok(( + Arc::new(DynamicStorageOptionsCredentialProvider::new(accessor)), + region, + )); + } + } + + // Fall back to existing logic for static credentials + if let Some(creds) = credentials { Ok((creds, region)) } else if let Some(creds) = 
storage_options_credentials { Ok((Arc::new(creds), region)) @@ -293,58 +301,6 @@ pub async fn build_aws_credential( } } -async fn build_aws_credential_with_storage_options_provider( - storage_options_provider: Arc, - credentials_refresh_offset: Duration, - credentials: Option, - storage_options_credentials: Option>, - expires_at_millis: Option, -) -> Result { - match (expires_at_millis, credentials, storage_options_credentials) { - // Case 1: provider + credentials + expiration time - (Some(expires_at), Some(cred), _) => { - Ok(Arc::new( - DynamicStorageOptionsCredentialProvider::new_with_initial_credential( - storage_options_provider, - credentials_refresh_offset, - cred.get_credential().await?, - expires_at, - ), - )) - } - // Case 2: provider + storage_options (with valid credentials) + expiration time - (Some(expires_at), None, Some(cred)) => { - Ok(Arc::new( - DynamicStorageOptionsCredentialProvider::new_with_initial_credential( - storage_options_provider, - credentials_refresh_offset, - cred.get_credential().await?, - expires_at, - ), - )) - } - // Case 3: provider + storage_options without expiration - FAIL - (None, None, Some(_)) => Err(Error::IO { - source: Box::new(std::io::Error::other( - "expires_at_millis is required when using storage_options_provider with storage_options", - )), - location: location!(), - }), - // Case 4: provider + credentials without expiration - FAIL - (None, Some(_), _) => Err(Error::IO { - source: Box::new(std::io::Error::other( - "expires_at_millis is required when using storage_options_provider with credentials", - )), - location: location!(), - }), - // Case 5: provider without credentials/storage_options, or with expiration but no creds/opts - (_, None, None) => Ok(Arc::new(DynamicStorageOptionsCredentialProvider::new( - storage_options_provider, - credentials_refresh_offset, - ))), - } -} - fn extract_static_s3_credentials( options: &HashMap, ) -> Option> { @@ -487,20 +443,24 @@ impl ObjectStoreParams { aws_credentials: 
Option, region: Option, ) -> Self { + let storage_options_accessor = region.map(|region| { + let opts: HashMap = + [("region".into(), region)].iter().cloned().collect(); + Arc::new(StorageOptionsAccessor::with_static_options(opts)) + }); Self { aws_credentials, - storage_options: region - .map(|region| [("region".into(), region)].iter().cloned().collect()), + storage_options_accessor, ..Default::default() } } } -/// AWS Credential Provider that uses StorageOptionsProvider +/// AWS Credential Provider that delegates to StorageOptionsAccessor /// -/// This adapter converts our generic StorageOptionsProvider trait into -/// AWS-specific credentials that can be used with S3. It caches credentials -/// and automatically refreshes them before they expire. +/// This adapter converts storage options from a [`StorageOptionsAccessor`] into +/// AWS-specific credentials that can be used with S3. All caching and refresh logic +/// is handled by the accessor. /// /// # Future Work /// @@ -510,128 +470,71 @@ impl ObjectStoreParams { /// /// See: pub struct DynamicStorageOptionsCredentialProvider { - provider: Arc, - cache: Arc>>, - refresh_offset: Duration, + accessor: Arc, } impl fmt::Debug for DynamicStorageOptionsCredentialProvider { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("DynamicStorageOptionsCredentialProvider") - .field("provider", &self.provider) - .field("refresh_offset", &self.refresh_offset) + .field("accessor", &self.accessor) .finish() } } -#[derive(Debug, Clone)] -struct CachedCredential { - credential: Arc, - expires_at_millis: Option, -} - impl DynamicStorageOptionsCredentialProvider { - /// Create a new credential provider without initial credentials + /// Create a new credential provider from a storage options accessor + pub fn new(accessor: Arc) -> Self { + Self { accessor } + } + + /// Create a new credential provider from a storage options provider + /// + /// This is a convenience constructor for backward compatibility. 
+ /// The refresh offset will be extracted from storage options using + /// the `refresh_offset_millis` key, defaulting to 60 seconds. /// /// # Arguments /// * `provider` - The storage options provider - /// * `refresh_offset` - Duration before expiry to refresh credentials - pub fn new(provider: Arc, refresh_offset: Duration) -> Self { + pub fn from_provider(provider: Arc) -> Self { Self { - provider, - cache: Arc::new(RwLock::new(None)), - refresh_offset, + accessor: Arc::new(StorageOptionsAccessor::with_provider(provider)), } } - /// Create a new credential provider with initial credentials from an explicit credential + /// Create a new credential provider with initial credentials + /// + /// This is a convenience constructor for backward compatibility. + /// The refresh offset will be extracted from initial_options using + /// the `refresh_offset_millis` key, defaulting to 60 seconds. /// /// # Arguments /// * `provider` - The storage options provider - /// * `refresh_offset` - Duration before expiry to refresh credentials - /// * `credential` - Initial credential to cache - /// * `expires_at_millis` - Expiration time in milliseconds since epoch (required for refresh) - pub fn new_with_initial_credential( + /// * `initial_options` - Initial storage options to cache + pub fn from_provider_with_initial( provider: Arc, - refresh_offset: Duration, - credential: Arc, - expires_at_millis: u64, + initial_options: HashMap, ) -> Self { Self { - provider, - cache: Arc::new(RwLock::new(Some(CachedCredential { - credential, - expires_at_millis: Some(expires_at_millis), - }))), - refresh_offset, - } - } - - fn needs_refresh(&self, cached: &Option) -> bool { - match cached { - None => true, - Some(cached_cred) => { - if let Some(expires_at_millis) = cached_cred.expires_at_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap_or(Duration::from_secs(0)) - .as_millis() as u64; - - // Refresh if we're within the refresh offset of expiration - let 
refresh_offset_millis = self.refresh_offset.as_millis() as u64; - now_ms + refresh_offset_millis >= expires_at_millis - } else { - // No expiration means credentials never expire - false - } - } + accessor: Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, + provider, + )), } } +} - async fn do_get_credential(&self) -> ObjectStoreResult>> { - // Check if we have valid cached credentials with read lock - { - let cached = self.cache.read().await; - if !self.needs_refresh(&cached) { - if let Some(cached_cred) = &*cached { - return Ok(Some(cached_cred.credential.clone())); - } - } - } - - // Try to acquire write lock - if it fails, return None and let caller retry - let Ok(mut cache) = self.cache.try_write() else { - return Ok(None); - }; - - // Double-check if credentials are still stale after acquiring write lock - // (another thread might have refreshed them) - if !self.needs_refresh(&cache) { - if let Some(cached_cred) = &*cache { - return Ok(Some(cached_cred.credential.clone())); - } - } - - log::debug!( - "Refreshing S3 credentials from storage options provider: {}", - self.provider.provider_id() - ); +#[async_trait::async_trait] +impl CredentialProvider for DynamicStorageOptionsCredentialProvider { + type Credential = ObjectStoreAwsCredential; - let storage_options_map = self - .provider - .fetch_storage_options() - .await - .map_err(|e| object_store::Error::Generic { + async fn get_credential(&self) -> ObjectStoreResult> { + let storage_options = self.accessor.get_storage_options().await.map_err(|e| { + object_store::Error::Generic { store: "DynamicStorageOptionsCredentialProvider", source: Box::new(e), - })? 
- .ok_or_else(|| object_store::Error::Generic { - store: "DynamicStorageOptionsCredentialProvider", - source: "No storage options available".into(), - })?; + } + })?; - let storage_options = StorageOptions(storage_options_map); - let expires_at_millis = storage_options.expires_at_millis(); let s3_options = storage_options.as_s3_options(); let static_creds = extract_static_s3_credentials(&s3_options).ok_or_else(|| { object_store::Error::Generic { @@ -640,58 +543,13 @@ impl DynamicStorageOptionsCredentialProvider { } })?; - let credential = - static_creds - .get_credential() - .await - .map_err(|e| object_store::Error::Generic { - store: "DynamicStorageOptionsCredentialProvider", - source: Box::new(e), - })?; - - if let Some(expires_at) = expires_at_millis { - let now_ms = SystemTime::now() - .duration_since(UNIX_EPOCH) - .unwrap_or(Duration::from_secs(0)) - .as_millis() as u64; - let expires_in_secs = (expires_at.saturating_sub(now_ms)) / 1000; - log::debug!( - "Successfully refreshed S3 credentials from provider: {}, credentials expire in {} seconds", - self.provider.provider_id(), - expires_in_secs - ); - } else { - log::debug!( - "Successfully refreshed S3 credentials from provider: {} (no expiration)", - self.provider.provider_id() - ); - } - - *cache = Some(CachedCredential { - credential: credential.clone(), - expires_at_millis, - }); - - Ok(Some(credential)) - } -} - -#[async_trait::async_trait] -impl CredentialProvider for DynamicStorageOptionsCredentialProvider { - type Credential = ObjectStoreAwsCredential; - - async fn get_credential(&self) -> ObjectStoreResult> { - // Retry loop - if do_get_credential returns None (lock busy), retry from the beginning - loop { - match self.do_get_credential().await? 
{ - Some(cred) => return Ok(cred), - None => { - // Lock was busy, wait 10ms before retrying - tokio::time::sleep(Duration::from_millis(10)).await; - continue; - } - } - } + static_creds + .get_credential() + .await + .map_err(|e| object_store::Error::Generic { + store: "DynamicStorageOptionsCredentialProvider", + source: Box::new(e), + }) } } @@ -813,13 +671,16 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use crate::object_store::StorageOptionsAccessor; let provider = AwsStoreProvider; let url = Url::parse("s3://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ("region".to_string(), "us-west-2".to_string()), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ("region".to_string(), "us-west-2".to_string()), + ]), + ))), ..Default::default() }; @@ -896,19 +757,22 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create credential provider with initial cached credentials that expire in 10 minutes + // Create initial options with cached credentials that expire in 10 minutes let expires_at = now_ms + 600_000; // 10 minutes from now - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_CACHED".to_string(), - secret_key: "SECRET_CACHED".to_string(), - token: Some("TOKEN_CACHED".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_CACHED".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_CACHED".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_CACHED".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + let provider = 
DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - expires_at, + initial_options, ); // First call should use cached credentials (not expired yet) @@ -932,19 +796,21 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create credential provider with initial cached credentials that expired 1 second ago + // Create initial options with credentials that expired 1 second ago let expired_time = now_ms - 1_000; // 1 second ago - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_EXPIRED".to_string(), - secret_key: "SECRET_EXPIRED".to_string(), - token: None, - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_EXPIRED".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_EXPIRED".to_string(), + ), + ("expires_at_millis".to_string(), expired_time.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - expired_time, + initial_options, ); // First call should fetch new credentials because cached ones are expired @@ -961,27 +827,24 @@ mod tests { async fn test_dynamic_credential_provider_refresh_lead_time() { MockClock::set_system_time(Duration::from_secs(100_000)); - // Create a mock provider that returns credentials expiring in 4 minutes + // Create a mock provider that returns credentials expiring in 30 seconds let mock = Arc::new(MockStorageOptionsProvider::new(Some( - 240_000, // Expires in 4 minutes + 30_000, // Expires in 30 seconds ))); - // Create credential provider with 5 minute refresh offset - // This means credentials should be refreshed when they have less than 5 minutes 
left - let provider = DynamicStorageOptionsCredentialProvider::new( - mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - ); + // Create credential provider with default 60 second refresh offset + // This means credentials should be refreshed when they have less than 60 seconds left + let provider = DynamicStorageOptionsCredentialProvider::from_provider(mock.clone()); // First call should fetch credentials from provider (no initial cache) - // Credentials expire in 4 minutes, which is less than our 5 minute refresh offset, + // Credentials expire in 30 seconds, which is less than our 60 second refresh offset, // so they should be considered "needs refresh" immediately let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_1"); assert_eq!(mock.get_call_count().await, 1); - // Second call should trigger refresh because credentials expire in 4 minutes - // but our refresh lead time is 5 minutes (now + 5min > expires_at) + // Second call should trigger refresh because credentials expire in 30 seconds + // but our refresh lead time is 60 seconds (now + 60sec > expires_at) // The mock will return new credentials (AKID_2) with the same expiration let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_2"); @@ -992,16 +855,13 @@ mod tests { async fn test_dynamic_credential_provider_no_initial_cache() { MockClock::set_system_time(Duration::from_secs(100_000)); - // Create a mock provider that returns credentials expiring in 10 minutes + // Create a mock provider that returns credentials expiring in 2 minutes let mock = Arc::new(MockStorageOptionsProvider::new(Some( - 600_000, // Expires in 10 minutes + 120_000, // Expires in 2 minutes ))); - // Create credential provider without initial cache - let provider = DynamicStorageOptionsCredentialProvider::new( - mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - ); + // Create credential provider without initial cache, using default 60 
second refresh offset + let provider = DynamicStorageOptionsCredentialProvider::from_provider(mock.clone()); // First call should fetch from provider (call count = 1) let cred = provider.get_credential().await.unwrap(); @@ -1010,21 +870,22 @@ mod tests { assert_eq!(cred.token, Some("TOKEN_1".to_string())); assert_eq!(mock.get_call_count().await, 1); - // Second call should use cached credentials (not expired yet) + // Second call should use cached credentials (not expired yet, still > 60 seconds remaining) let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_1"); assert_eq!(mock.get_call_count().await, 1); // Still 1, didn't fetch again - // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) - MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + // Advance time to 90 seconds - should trigger refresh (within 60 sec refresh offset) + // At this point, credentials expire in 30 seconds (< 60 sec offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 90)); let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_2"); assert_eq!(cred.secret_key, "SECRET_2"); assert_eq!(cred.token, Some("TOKEN_2".to_string())); assert_eq!(mock.get_call_count().await, 2); - // Advance time to 11 minutes total - should trigger another refresh - MockClock::set_system_time(Duration::from_secs(100_000 + 660)); + // Advance time to 210 seconds total (90 + 120) - should trigger another refresh + MockClock::set_system_time(Duration::from_secs(100_000 + 210)); let cred = provider.get_credential().await.unwrap(); assert_eq!(cred.key_id, "AKID_3"); assert_eq!(cred.secret_key, "SECRET_3"); @@ -1032,7 +893,7 @@ mod tests { } #[tokio::test] - async fn test_dynamic_credential_provider_with_initial_credential() { + async fn test_dynamic_credential_provider_with_initial_options() { MockClock::set_system_time(Duration::from_secs(100_000)); let now_ms = MockClock::system_time().as_millis() as u64; 
@@ -1042,20 +903,23 @@ mod tests { 600_000, // Expires in 10 minutes ))); - // Create an initial credential with expiration in 10 minutes + // Create initial options with expiration in 10 minutes let expires_at = now_ms + 600_000; // 10 minutes from now - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_INITIAL".to_string(), - secret_key: "SECRET_INITIAL".to_string(), - token: Some("TOKEN_INITIAL".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_INITIAL".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_INITIAL".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_INITIAL".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); - // Create credential provider with initial credential and expiration - let provider = DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + // Create credential provider with initial options + let provider = DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), // 5 minute refresh offset - initial_cred, - expires_at, + initial_options, ); // First call should use the initial credential (not expired yet) @@ -1104,9 +968,8 @@ mod tests { // Create a mock provider with far future expiration let mock = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); - let provider = Arc::new(DynamicStorageOptionsCredentialProvider::new( + let provider = Arc::new(DynamicStorageOptionsCredentialProvider::from_provider( mock.clone(), - Duration::from_secs(300), )); // Spawn 10 concurrent tasks that all try to get credentials at the same time @@ -1152,14 +1015,18 @@ mod tests { let now_ms = MockClock::system_time().as_millis() as u64; - // Create initial credentials that expired in the past (1000 seconds ago) + // Create initial options with credentials that 
expired in the past (1000 seconds ago) let expires_at = now_ms - 1_000_000; - - let initial_cred = Arc::new(ObjectStoreAwsCredential { - key_id: "AKID_OLD".to_string(), - secret_key: "SECRET_OLD".to_string(), - token: Some("TOKEN_OLD".to_string()), - }); + let initial_options = HashMap::from([ + ("aws_access_key_id".to_string(), "AKID_OLD".to_string()), + ( + "aws_secret_access_key".to_string(), + "SECRET_OLD".to_string(), + ), + ("aws_session_token".to_string(), "TOKEN_OLD".to_string()), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); // Mock will return credentials expiring in 1 hour let mock = Arc::new(MockStorageOptionsProvider::new(Some( @@ -1167,11 +1034,9 @@ mod tests { ))); let provider = Arc::new( - DynamicStorageOptionsCredentialProvider::new_with_initial_credential( + DynamicStorageOptionsCredentialProvider::from_provider_with_initial( mock.clone(), - Duration::from_secs(300), - initial_cred, - expires_at, + initial_options, ), ); @@ -1217,4 +1082,112 @@ mod tests { call_count ); } + + #[tokio::test] + async fn test_explicit_aws_credentials_takes_precedence_over_accessor() { + // Create a mock storage options provider that should NOT be called + let mock_storage_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + // Create an accessor with the mock provider + let accessor = Arc::new(StorageOptionsAccessor::with_provider( + mock_storage_provider.clone(), + )); + + // Create an explicit AWS credentials provider + let explicit_cred_provider = Arc::new(MockAwsCredentialsProvider::default()); + + // Build credentials with both aws_credentials AND accessor + // The explicit aws_credentials should take precedence + let (result, _region) = build_aws_credential( + Duration::from_secs(300), + Some(explicit_cred_provider.clone() as AwsCredentialProvider), + None, // no storage_options + Some("us-west-2".to_string()), + Some(accessor), + 
) + .await + .unwrap(); + + // Get credential from the result + let cred = result.get_credential().await.unwrap(); + + // The explicit provider should have been called (it returns empty strings) + assert!(explicit_cred_provider.called.load(Ordering::Relaxed)); + + // The storage options provider should NOT have been called + assert_eq!( + mock_storage_provider.get_call_count().await, + 0, + "Storage options provider should not be called when explicit aws_credentials is provided" + ); + + // Verify we got credentials from the explicit provider (empty strings) + assert_eq!(cred.key_id, ""); + assert_eq!(cred.secret_key, ""); + } + + #[tokio::test] + async fn test_accessor_used_when_no_explicit_aws_credentials() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + + // Create a mock storage options provider + let mock_storage_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + // Create initial options + let expires_at = now_ms + 600_000; // 10 minutes from now + let initial_options = HashMap::from([ + ( + "aws_access_key_id".to_string(), + "AKID_FROM_ACCESSOR".to_string(), + ), + ( + "aws_secret_access_key".to_string(), + "SECRET_FROM_ACCESSOR".to_string(), + ), + ( + "aws_session_token".to_string(), + "TOKEN_FROM_ACCESSOR".to_string(), + ), + ("expires_at_millis".to_string(), expires_at.to_string()), + ("refresh_offset_millis".to_string(), "300000".to_string()), // 5 minute refresh offset + ]); + + // Create an accessor with initial options and provider + let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( + initial_options, + mock_storage_provider.clone(), + )); + + // Build credentials with accessor but NO explicit aws_credentials + let (result, _region) = build_aws_credential( + Duration::from_secs(300), + None, // no explicit aws_credentials + None, // no storage_options + Some("us-west-2".to_string()), + Some(accessor), + ) + .await + .unwrap(); 
+ + // Get credential - should use the initial accessor credentials + let cred = result.get_credential().await.unwrap(); + assert_eq!(cred.key_id, "AKID_FROM_ACCESSOR"); + assert_eq!(cred.secret_key, "SECRET_FROM_ACCESSOR"); + + // Storage options provider should NOT have been called yet (using cached initial creds) + assert_eq!(mock_storage_provider.get_call_count().await, 0); + + // Advance time to trigger refresh (past the 5 minute refresh offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + + // Get credential again - should now fetch from provider + let cred = result.get_credential().await.unwrap(); + assert_eq!(cred.key_id, "AKID_1"); + assert_eq!(cred.secret_key, "SECRET_1"); + + // Storage options provider should have been called once + assert_eq!(mock_storage_provider.get_call_count().await, 1); + } } diff --git a/rust/lance-io/src/object_store/providers/azure.rs b/rust/lance-io/src/object_store/providers/azure.rs index 7a90fc6744a..7bf566c8972 100644 --- a/rust/lance-io/src/object_store/providers/azure.rs +++ b/rust/lance-io/src/object_store/providers/azure.rs @@ -95,7 +95,7 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_azure(); let download_retry_count = storage_options.download_retry_count(); @@ -123,6 +123,8 @@ impl ObjectStoreProvider for AzureBlobStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } @@ -230,21 +232,24 @@ mod tests { #[tokio::test] async fn test_use_opendal_flag() { + use 
crate::object_store::StorageOptionsAccessor; let provider = AzureBlobStoreProvider; let url = Url::parse("az://test-container/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ("account_name".to_string(), "test_account".to_string()), - ( - "endpoint".to_string(), - "https://test_account.blob.core.windows.net".to_string(), - ), - ( - "account_key".to_string(), - "dGVzdF9hY2NvdW50X2tleQ==".to_string(), - ), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ("account_name".to_string(), "test_account".to_string()), + ( + "endpoint".to_string(), + "https://test_account.blob.core.windows.net".to_string(), + ), + ( + "account_key".to_string(), + "dGVzdF9hY2NvdW50X2tleQ==".to_string(), + ), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/gcp.rs b/rust/lance-io/src/object_store/providers/gcp.rs index 038015d7f4e..dba5cd8dd40 100644 --- a/rust/lance-io/src/object_store/providers/gcp.rs +++ b/rust/lance-io/src/object_store/providers/gcp.rs @@ -96,7 +96,7 @@ impl ObjectStoreProvider for GcsStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); let mut storage_options = - StorageOptions(params.storage_options.clone().unwrap_or_default()); + StorageOptions(params.storage_options().cloned().unwrap_or_default()); storage_options.with_env_gcs(); let download_retry_count = storage_options.download_retry_count(); @@ -124,6 +124,8 @@ impl ObjectStoreProvider for GcsStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } } @@ -180,16 +182,19 @@ mod tests { 
#[tokio::test] async fn test_use_opendal_flag() { + use crate::object_store::StorageOptionsAccessor; let provider = GcsStoreProvider; let url = Url::parse("gs://test-bucket/path").unwrap(); let params_with_flag = ObjectStoreParams { - storage_options: Some(HashMap::from([ - ("use_opendal".to_string(), "true".to_string()), - ( - "service_account".to_string(), - "test@example.iam.gserviceaccount.com".to_string(), - ), - ])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([ + ("use_opendal".to_string(), "true".to_string()), + ( + "service_account".to_string(), + "test@example.iam.gserviceaccount.com".to_string(), + ), + ]), + ))), ..Default::default() }; diff --git a/rust/lance-io/src/object_store/providers/huggingface.rs b/rust/lance-io/src/object_store/providers/huggingface.rs index c52c85a3c72..55c5f6d50b9 100644 --- a/rust/lance-io/src/object_store/providers/huggingface.rs +++ b/rust/lance-io/src/object_store/providers/huggingface.rs @@ -65,7 +65,7 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { } = parse_hf_url(&base_path)?; let block_size = params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); // Build OpenDAL config with allowed keys only. 
@@ -114,6 +114,8 @@ impl ObjectStoreProvider for HuggingfaceStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } @@ -157,12 +159,13 @@ mod tests { #[test] fn storage_option_revision_takes_precedence() { + use crate::object_store::StorageOptionsAccessor; + use std::sync::Arc; let url = Url::parse("hf://datasets/acme/repo/data/file").unwrap(); let params = ObjectStoreParams { - storage_options: Some(HashMap::from([( - String::from("hf_revision"), - String::from("stable"), - )])), + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( + HashMap::from([(String::from("hf_revision"), String::from("stable"))]), + ))), ..Default::default() }; // new_store should accept without creating operator; test precedence via builder config @@ -175,8 +178,7 @@ mod tests { config_map.insert("repo_type".to_string(), repo_type); config_map.insert("repo".to_string(), repo_id); if let Some(rev) = params - .storage_options - .as_ref() + .storage_options() .unwrap() .get("hf_revision") .cloned() diff --git a/rust/lance-io/src/object_store/providers/local.rs b/rust/lance-io/src/object_store/providers/local.rs index 74f2777992b..78c8c9632c4 100644 --- a/rust/lance-io/src/object_store/providers/local.rs +++ b/rust/lance-io/src/object_store/providers/local.rs @@ -20,7 +20,7 @@ pub struct FileStoreProvider; impl ObjectStoreProvider for FileStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_LOCAL_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); Ok(ObjectStore { inner: 
Arc::new(LocalFileSystem::new()), @@ -32,6 +32,8 @@ impl ObjectStoreProvider for FileStoreProvider { io_parallelism: DEFAULT_LOCAL_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } diff --git a/rust/lance-io/src/object_store/providers/memory.rs b/rust/lance-io/src/object_store/providers/memory.rs index 9519806ed70..addc2fafc80 100644 --- a/rust/lance-io/src/object_store/providers/memory.rs +++ b/rust/lance-io/src/object_store/providers/memory.rs @@ -17,9 +17,9 @@ pub struct MemoryStoreProvider; #[async_trait::async_trait] impl ObjectStoreProvider for MemoryStoreProvider { - async fn new_store(&self, _base_path: Url, params: &ObjectStoreParams) -> Result { + async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = params.block_size.unwrap_or(DEFAULT_LOCAL_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let download_retry_count = storage_options.download_retry_count(); Ok(ObjectStore { inner: Arc::new(InMemory::new()), @@ -31,6 +31,8 @@ impl ObjectStoreProvider for MemoryStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count, io_tracker: Default::default(), + store_prefix: self + .calculate_object_store_prefix(&base_path, params.storage_options())?, }) } diff --git a/rust/lance-io/src/object_store/providers/oss.rs b/rust/lance-io/src/object_store/providers/oss.rs index 3437ec8d1b6..80f161b233e 100644 --- a/rust/lance-io/src/object_store/providers/oss.rs +++ b/rust/lance-io/src/object_store/providers/oss.rs @@ -22,7 +22,7 @@ pub struct OssStoreProvider; impl ObjectStoreProvider for OssStoreProvider { async fn new_store(&self, base_path: Url, params: &ObjectStoreParams) -> Result { let block_size = 
params.block_size.unwrap_or(DEFAULT_CLOUD_BLOCK_SIZE); - let storage_options = StorageOptions(params.storage_options.clone().unwrap_or_default()); + let storage_options = StorageOptions(params.storage_options().cloned().unwrap_or_default()); let bucket = base_path .host_str() @@ -103,6 +103,7 @@ impl ObjectStoreProvider for OssStoreProvider { io_parallelism: DEFAULT_CLOUD_IO_PARALLELISM, download_retry_count: storage_options.download_retry_count(), io_tracker: Default::default(), + store_prefix: self.calculate_object_store_prefix(&url, params.storage_options())?, }) } } diff --git a/rust/lance-io/src/object_store/storage_options.rs b/rust/lance-io/src/object_store/storage_options.rs index 22854e8fd53..d0f5cc20e93 100644 --- a/rust/lance-io/src/object_store/storage_options.rs +++ b/rust/lance-io/src/object_store/storage_options.rs @@ -1,25 +1,42 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright The Lance Authors -//! Storage options provider for dynamic credential fetching +//! Storage options provider and accessor for dynamic credential fetching //! -//! This module provides a trait for fetching storage options from various sources -//! (namespace servers, secret managers, etc.) with support for expiration tracking -//! and automatic refresh. +//! This module provides: +//! - [`StorageOptionsProvider`] trait for fetching storage options from various sources +//! (namespace servers, secret managers, etc.) with support for expiration tracking +//! - [`StorageOptionsAccessor`] for unified access to storage options with automatic +//! 
caching and refresh use std::collections::HashMap; use std::fmt; use std::sync::Arc; +use std::time::Duration; + +#[cfg(test)] +use mock_instant::thread_local::{SystemTime, UNIX_EPOCH}; + +#[cfg(not(test))] +use std::time::{SystemTime, UNIX_EPOCH}; -use crate::{Error, Result}; use async_trait::async_trait; use lance_namespace::models::DescribeTableRequest; use lance_namespace::LanceNamespace; use snafu::location; +use tokio::sync::RwLock; + +use crate::{Error, Result}; /// Key for the expiration timestamp in storage options HashMap pub const EXPIRES_AT_MILLIS_KEY: &str = "expires_at_millis"; +/// Key for the refresh offset in storage options HashMap (milliseconds before expiry to refresh) +pub const REFRESH_OFFSET_MILLIS_KEY: &str = "refresh_offset_millis"; + +/// Default refresh offset: 60 seconds before expiration +const DEFAULT_REFRESH_OFFSET_MILLIS: u64 = 60_000; + /// Trait for providing storage options with expiration tracking /// /// Implementations can fetch storage options from various sources (namespace servers, @@ -139,3 +156,558 @@ impl StorageOptionsProvider for LanceNamespaceStorageOptionsProvider { ) } } + +/// Unified access to storage options with automatic caching and refresh +/// +/// This struct bundles static storage options with an optional dynamic provider, +/// handling all caching and refresh logic internally. It provides a single entry point +/// for accessing storage options regardless of whether they're static or dynamic. +/// +/// # Behavior +/// +/// - If only static options are provided, returns those options +/// - If a provider is configured, fetches from provider and caches results +/// - Automatically refreshes cached options before expiration (based on refresh_offset) +/// - Uses `expires_at_millis` key to track expiration +/// +/// # Thread Safety +/// +/// The accessor is thread-safe and can be shared across multiple tasks. +/// Concurrent refresh attempts are deduplicated using a try-lock mechanism. 
+pub struct StorageOptionsAccessor { + /// Initial/fallback static storage options + initial_options: Option>, + + /// Optional dynamic provider for refreshing options + provider: Option>, + + /// Cached storage options with expiration tracking + cache: Arc>>, + + /// Duration before expiry to trigger refresh + refresh_offset: Duration, +} + +impl fmt::Debug for StorageOptionsAccessor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StorageOptionsAccessor") + .field("has_initial_options", &self.initial_options.is_some()) + .field("has_provider", &self.provider.is_some()) + .field("refresh_offset", &self.refresh_offset) + .finish() + } +} + +#[derive(Debug, Clone)] +struct CachedStorageOptions { + options: HashMap, + expires_at_millis: Option, +} + +impl StorageOptionsAccessor { + /// Extract refresh offset from storage options, or use default + fn extract_refresh_offset(options: &HashMap) -> Duration { + options + .get(REFRESH_OFFSET_MILLIS_KEY) + .and_then(|s| s.parse::().ok()) + .map(Duration::from_millis) + .unwrap_or(Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS)) + } + + /// Create an accessor with only static options (no refresh capability) + /// + /// The returned accessor will always return the provided options. + /// This is useful when credentials don't expire or are managed externally. + pub fn with_static_options(options: HashMap) -> Self { + let expires_at_millis = options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + let refresh_offset = Self::extract_refresh_offset(&options); + + Self { + initial_options: Some(options.clone()), + provider: None, + cache: Arc::new(RwLock::new(Some(CachedStorageOptions { + options, + expires_at_millis, + }))), + refresh_offset, + } + } + + /// Create an accessor with a dynamic provider (no initial options) + /// + /// The accessor will fetch from the provider on first access and cache + /// the results. Refresh happens automatically before expiration. 
+ /// Uses the default refresh offset (60 seconds) until options are fetched. + /// + /// # Arguments + /// * `provider` - The storage options provider for fetching fresh options + pub fn with_provider(provider: Arc) -> Self { + Self { + initial_options: None, + provider: Some(provider), + cache: Arc::new(RwLock::new(None)), + refresh_offset: Duration::from_millis(DEFAULT_REFRESH_OFFSET_MILLIS), + } + } + + /// Create an accessor with initial options and a dynamic provider + /// + /// Initial options are used until they expire, then the provider is called. + /// This avoids an immediate fetch when initial credentials are still valid. + /// The `refresh_offset_millis` key in initial_options controls refresh timing. + /// + /// # Arguments + /// * `initial_options` - Initial storage options to cache + /// * `provider` - The storage options provider for refreshing + pub fn with_initial_and_provider( + initial_options: HashMap, + provider: Arc, + ) -> Self { + let expires_at_millis = initial_options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + let refresh_offset = Self::extract_refresh_offset(&initial_options); + + Self { + initial_options: Some(initial_options.clone()), + provider: Some(provider), + cache: Arc::new(RwLock::new(Some(CachedStorageOptions { + options: initial_options, + expires_at_millis, + }))), + refresh_offset, + } + } + + /// Get current valid storage options + /// + /// - Returns cached options if not expired + /// - Fetches from provider if expired or not cached + /// - Falls back to initial_options if provider returns None + /// + /// # Errors + /// + /// Returns an error if: + /// - The provider fails to fetch options + /// - No options are available (no cache, no provider, no initial options) + pub async fn get_storage_options(&self) -> Result { + loop { + match self.do_get_storage_options().await? 
{ + Some(options) => return Ok(options), + None => { + // Lock was busy, wait 10ms before retrying + tokio::time::sleep(Duration::from_millis(10)).await; + continue; + } + } + } + } + + async fn do_get_storage_options(&self) -> Result> { + // Check if we have valid cached options with read lock + { + let cached = self.cache.read().await; + if !self.needs_refresh(&cached) { + if let Some(cached_opts) = &*cached { + return Ok(Some(super::StorageOptions(cached_opts.options.clone()))); + } + } + } + + // If no provider, return initial options or error + let Some(provider) = &self.provider else { + return if let Some(initial) = &self.initial_options { + Ok(Some(super::StorageOptions(initial.clone()))) + } else { + Err(Error::IO { + source: Box::new(std::io::Error::other("No storage options available")), + location: location!(), + }) + }; + }; + + // Try to acquire write lock - if it fails, return None and let caller retry + let Ok(mut cache) = self.cache.try_write() else { + return Ok(None); + }; + + // Double-check if options are still stale after acquiring write lock + // (another thread might have refreshed them) + if !self.needs_refresh(&cache) { + if let Some(cached_opts) = &*cache { + return Ok(Some(super::StorageOptions(cached_opts.options.clone()))); + } + } + + log::debug!( + "Refreshing storage options from provider: {}", + provider.provider_id() + ); + + let storage_options_map = + provider + .fetch_storage_options() + .await + .map_err(|e| Error::IO { + source: Box::new(std::io::Error::other(format!( + "Failed to fetch storage options: {}", + e + ))), + location: location!(), + })?; + + let Some(options) = storage_options_map else { + // Provider returned None, fall back to initial options + if let Some(initial) = &self.initial_options { + return Ok(Some(super::StorageOptions(initial.clone()))); + } + return Err(Error::IO { + source: Box::new(std::io::Error::other( + "Provider returned no storage options", + )), + location: location!(), + }); + }; + + let 
expires_at_millis = options + .get(EXPIRES_AT_MILLIS_KEY) + .and_then(|s| s.parse::().ok()); + + if let Some(expires_at) = expires_at_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or(Duration::from_secs(0)) + .as_millis() as u64; + let expires_in_secs = (expires_at.saturating_sub(now_ms)) / 1000; + log::debug!( + "Successfully refreshed storage options from provider: {}, options expire in {} seconds", + provider.provider_id(), + expires_in_secs + ); + } else { + log::debug!( + "Successfully refreshed storage options from provider: {} (no expiration)", + provider.provider_id() + ); + } + + *cache = Some(CachedStorageOptions { + options: options.clone(), + expires_at_millis, + }); + + Ok(Some(super::StorageOptions(options))) + } + + fn needs_refresh(&self, cached: &Option) -> bool { + match cached { + None => true, + Some(cached_opts) => { + if let Some(expires_at_millis) = cached_opts.expires_at_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or(Duration::from_secs(0)) + .as_millis() as u64; + + // Refresh if we're within the refresh offset of expiration + let refresh_offset_millis = self.refresh_offset.as_millis() as u64; + now_ms + refresh_offset_millis >= expires_at_millis + } else { + // No expiration means options never expire + false + } + } + } + } + + /// Get the initial storage options without refresh + /// + /// Returns the initial options that were provided when creating the accessor. + /// This does not trigger any refresh, even if the options have expired. + pub fn initial_storage_options(&self) -> Option<&HashMap> { + self.initial_options.as_ref() + } + + /// Get the accessor ID for equality/hashing + /// + /// Returns the provider_id if a provider exists, otherwise generates + /// a stable ID from the initial options hash. 
+ pub fn accessor_id(&self) -> String { + if let Some(provider) = &self.provider { + provider.provider_id() + } else if let Some(initial) = &self.initial_options { + // Generate a stable ID from initial options + use std::collections::hash_map::DefaultHasher; + use std::hash::{Hash, Hasher}; + + let mut hasher = DefaultHasher::new(); + let mut keys: Vec<_> = initial.keys().collect(); + keys.sort(); + for key in keys { + key.hash(&mut hasher); + initial.get(key).hash(&mut hasher); + } + format!("static_options_{:x}", hasher.finish()) + } else { + "empty_accessor".to_string() + } + } + + /// Check if this accessor has a dynamic provider + pub fn has_provider(&self) -> bool { + self.provider.is_some() + } + + /// Get the refresh offset duration + pub fn refresh_offset(&self) -> Duration { + self.refresh_offset + } + + /// Get the storage options provider, if any + pub fn provider(&self) -> Option<&Arc> { + self.provider.as_ref() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use mock_instant::thread_local::MockClock; + + #[derive(Debug)] + struct MockStorageOptionsProvider { + call_count: Arc>, + expires_in_millis: Option, + } + + impl MockStorageOptionsProvider { + fn new(expires_in_millis: Option) -> Self { + Self { + call_count: Arc::new(RwLock::new(0)), + expires_in_millis, + } + } + + async fn get_call_count(&self) -> usize { + *self.call_count.read().await + } + } + + #[async_trait] + impl StorageOptionsProvider for MockStorageOptionsProvider { + async fn fetch_storage_options(&self) -> Result>> { + let count = { + let mut c = self.call_count.write().await; + *c += 1; + *c + }; + + let mut options = HashMap::from([ + ("aws_access_key_id".to_string(), format!("AKID_{}", count)), + ( + "aws_secret_access_key".to_string(), + format!("SECRET_{}", count), + ), + ("aws_session_token".to_string(), format!("TOKEN_{}", count)), + ]); + + if let Some(expires_in) = self.expires_in_millis { + let now_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap() + 
.as_millis() as u64; + let expires_at = now_ms + expires_in; + options.insert(EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()); + } + + Ok(Some(options)) + } + + fn provider_id(&self) -> String { + let ptr = Arc::as_ptr(&self.call_count) as usize; + format!("MockStorageOptionsProvider {{ id: {} }}", ptr) + } + } + + #[tokio::test] + async fn test_static_options_only() { + let options = HashMap::from([ + ("key1".to_string(), "value1".to_string()), + ("key2".to_string(), "value2".to_string()), + ]); + let accessor = StorageOptionsAccessor::with_static_options(options.clone()); + + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0, options); + assert!(!accessor.has_provider()); + assert_eq!(accessor.initial_storage_options(), Some(&options)); + } + + #[tokio::test] + async fn test_provider_only() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); + + let result = accessor.get_storage_options().await.unwrap(); + assert!(result.0.contains_key("aws_access_key_id")); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert!(accessor.has_provider()); + assert_eq!(accessor.initial_storage_options(), None); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_initial_and_provider_uses_initial_first() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + 600_000; // 10 minutes from now + + let initial = HashMap::from([ + ("aws_access_key_id".to_string(), "INITIAL_KEY".to_string()), + ( + "aws_secret_access_key".to_string(), + "INITIAL_SECRET".to_string(), + ), + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + ]); + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + 
let accessor = StorageOptionsAccessor::with_initial_and_provider( + initial.clone(), + mock_provider.clone(), + ); + + // First call uses initial + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "INITIAL_KEY"); + assert_eq!(mock_provider.get_call_count().await, 0); // Provider not called yet + } + + #[tokio::test] + async fn test_caching_and_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); // 10 min expiry + // Use with_initial_and_provider to set custom refresh_offset_millis (5 min = 300000ms) + let now_ms = MockClock::system_time().as_millis() as u64; + let expires_at = now_ms + 600_000; // 10 minutes from now + let initial = HashMap::from([ + (EXPIRES_AT_MILLIS_KEY.to_string(), expires_at.to_string()), + (REFRESH_OFFSET_MILLIS_KEY.to_string(), "300000".to_string()), // 5 min refresh offset + ]); + let accessor = + StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); + + // First call uses initial cached options + let result = accessor.get_storage_options().await.unwrap(); + assert!(result.0.contains_key(EXPIRES_AT_MILLIS_KEY)); + assert_eq!(mock_provider.get_call_count().await, 0); + + // Advance time to 6 minutes - should trigger refresh (within 5 min refresh offset) + MockClock::set_system_time(Duration::from_secs(100_000 + 360)); + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_expired_initial_triggers_refresh() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let now_ms = MockClock::system_time().as_millis() as u64; + let expired_time = now_ms - 1_000; // Expired 1 second ago + + let initial = HashMap::from([ + ("aws_access_key_id".to_string(), 
"EXPIRED_KEY".to_string()), + (EXPIRES_AT_MILLIS_KEY.to_string(), expired_time.to_string()), + ]); + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(600_000))); + + let accessor = + StorageOptionsAccessor::with_initial_and_provider(initial, mock_provider.clone()); + + // Should fetch from provider since initial is expired + let result = accessor.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + assert_eq!(mock_provider.get_call_count().await, 1); + } + + #[tokio::test] + async fn test_accessor_id_with_provider() { + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); + let accessor = StorageOptionsAccessor::with_provider(mock_provider); + + let id = accessor.accessor_id(); + assert!(id.starts_with("MockStorageOptionsProvider")); + } + + #[tokio::test] + async fn test_accessor_id_static() { + let options = HashMap::from([("key".to_string(), "value".to_string())]); + let accessor = StorageOptionsAccessor::with_static_options(options); + + let id = accessor.accessor_id(); + assert!(id.starts_with("static_options_")); + } + + #[tokio::test] + async fn test_concurrent_access() { + // Create a mock provider with far future expiration + let mock_provider = Arc::new(MockStorageOptionsProvider::new(Some(9999999999999))); + + let accessor = Arc::new(StorageOptionsAccessor::with_provider(mock_provider.clone())); + + // Spawn 10 concurrent tasks that all try to get options at the same time + let mut handles = vec![]; + for i in 0..10 { + let acc = accessor.clone(); + let handle = tokio::spawn(async move { + let result = acc.get_storage_options().await.unwrap(); + assert_eq!(result.0.get("aws_access_key_id").unwrap(), "AKID_1"); + i + }); + handles.push(handle); + } + + // Wait for all tasks to complete + let results: Vec<_> = futures::future::join_all(handles) + .await + .into_iter() + .map(|r| r.unwrap()) + .collect(); + + // Verify all 10 tasks completed successfully + 
assert_eq!(results.len(), 10); + + // The provider should have been called exactly once + let call_count = mock_provider.get_call_count().await; + assert_eq!( + call_count, 1, + "Provider should be called exactly once despite concurrent access" + ); + } + + #[tokio::test] + async fn test_no_expiration_never_refreshes() { + MockClock::set_system_time(Duration::from_secs(100_000)); + + let mock_provider = Arc::new(MockStorageOptionsProvider::new(None)); // No expiration + let accessor = StorageOptionsAccessor::with_provider(mock_provider.clone()); + + // First call fetches + accessor.get_storage_options().await.unwrap(); + assert_eq!(mock_provider.get_call_count().await, 1); + + // Advance time significantly + MockClock::set_system_time(Duration::from_secs(200_000)); + + // Should still use cached options + accessor.get_storage_options().await.unwrap(); + assert_eq!(mock_provider.get_call_count().await, 1); + } +} diff --git a/rust/lance-namespace-impls/src/dir.rs b/rust/lance-namespace-impls/src/dir.rs index 4d6a88419ee..875df33e580 100644 --- a/rust/lance-namespace-impls/src/dir.rs +++ b/rust/lance-namespace-impls/src/dir.rs @@ -471,8 +471,11 @@ impl DirectoryNamespaceBuilder { session: &Option>, ) -> Result<(Arc, Path)> { // Build ObjectStoreParams from storage options + let accessor = storage_options.clone().map(|opts| { + Arc::new(lance_io::object_store::StorageOptionsAccessor::with_static_options(opts)) + }); let params = ObjectStoreParams { - storage_options: storage_options.clone(), + storage_options_accessor: accessor, ..Default::default() }; @@ -1262,7 +1265,9 @@ impl LanceNamespace for DirectoryNamespace { }; let store_params = self.storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options(opts.clone()), + )), ..Default::default() }); @@ -3106,15 +3111,10 @@ mod tests { .unwrap(); let reader1 = 
RecordBatchIterator::new(vec![data1].into_iter().map(Ok), schema.clone()); - let dataset = Dataset::write_into_namespace( - reader1, - namespace.clone(), - table_id.clone(), - None, - false, - ) - .await - .unwrap(); + let dataset = + Dataset::write_into_namespace(reader1, namespace.clone(), table_id.clone(), None) + .await + .unwrap(); assert_eq!(dataset.count_rows(None).await.unwrap(), 3); assert_eq!(dataset.version().version, 1); @@ -3140,7 +3140,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_append), - false, ) .await .unwrap(); @@ -3169,7 +3168,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_overwrite), - false, ) .await .unwrap(); diff --git a/rust/lance-namespace-impls/src/dir/manifest.rs b/rust/lance-namespace-impls/src/dir/manifest.rs index bfcb9602b9a..49d19712e26 100644 --- a/rust/lance-namespace-impls/src/dir/manifest.rs +++ b/rust/lance-namespace-impls/src/dir/manifest.rs @@ -982,7 +982,11 @@ impl ManifestNamespace { let write_params = WriteParams { session, store_params: storage_options.as_ref().map(|opts| ObjectStoreParams { - storage_options: Some(opts.clone()), + storage_options_accessor: Some(Arc::new( + lance_io::object_store::StorageOptionsAccessor::with_static_options( + opts.clone(), + ), + )), ..Default::default() }), ..Default::default() diff --git a/rust/lance-namespace-impls/src/rest_adapter.rs b/rust/lance-namespace-impls/src/rest_adapter.rs index 899863793ff..b63331c8a66 100644 --- a/rust/lance-namespace-impls/src/rest_adapter.rs +++ b/rust/lance-namespace-impls/src/rest_adapter.rs @@ -2695,15 +2695,10 @@ mod tests { .unwrap(); let reader1 = RecordBatchIterator::new(vec![data1].into_iter().map(Ok), schema.clone()); - let dataset = Dataset::write_into_namespace( - reader1, - namespace.clone(), - table_id.clone(), - None, - false, - ) - .await - .unwrap(); + let dataset = + Dataset::write_into_namespace(reader1, namespace.clone(), table_id.clone(), None) + .await + .unwrap(); 
assert_eq!(dataset.count_rows(None).await.unwrap(), 3); assert_eq!(dataset.version().version, 1); @@ -2729,7 +2724,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_append), - false, ) .await .unwrap(); @@ -2758,7 +2752,6 @@ mod tests { namespace.clone(), table_id.clone(), Some(params_overwrite), - false, ) .await .unwrap(); diff --git a/rust/lance-table/src/io/commit.rs b/rust/lance-table/src/io/commit.rs index 41cd5b65002..1d20fb72bd2 100644 --- a/rust/lance-table/src/io/commit.rs +++ b/rust/lance-table/src/io/commit.rs @@ -765,20 +765,22 @@ pub async fn commit_handler_from_url( } }; let options = options.clone().unwrap_or_default(); - let storage_options = StorageOptions(options.storage_options.unwrap_or_default()); - let dynamo_endpoint = get_dynamodb_endpoint(&storage_options); - let expires_at_millis = storage_options.expires_at_millis(); - let storage_options = storage_options.as_s3_options(); + let storage_options_raw = + StorageOptions(options.storage_options().cloned().unwrap_or_default()); + let dynamo_endpoint = get_dynamodb_endpoint(&storage_options_raw); + let storage_options = storage_options_raw.as_s3_options(); let region = storage_options.get(&AmazonS3ConfigKey::Region).cloned(); + // Get accessor from the options + let accessor = options.get_accessor(); + let (aws_creds, region) = build_aws_credential( options.s3_credentials_refresh_offset, options.aws_credentials.clone(), Some(&storage_options), region, - options.storage_options_provider.clone(), - expires_at_millis, + accessor, ) .await?; diff --git a/rust/lance/src/dataset.rs b/rust/lance/src/dataset.rs index 7249594783d..7565b96b434 100644 --- a/rust/lance/src/dataset.rs +++ b/rust/lance/src/dataset.rs @@ -35,7 +35,8 @@ use lance_file::reader::FileReaderOptions; use lance_file::version::LanceFileVersion; use lance_index::DatasetIndexExt; use lance_io::object_store::{ - LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, + LanceNamespaceStorageOptionsProvider, 
ObjectStore, ObjectStoreParams, StorageOptions, + StorageOptionsAccessor, StorageOptionsProvider, }; use lance_io::utils::{read_last_block, read_message, read_metadata_offset, read_struct}; use lance_namespace::LanceNamespace; @@ -813,15 +814,11 @@ impl Dataset { /// * `namespace` - The namespace to use for table management /// * `table_id` - The table identifier /// * `params` - Write parameters - /// * `ignore_namespace_table_storage_options` - If true, ignore storage options returned - /// by the namespace and only use the storage options in params. The storage options - /// provider will not be created, so credentials will not be automatically refreshed. pub async fn write_into_namespace( batches: impl RecordBatchReader + Send + 'static, namespace: Arc, table_id: Vec, mut params: Option, - ignore_namespace_table_storage_options: bool, ) -> Result { let mut write_params = params.take().unwrap_or_default(); @@ -870,28 +867,30 @@ impl Dataset { location: location!(), })?; - // Set initial credentials and provider unless ignored - if !ignore_namespace_table_storage_options { - if let Some(namespace_storage_options) = response.storage_options { - let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( - namespace, table_id, - )); + // Set initial credentials and provider from namespace + if let Some(namespace_storage_options) = response.storage_options { + let provider: Arc = Arc::new( + LanceNamespaceStorageOptionsProvider::new(namespace, table_id), + ); - // Merge namespace storage options with any existing options - let mut merged_options = write_params - .store_params - .as_ref() - .and_then(|p| p.storage_options.clone()) - .unwrap_or_default(); - merged_options.extend(namespace_storage_options); - - let existing_params = write_params.store_params.take().unwrap_or_default(); - write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), - ..existing_params - }); - } + // 
Merge namespace storage options with any existing options + let mut merged_options = write_params + .store_params + .as_ref() + .and_then(|p| p.storage_options().cloned()) + .unwrap_or_default(); + merged_options.extend(namespace_storage_options); + + let accessor = Arc::new(StorageOptionsAccessor::with_initial_and_provider( + merged_options, + provider, + )); + + let existing_params = write_params.store_params.take().unwrap_or_default(); + write_params.store_params = Some(ObjectStoreParams { + storage_options_accessor: Some(accessor), + ..existing_params + }); } Self::write(batches, uri.as_str(), Some(write_params)).await @@ -917,29 +916,32 @@ impl Dataset { location: location!(), })?; - // Set initial credentials and provider unless ignored - if !ignore_namespace_table_storage_options { - if let Some(namespace_storage_options) = response.storage_options { - let provider = Arc::new(LanceNamespaceStorageOptionsProvider::new( + // Set initial credentials and provider from namespace + if let Some(namespace_storage_options) = response.storage_options { + let provider: Arc = + Arc::new(LanceNamespaceStorageOptionsProvider::new( namespace.clone(), table_id.clone(), )); - // Merge namespace storage options with any existing options - let mut merged_options = write_params - .store_params - .as_ref() - .and_then(|p| p.storage_options.clone()) - .unwrap_or_default(); - merged_options.extend(namespace_storage_options); - - let existing_params = write_params.store_params.take().unwrap_or_default(); - write_params.store_params = Some(ObjectStoreParams { - storage_options: Some(merged_options), - storage_options_provider: Some(provider), - ..existing_params - }); - } + // Merge namespace storage options with any existing options + let mut merged_options = write_params + .store_params + .as_ref() + .and_then(|p| p.storage_options().cloned()) + .unwrap_or_default(); + merged_options.extend(namespace_storage_options); + + let accessor = 
Arc::new(StorageOptionsAccessor::with_initial_and_provider( + merged_options, + provider, + )); + + let existing_params = write_params.store_params.take().unwrap_or_default(); + write_params.store_params = Some(ObjectStoreParams { + storage_options_accessor: Some(accessor), + ..existing_params + }); } // For APPEND/OVERWRITE modes, we must open the existing dataset first @@ -947,11 +949,8 @@ impl Dataset { // assumes no dataset exists and converts the mode to CREATE. let mut builder = DatasetBuilder::from_uri(uri.as_str()); if let Some(ref store_params) = write_params.store_params { - if let Some(ref storage_options) = store_params.storage_options { - builder = builder.with_storage_options(storage_options.clone()); - } - if let Some(ref provider) = store_params.storage_options_provider { - builder = builder.with_storage_options_provider(provider.clone()); + if let Some(accessor) = &store_params.storage_options_accessor { + builder = builder.with_storage_options_accessor(accessor.clone()); } } let dataset = Arc::new(builder.load().await?); @@ -1612,11 +1611,22 @@ impl Dataset { &self.object_store } - /// Returns the storage options used when opening this dataset, if any. + /// Returns the initial storage options used when opening this dataset, if any. + /// + /// This returns the static initial options without triggering any refresh. + /// For the latest refreshed options, use [`Self::latest_storage_options`]. + #[deprecated(since = "0.25.0", note = "Use initial_storage_options() instead")] pub fn storage_options(&self) -> Option<&HashMap> { + self.initial_storage_options() + } + + /// Returns the initial storage options without triggering any refresh. + /// + /// For the latest refreshed options, use [`Self::latest_storage_options`]. 
+ pub fn initial_storage_options(&self) -> Option<&HashMap> { self.store_params .as_ref() - .and_then(|params| params.storage_options.as_ref()) + .and_then(|params| params.storage_options()) } /// Returns the storage options provider used when opening this dataset, if any. @@ -1625,7 +1635,42 @@ impl Dataset { ) -> Option> { self.store_params .as_ref() - .and_then(|params| params.storage_options_provider.clone()) + .and_then(|params| params.storage_options_accessor.as_ref()) + .and_then(|accessor| accessor.provider().cloned()) + } + + /// Returns the unified storage options accessor for this dataset, if any. + /// + /// The accessor handles both static and dynamic storage options with automatic + /// caching and refresh. Use [`StorageOptionsAccessor::get_storage_options`] to + /// get the latest options. + pub fn storage_options_accessor(&self) -> Option> { + self.store_params + .as_ref() + .and_then(|params| params.get_accessor()) + } + + /// Returns the latest (possibly refreshed) storage options. + /// + /// If a dynamic storage options provider is configured, this will return + /// the cached options if still valid, or fetch fresh options if expired. + /// + /// For the initial static options without refresh, use [`Self::initial_storage_options`].
+ /// + /// # Returns + /// + /// - `Ok(Some(options))` - Storage options are available (static or refreshed) + /// - `Ok(None)` - No storage options were configured for this dataset + /// - `Err(...)` - Error occurred while fetching/refreshing options from provider + pub async fn latest_storage_options(&self) -> Result> { + // First check if we have an accessor (handles both static and dynamic options) + if let Some(accessor) = self.storage_options_accessor() { + let options = accessor.get_storage_options().await?; + return Ok(Some(options)); + } + + // Fallback to initial storage options if no accessor + Ok(self.initial_storage_options().cloned().map(StorageOptions)) } pub fn data_dir(&self) -> Path { diff --git a/rust/lance/src/dataset/builder.rs b/rust/lance/src/dataset/builder.rs index 6bac1b553a3..8ee5ffa5e41 100644 --- a/rust/lance/src/dataset/builder.rs +++ b/rust/lance/src/dataset/builder.rs @@ -12,7 +12,7 @@ use lance_file::datatypes::populate_schema_dictionary; use lance_file::reader::FileReaderOptions; use lance_io::object_store::{ LanceNamespaceStorageOptionsProvider, ObjectStore, ObjectStoreParams, StorageOptions, - DEFAULT_CLOUD_IO_PARALLELISM, + StorageOptionsAccessor, DEFAULT_CLOUD_IO_PARALLELISM, }; use lance_namespace::models::DescribeTableRequest; use lance_namespace::LanceNamespace; @@ -95,8 +95,6 @@ impl DatasetBuilder { /// # Arguments /// * `namespace` - The namespace implementation to fetch table info from /// * `table_id` - The table identifier (e.g., vec!["my_table"]) - /// * `ignore_namespace_table_storage_options` - If true, storage options returned from - /// the namespace's `describe_table()` will be ignored (treated as None). Defaults to false. /// /// # Example /// ```ignore @@ -111,28 +109,17 @@ impl DatasetBuilder { /// /// // Load a dataset using storage options from namespace /// let dataset = DatasetBuilder::from_namespace( - /// namespace.clone(), - /// vec!["my_table".to_string()], - /// false, - /// ) - /// .await? 
- /// .load() - /// .await?; - /// - /// // Load a dataset ignoring namespace storage options - /// let dataset = DatasetBuilder::from_namespace( /// namespace, /// vec!["my_table".to_string()], - /// true, /// ) /// .await? /// .load() /// .await?; /// ``` + #[allow(deprecated)] pub async fn from_namespace( namespace: Arc, table_id: Vec, - ignore_namespace_table_storage_options: bool, ) -> Result { let request = DescribeTableRequest { id: Some(table_id.clone()), @@ -156,17 +143,17 @@ impl DatasetBuilder { let mut builder = Self::from_uri(table_uri); - let namespace_storage_options = if ignore_namespace_table_storage_options { - None - } else { - response.storage_options - }; + // Use namespace storage options if available + let namespace_storage_options = response.storage_options; builder.storage_options_override = namespace_storage_options.clone(); - if namespace_storage_options.is_some() { - builder.options.storage_options_provider = Some(Arc::new( + if let Some(initial_opts) = namespace_storage_options { + let provider: Arc = Arc::new( LanceNamespaceStorageOptionsProvider::new(namespace, table_id), + ); + builder.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(initial_opts, provider), )); } @@ -289,7 +276,27 @@ impl DatasetBuilder { /// - [S3 options](https://docs.rs/object_store/latest/object_store/aws/enum.AmazonS3ConfigKey.html#variants) /// - [Google options](https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants) pub fn with_storage_options(mut self, storage_options: HashMap) -> Self { - self.options.storage_options = Some(storage_options); + // Merge with existing options if accessor exists, otherwise create new static accessor + if let Some(existing) = self.options.storage_options_accessor.take() { + let mut merged = existing + .initial_storage_options() + .cloned() + .unwrap_or_default(); + merged.extend(storage_options); + if let Some(provider) = 
existing.provider().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(merged, provider), + )); + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(merged), + )); + } + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(storage_options), + )); + } self } @@ -301,9 +308,25 @@ impl DatasetBuilder { /// .with_storage_option("region", "us-east-1"); /// ``` pub fn with_storage_option(mut self, key: impl AsRef, value: impl AsRef) -> Self { - let mut storage_options = self.options.storage_options.unwrap_or_default(); + let mut storage_options = self.options.storage_options().cloned().unwrap_or_default(); storage_options.insert(key.as_ref().to_string(), value.as_ref().to_string()); - self.options.storage_options = Some(storage_options); + + // Merge with existing accessor if present + if let Some(existing) = self.options.storage_options_accessor.take() { + if let Some(provider) = existing.provider().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(storage_options, provider), + )); + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(storage_options), + )); + } + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(storage_options), + )); + } self } @@ -355,7 +378,50 @@ impl DatasetBuilder { mut self, provider: Arc, ) -> Self { - self.options.storage_options_provider = Some(provider); + // Preserve existing storage options if any + if let Some(existing) = self.options.storage_options_accessor.take() { + if let Some(initial) = existing.initial_storage_options().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(initial, provider), 
+ )); + } else { + self.options.storage_options_accessor = + Some(Arc::new(StorageOptionsAccessor::with_provider(provider))); + } + } else { + self.options.storage_options_accessor = + Some(Arc::new(StorageOptionsAccessor::with_provider(provider))); + } + self + } + + /// Set a unified storage options accessor for credential management + /// + /// The accessor bundles static storage options with an optional dynamic provider, + /// handling all caching and refresh logic internally. + /// + /// # Arguments + /// * `accessor` - The storage options accessor + /// + /// # Example + /// ```ignore + /// use std::sync::Arc; + /// use lance_io::object_store::StorageOptionsAccessor; + /// + /// // Create an accessor with a dynamic provider; the accessor + /// // handles credential caching and refresh internally, so no + /// // refresh interval needs to be supplied here. + /// let accessor = Arc::new( + /// StorageOptionsAccessor::with_provider(provider), + /// ); + /// + /// let dataset = DatasetBuilder::from_uri("s3://bucket/table.lance") + /// .with_storage_options_accessor(accessor) + /// .load() + /// .await?; + /// ``` + pub fn with_storage_options_accessor(mut self, accessor: Arc) -> Self { + self.options.storage_options_accessor = Some(accessor); self } @@ -418,8 +484,8 @@ impl DatasetBuilder { let storage_options = self .options - .storage_options - .clone() + .storage_options() + .cloned() .map(StorageOptions::new) .unwrap_or_default(); let download_retry_count = storage_options.download_retry_count(); @@ -478,12 +544,29 @@ impl DatasetBuilder { } async fn load_impl(mut self) -> Result { - // Apply storage_options_override last to ensure namespace options take precedence + // Apply storage_options_override to merge namespace options with any existing accessor if let Some(override_opts) = self.storage_options_override.take() { - let mut merged_opts = self.options.storage_options.clone().unwrap_or_default(); + // Get existing options and merge + let mut merged_opts =
self.options.storage_options().cloned().unwrap_or_default(); // Override with namespace storage options - they take precedence merged_opts.extend(override_opts); - self.options.storage_options = Some(merged_opts); + + // Update accessor with merged options + if let Some(accessor) = &self.options.storage_options_accessor { + if let Some(provider) = accessor.provider().cloned() { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_initial_and_provider(merged_opts, provider), + )); + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(merged_opts), + )); + } + } else { + self.options.storage_options_accessor = Some(Arc::new( + StorageOptionsAccessor::with_static_options(merged_opts), + )); + } } let session = match self.session.as_ref() { diff --git a/rust/lance/src/dataset/fragment/write.rs b/rust/lance/src/dataset/fragment/write.rs index b4e96ccbe27..cf7361b5878 100644 --- a/rust/lance/src/dataset/fragment/write.rs +++ b/rust/lance/src/dataset/fragment/write.rs @@ -287,12 +287,12 @@ impl<'a> FragmentCreateBuilder<'a> { async fn existing_dataset_schema(&self) -> Result> { let mut builder = DatasetBuilder::from_uri(self.dataset_uri); - let storage_options = self + let accessor = self .write_params .and_then(|p| p.store_params.as_ref()) - .and_then(|p| p.storage_options.clone()); - if let Some(storage_options) = storage_options { - builder = builder.with_storage_options(storage_options); + .and_then(|p| p.storage_options_accessor.clone()); + if let Some(accessor) = accessor { + builder = builder.with_storage_options_accessor(accessor); } match builder.load().await { Ok(dataset) => { diff --git a/rust/lance/src/io.rs b/rust/lance/src/io.rs index 1ad45ce2d68..1113ef0a2a7 100644 --- a/rust/lance/src/io.rs +++ b/rust/lance/src/io.rs @@ -9,6 +9,9 @@ pub mod exec; pub use lance_io::{ bytes_read_counter, iops_counter, - object_store::{ObjectStore, ObjectStoreParams, 
ObjectStoreRegistry, WrappingObjectStore}, + object_store::{ + ObjectStore, ObjectStoreParams, ObjectStoreRegistry, StorageOptionsAccessor, + WrappingObjectStore, + }, stream::RecordBatchStream, }; diff --git a/rust/lance/src/io/commit/s3_test.rs b/rust/lance/src/io/commit/s3_test.rs index 35e64703688..1402fb25d46 100644 --- a/rust/lance/src/io/commit/s3_test.rs +++ b/rust/lance/src/io/commit/s3_test.rs @@ -8,7 +8,7 @@ use crate::{ dataset::{ builder::DatasetBuilder, CommitBuilder, InsertBuilder, ReadParams, WriteMode, WriteParams, }, - io::ObjectStoreParams, + io::{ObjectStoreParams, StorageOptionsAccessor}, }; use aws_config::{BehaviorVersion, ConfigLoader, Region, SdkConfig}; use aws_sdk_s3::{config::Credentials, Client as S3Client}; @@ -186,12 +186,12 @@ async fn test_concurrent_writers() { // Create a table let store_params = ObjectStoreParams { object_store_wrapper: Some(io_tracker.clone()), - storage_options: Some( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( CONFIG .iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(), - ), + ))), ..Default::default() }; let write_params = WriteParams { @@ -270,12 +270,12 @@ async fn test_ddb_open_iops() { // Create a table let store_params = ObjectStoreParams { object_store_wrapper: Some(io_tracker.clone()), - storage_options: Some( + storage_options_accessor: Some(Arc::new(StorageOptionsAccessor::with_static_options( CONFIG .iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(), - ), + ))), ..Default::default() }; let write_params = WriteParams {