lance-format · westonpace · Feb 17, 2026 · Feb 15, 2026 · Feb 15, 2026 · Feb 16, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/protos/filtered_read.proto b/protos/filtered_read.proto
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.datafusion;
+
+import "table_identifier.proto";
+
+message U64Range { // A start/end pair of unsigned 64-bit bounds.
+  uint64 start = 1; // NOTE(review): presumably inclusive - confirm against the Rust range type
+  uint64 end = 2; // NOTE(review): presumably exclusive - confirm against the Rust range type
+}
+
+message ProjectionProto { // Projection settings carried inside FilteredReadOptionsProto.
+  repeated int32 field_ids = 1; // NOTE(review): int32 here vs uint32 in FieldIdSet.field_ids - confirm
+  bool with_row_id = 2; // NOTE(review): the four with_* flags presumably request the named column - confirm
+  bool with_row_addr = 3;
+  bool with_row_last_updated_at_version = 4;
+  bool with_row_created_at_version = 5;
+  BlobHandlingProto blob_handling = 6; // see BlobHandlingProto for the available modes
+}
+
+message BlobHandlingProto { // Selects how blob / binary columns are returned by a read.
+  oneof mode { // at most one member is set; NOTE(review): the bool values look like pure markers - confirm
+    // All blobs read as binary
+    bool all_binary = 1;
+    // Blobs as descriptions, other binary as binary (default)
+    bool blobs_descriptions = 2;
+    // All binary columns as descriptions
+    bool all_descriptions = 3;
+    // Specific blobs read as binary, rest as descriptions (non-blob binary stays binary)
+    FieldIdSet some_blobs_binary = 4;
+    // Specific columns as binary, all other binary as descriptions
+    FieldIdSet some_binary = 5;
+  }
+}
+
+message FieldIdSet { // Wraps a repeated field so a list of ids can be a oneof member (see BlobHandlingProto).
+  repeated uint32 field_ids = 1;
+}
+
+message FilteredReadThreadingModeProto { // Threading mode stored in FilteredReadOptionsProto.threading_mode.
+  oneof mode { // at most one member is set
+    uint64 one_partition_multiple_threads = 1; // NOTE(review): presumably the thread count - confirm
+    uint64 multiple_partitions = 2; // NOTE(review): presumably the partition count - confirm
+  }
+}
+
+// Serializable form of FilteredReadOptions. Fields marked `optional` carry explicit presence.
+message FilteredReadOptionsProto {
+  optional U64Range scan_range_before_filter = 1;
+  optional U64Range scan_range_after_filter = 2;
+  bool with_deleted_rows = 3;
+  optional uint32 batch_size = 4;
+  optional uint64 fragment_readahead = 5;
+  repeated uint64 fragment_ids = 6; // NOTE(review): `repeated` cannot tell unset from empty - confirm that is intended
+  ProjectionProto projection = 7;
+  optional bytes refine_filter_substrait = 8; // Substrait-encoded filter; decoded using filter_schema_ipc
+  optional bytes full_filter_substrait = 9; // Substrait-encoded filter; decoded using filter_schema_ipc
+  FilteredReadThreadingModeProto threading_mode = 10;
+  optional uint64 io_buffer_size_bytes = 11;
+  // Arrow IPC schema for decoding Substrait filters (may be wider than projection).
+  optional bytes filter_schema_ipc = 12;
+}
+
+// Serializable form of FilteredReadPlan (planned/distributed mode).
+// RowAddrTreeMap serialized via its built-in serialize_into/deserialize_from.
+// Per-fragment filters are Substrait-encoded and deduplicated.
+message FilteredReadPlanProto {
+  bytes row_addr_tree_map = 1; // bytes written by RowAddrTreeMap::serialize_into
+  optional U64Range scan_range_after_filter = 2;
+  // Arrow IPC schema for decoding Substrait filters (matches the schema used at encode time).
+  optional bytes filter_schema_ipc = 3;
+  // Deduplicated filter storage: frag_id → index into filter_expressions.
+  map<uint32, uint32> fragment_filter_ids = 4; // NOTE(review): uint32 keys, but FilteredReadOptionsProto.fragment_ids is uint64 - confirm
+  // Unique Substrait-encoded filter expressions (indexed by fragment_filter_ids values).
+  repeated bytes filter_expressions = 5;
+}
+
+// Top-level wrapper for FilteredReadExec serialization.
+message FilteredReadExecProto {
+  TableIdentifier table = 1; // identifies the dataset to reconstruct (see table_identifier.proto)
+  FilteredReadOptionsProto options = 2;
+  optional FilteredReadPlanProto plan = 3; // NOTE(review): presumably unset when no plan was pre-computed - confirm
+  // index_input (child plan) handled by DataFusion's codec via inputs[]
+}
diff --git a/protos/table_identifier.proto b/protos/table_identifier.proto
@@ -0,0 +1,18 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+syntax = "proto3";
+
+package lance.datafusion;
+
+// Identifies a Lance dataset for remote reconstruction.
+//
+// Two modes:
+//   1. uri + serialized_manifest (fast): remote executor skips manifest read.
+//   2. uri + version + etag (lightweight): remote executor loads manifest from storage.
+message TableIdentifier {
+  string uri = 1; // used by both modes
+  uint64 version = 2; // mode 2; NOTE(review): not `optional`, so unset reads as 0 - confirm mode 1 ignores it
+  optional string manifest_etag = 3; // mode 2
+  optional bytes serialized_manifest = 4; // mode 1; when present the manifest read is skipped
+}
diff --git a/python/Cargo.lock b/python/Cargo.lock
diff --git a/rust/lance-core/src/datatypes/schema.rs b/rust/lance-core/src/datatypes/schema.rs
@@ -1024,7 +1024,7 @@ impl Projectable for Schema {
 }
 
 /// Specifies how to handle blob columns when projecting
-#[derive(Debug, Clone, Default)]
+#[derive(Debug, Clone, Default, PartialEq)]
 pub enum BlobHandling {
     /// Read all blobs as binary
     AllBinary,

diff --git a/rust/lance-datafusion/Cargo.toml b/rust/lance-datafusion/Cargo.toml
@@ -36,11 +36,16 @@ snafu.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 
+[build-dependencies]
+prost-build.workspace = true
+protobuf-src = {version = "2.1", optional = true}
+
 [dev-dependencies]
 lance-datagen.workspace = true
 
 [features]
 substrait = ["dep:datafusion-substrait"]
+protoc = ["dep:protobuf-src"]
 
 [lints]
 workspace = true
diff --git a/rust/lance-datafusion/build.rs b/rust/lance-datafusion/build.rs
@@ -0,0 +1,25 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use std::io::Result;
+
+fn main() -> Result<()> { // Build script: compiles the .proto files into Rust types with prost-build.
+    println!("cargo:rerun-if-changed=protos"); // `protos` is the directory passed to compile_protos below
+
+    // With the `protoc` feature, point prost-build at the protoc binary from `protobuf-src`.
+    #[cfg(feature = "protoc")]
+    std::env::set_var("PROTOC", protobuf_src::protoc());
+
+    let mut prost_build = prost_build::Config::new();
+    prost_build.protoc_arg("--experimental_allow_proto3_optional"); // the protos use proto3 `optional`
+    prost_build.enable_type_names();
+    prost_build.compile_protos(
+        &[
+            "./protos/table_identifier.proto",
+            "./protos/filtered_read.proto",
+        ],
+        &["./protos"], // include root, so `import "table_identifier.proto"` resolves
+    )?;
+
+    Ok(())
+}
diff --git a/rust/lance-datafusion/protos b/rust/lance-datafusion/protos
@@ -0,0 +1 @@
+../../protos
diff --git a/rust/lance-datafusion/src/lib.rs b/rust/lance-datafusion/src/lib.rs
@@ -10,6 +10,17 @@ pub mod expr;
 pub mod logical_expr;
 pub mod planner;
 pub mod projection;
+pub mod pb { // Rust types generated from the `lance.datafusion` protobuf package.
+    #![allow(clippy::all)] // the allows in this module silence lints on generated code we do not hand-edit
+    #![allow(non_upper_case_globals)]
+    #![allow(non_camel_case_types)]
+    #![allow(non_snake_case)]
+    #![allow(unused)]
+    #![allow(improper_ctypes)]
+    #![allow(clippy::upper_case_acronyms)]
+    #![allow(clippy::use_self)]
+    include!(concat!(env!("OUT_DIR"), "/lance.datafusion.rs")); // file emitted by build.rs into OUT_DIR
+}
 pub mod spill;
 pub mod sql;
 #[cfg(feature = "substrait")]

diff --git a/rust/lance-datafusion/src/substrait.rs b/rust/lance-datafusion/src/substrait.rs
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
-use arrow_schema::Schema as ArrowSchema;
+use arrow_schema::{DataType, Schema as ArrowSchema};
 use datafusion::{execution::SessionState, logical_expr::Expr};
 
 use crate::aggregate::Aggregate;
@@ -27,6 +27,31 @@ use snafu::location;
 use std::collections::HashMap;
 use std::sync::Arc;
 
+/// Substrait doesn't yet support all data types; checks `data_type` and its list/struct children.
+fn is_substrait_compatible(data_type: &DataType) -> bool {
+    match data_type {
+        DataType::Null | DataType::FixedSizeList(_, _) | DataType::Float16 => false,
+        DataType::List(f) | DataType::LargeList(f) => is_substrait_compatible(f.data_type()),
+        DataType::Struct(fields) => fields
+            .iter()
+            .all(|f| is_substrait_compatible(f.data_type())),
+        _ => true, // everything else is assumed to be supported
+    }
+}
+
+/// Builds a new schema holding only the top-level fields Substrait can serialize.
+/// A field is dropped whenever `is_substrait_compatible` rejects its data type.
+/// NOTE(review): built via `ArrowSchema::new`, so schema metadata is presumably dropped - confirm.
+pub fn prune_schema_for_substrait(schema: &ArrowSchema) -> ArrowSchema {
+    let supported: Vec<_> = schema
+        .fields()
+        .iter()
+        .filter(|field| is_substrait_compatible(field.data_type()))
+        .cloned()
+        .collect();
+    ArrowSchema::new(supported)
+}
+
 /// Convert a DF Expr into a Substrait ExtendedExpressions message
 ///
 /// The schema needs to contain all of the fields that are referenced in the expression.

diff --git a/rust/lance/src/io/exec.rs b/rust/lance/src/io/exec.rs
@@ -7,6 +7,8 @@
 
 mod filter;
 pub mod filtered_read;
+#[cfg(feature = "substrait")]
+pub mod filtered_read_proto;
 pub mod fts;
 pub(crate) mod knn;
 mod optimizer;

diff --git a/rust/lance/src/io/exec/filtered_read.rs b/rust/lance/src/io/exec/filtered_read.rs
@@ -1751,6 +1751,11 @@ impl FilteredReadExec {
     pub fn index_input(&self) -> Option<&Arc<dyn ExecutionPlan>> {
         self.index_input.as_ref()
     }
+
+    /// Yields the pre-computed plan when one is already stored; never triggers initialization.
+    pub fn plan(&self) -> Option<FilteredReadPlan> {
+        Some(self.plan.get()?.to_external_plan())
+    }
 }
 
 impl DisplayAs for FilteredReadExec {