Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -2528,7 +2528,7 @@ def create_scalar_index(
)

column = column[0]
lance_field = self._ds.lance_schema.field(column)
lance_field = self._ds.lance_schema.field_case_insensitive(column)
if lance_field is None:
raise KeyError(f"{column} not found in schema")

Expand Down Expand Up @@ -2816,7 +2816,7 @@ def create_index(

# validate args
for c in column:
lance_field = self._ds.lance_schema.field(c)
lance_field = self._ds.lance_schema.field_case_insensitive(c)
if lance_field is None:
raise KeyError(f"{c} not found in schema")
field = lance_field.to_arrow()
Expand Down Expand Up @@ -4697,7 +4697,7 @@ def nearest(
) -> ScannerBuilder:
q, q_dim = _coerce_query_vector(q)

lance_field = self.ds._ds.lance_schema.field(column)
lance_field = self.ds._ds.lance_schema.field_case_insensitive(column)
if lance_field is None:
raise ValueError(f"Embedding column {column} is not in the dataset")

Expand Down
593 changes: 593 additions & 0 deletions python/python/tests/test_column_names.py

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions python/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,22 @@ impl LanceSchema {
pub fn field(&self, name: &str) -> PyResult<Option<LanceField>> {
Ok(self.0.field(name).map(|f| LanceField(f.clone())))
}

/// Get a field by name or path with case-insensitive matching.
///
/// This first tries an exact match, then falls back to case-insensitive matching.
/// Returns the actual field from the schema (preserving original case).
///
/// For nested fields, use dot notation (e.g., "parent.child").
/// Field names containing dots must be quoted with backticks (e.g., "parent.`child.with.dot`").
///
/// Returns None if the field is not found.
pub fn field_case_insensitive(&self, name: &str) -> PyResult<Option<LanceField>> {
Ok(self
.0
.field_case_insensitive(name)
.map(|f| LanceField(f.clone())))
}
}

pub(crate) fn logical_arrow_schema(schema: &ArrowSchema) -> ArrowSchema {
Expand Down
27 changes: 27 additions & 0 deletions rust/lance-core/src/datatypes/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,33 @@ impl Field {
}
}

/// Case-insensitive counterpart of resolve.
///
/// Pushes `self` onto `fields`, then consumes `split` from the front one
/// segment at a time, descending into the matching child and pushing it as
/// well. For every segment an exact name match is preferred; if there is
/// none, the first child whose name is equal ignoring ASCII case is used.
///
/// Returns `true` once all segments have been consumed and `false` as soon
/// as a segment matches no child. On failure `fields` still holds the
/// entries pushed up to that point.
pub(crate) fn resolve_case_insensitive<'a>(
    &'a self,
    split: &mut VecDeque<&str>,
    fields: &mut Vec<&'a Self>,
) -> bool {
    let mut current: &'a Self = self;
    fields.push(current);

    while let Some(segment) = split.pop_front() {
        // Exact match wins; otherwise fall back to ignoring ASCII case.
        let exact = current.children.iter().find(|child| child.name == segment);
        let matched = exact.or_else(|| {
            current
                .children
                .iter()
                .find(|child| child.name.eq_ignore_ascii_case(segment))
        });

        match matched {
            Some(child) => {
                fields.push(child);
                current = child;
            }
            None => return false,
        }
    }

    true
}

pub(crate) fn do_intersection(&self, other: &Self, ignore_types: bool) -> Result<Self> {
if self.name != other.name {
return Err(Error::Arrow {
Expand Down
72 changes: 67 additions & 5 deletions rust/lance-core/src/datatypes/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,62 @@ impl Schema {
self.resolve(name).and_then(|fields| fields.last().copied())
}

/// Look up a field by its path, matching names case-insensitively when needed.
///
/// Resolution is delegated to `resolve_case_insensitive`: an exact match is
/// preferred and a case-insensitive match is the fallback. The last field on
/// the resolved path is returned as stored in the schema (original casing
/// preserved), or `None` when the path does not resolve.
///
/// The path is split by `parse_field_path`, so field names containing dots
/// must be quoted in the form that parser accepts.
pub fn field_case_insensitive(&self, name: &str) -> Option<&Field> {
    let path = self.resolve_case_insensitive(name)?;
    path.last().copied()
}

/// Resolve a string column reference into the chain of fields it names,
/// matching case-insensitively when no exact match exists.
///
/// At every level an exact name match is tried first and an ASCII
/// case-insensitive match second. The fields returned are the ones stored in
/// the schema, so their original casing is preserved.
///
/// Returns `None` when the path cannot be parsed, is empty, or any segment
/// matches no field.
pub fn resolve_case_insensitive(&self, column: impl AsRef<str>) -> Option<Vec<&Field>> {
    let segments = parse_field_path(column.as_ref()).ok()?;
    // An empty path resolves to nothing.
    let (head, tail) = segments.split_first()?;

    // Top-level lookup: exact match first, then ignoring ASCII case.
    let root = self
        .fields
        .iter()
        .find(|f| f.name == *head)
        .or_else(|| {
            self.fields
                .iter()
                .find(|f| f.name.eq_ignore_ascii_case(head))
        })?;

    // The field-level resolver pushes `root` itself and then walks the
    // remaining segments; with no remaining segments it yields just `[root]`.
    let mut remaining: VecDeque<&str> = tail.iter().map(String::as_str).collect();
    let mut path = Vec::with_capacity(segments.len());
    let resolved = root.resolve_case_insensitive(&mut remaining, &mut path);
    resolved.then_some(path)
}

// TODO: This is not a public API, change to pub(crate) after refactor is done.
pub fn field_id(&self, column: &str) -> Result<i32> {
self.field(column)
Expand Down Expand Up @@ -1443,17 +1499,23 @@ pub fn parse_field_path(path: &str) -> Result<Vec<String>> {
Ok(result)
}

/// Format a field path, quoting field names that contain dots or backticks.
/// Format a field path, quoting field names that require escaping.
///
/// Field names are quoted if they contain any character that is not alphanumeric
/// or underscore, to ensure safe SQL parsing.
///
/// For example: ["parent", "child.with.dot"] formats to “parent.`child.with.dot`”
/// For example: ["parent", "child.with.dot"] formats to "parent.`child.with.dot`"
/// For example: ["meta-data", "user-id"] formats to "`meta-data`.`user-id`"
/// Backticks in field names are escaped by doubling them.
/// For example: ["field`with`backticks"] formats to `field``with``backticks`
/// For example: ["field`with`backticks"] formats to "`field``with``backticks`"
pub fn format_field_path(fields: &[&str]) -> String {
fields
.iter()
.map(|field| {
if field.contains('.') || field.contains('`') {
// Quote this field
// Quote if the field contains any non-identifier character
// (i.e., anything other than alphanumeric or underscore)
let needs_quoting = field.chars().any(|c| !c.is_alphanumeric() && c != '_');
if needs_quoting {
// Escape backticks by doubling them (PostgreSQL style)
let escaped = field.replace('`', "``");
format!("`{}`", escaped)
Expand Down
37 changes: 30 additions & 7 deletions rust/lance-datafusion/src/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ use datafusion::sql::sqlparser::ast::{
};
use datafusion::{
common::Column,
logical_expr::{col, Between, BinaryExpr, Like, Operator},
logical_expr::{Between, BinaryExpr, Like, Operator},
physical_expr::execution_props::ExecutionProps,
physical_plan::PhysicalExpr,
prelude::Expr,
Expand Down Expand Up @@ -252,6 +252,23 @@ impl Planner {
self
}

/// Map a user-supplied column name onto the name stored in the schema.
///
/// If the schema has a field with exactly this name, the name is returned
/// unchanged. Otherwise the first field whose name is equal ignoring ASCII
/// case supplies the result. When nothing matches, the input is returned
/// as-is (it might be a computed column, system column, etc.).
fn resolve_column_name(&self, name: &str) -> String {
    // Only look for a case-insensitive candidate when the exact lookup fails.
    if self.schema.field_with_name(name).is_err() {
        let candidate = self
            .schema
            .fields()
            .iter()
            .find(|field| field.name().eq_ignore_ascii_case(name));
        if let Some(field) = candidate {
            return field.name().clone();
        }
    }
    // Exact match, or not present in the schema at all.
    name.to_string()
}

fn column(&self, idents: &[Ident]) -> Expr {
fn handle_remaining_idents(expr: &mut Expr, idents: &[Ident]) {
for ident in idents {
Expand All @@ -268,14 +285,16 @@ impl Planner {
if self.enable_relations && idents.len() > 1 {
// Create qualified column reference (relation.column)
let relation = &idents[0].value;
let column_name = &idents[1].value;
let column = Expr::Column(Column::new(Some(relation.clone()), column_name.clone()));
let column_name = self.resolve_column_name(&idents[1].value);
let column = Expr::Column(Column::new(Some(relation.clone()), column_name));
let mut result = column;
handle_remaining_idents(&mut result, &idents[2..]);
result
} else {
// Default behavior - treat as struct field access
let mut column = col(&idents[0].value);
// Use resolved column name to handle case-insensitive matching
let resolved_name = self.resolve_column_name(&idents[0].value);
let mut column = Expr::Column(Column::from_name(resolved_name));
handle_remaining_idents(&mut column, &idents[1..]);
column
}
Expand Down Expand Up @@ -842,10 +861,14 @@ impl Planner {
/// Note: the returned expression must be passed through `optimize_filter()`
/// before being passed to `create_physical_expr()`.
pub fn parse_expr(&self, expr: &str) -> Result<Expr> {
if self.schema.field_with_name(expr).is_ok() {
return Ok(col(expr));
// First check if it's a simple column reference (no operators, functions, etc.)
// resolve_column_name tries exact match first, then falls back to case-insensitive
let resolved_name = self.resolve_column_name(expr);
if self.schema.field_with_name(&resolved_name).is_ok() {
return Ok(Expr::Column(Column::from_name(resolved_name)));
}

// Parse as SQL expression
let ast_expr = parse_sql_expr(expr)?;
let expr = self.parse_sql_expr(&ast_expr)?;
let schema = Schema::try_from(self.schema.as_ref())?;
Expand Down Expand Up @@ -999,7 +1022,7 @@ mod tests {
};
use arrow_schema::{DataType, Fields, Schema};
use datafusion::{
logical_expr::{lit, Cast},
logical_expr::{col, lit, Cast},
prelude::{array_element, get_field},
};
use datafusion_functions::core::expr_ext::FieldAccessor;
Expand Down
20 changes: 13 additions & 7 deletions rust/lance/src/dataset/scanner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1295,12 +1295,14 @@ impl Scanner {
arrow_schema: &ArrowSchema,
) -> Result<Arc<dyn PhysicalExpr>> {
let lance_schema = dataset.schema();
let field_path = lance_schema.resolve(column_name).ok_or_else(|| {
Error::invalid_input(
format!("Field '{}' not found in schema", column_name),
location!(),
)
})?;
let field_path = lance_schema
.resolve_case_insensitive(column_name)
.ok_or_else(|| {
Error::invalid_input(
format!("Field '{}' not found in schema", column_name),
location!(),
)
})?;

if field_path.len() == 1 {
// Simple top-level column
Expand All @@ -1315,7 +1317,11 @@ impl Scanner {
// Nested field - build a chain of GetFieldFunc calls
let get_field_func = ScalarUDF::from(GetFieldFunc::default());

let mut expr = col(&field_path[0].name);
// Use Expr::Column with Column::new_unqualified to preserve exact case
// (col() normalizes identifiers to lowercase)
let mut expr = Expr::Column(datafusion::common::Column::new_unqualified(
&field_path[0].name,
));
for nested_field in &field_path[1..] {
expr = get_field_func.call(vec![expr, lit(&nested_field.name)]);
}
Expand Down
Loading
Loading