-
Notifications
You must be signed in to change notification settings - Fork 728
fix: make column name lookups case-insensitive #5465
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
wjones127
merged 7 commits into
lance-format:main
from
wjones127:fix-case-insensitive-column-names
Dec 18, 2025
Merged
Changes from 4 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
b7d43f5
fix: make column name lookups case-insensitive
wjones127 8e7076c
refactor: integrate case-insensitive resolution into Planner::column()
wjones127 b88c99f
fix: handle nested field paths correctly in index creation
wjones127 f7bbfb9
test: verify scalar index is used via explain_plan
wjones127 8369d3a
fix nested fields
wjones127 3d903aa
fix: improve case-insensitive column resolution for nested paths
wjones127 569444c
test: strengthen case-variant column tests
wjones127 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,381 @@ | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
|
||
| """ | ||
| Tests for column name handling with mixed case and special characters. | ||
|
|
||
| These tests verify that Lance properly handles column names that: | ||
| 1. Use mixed case (e.g., "userId", "OrderId") - common in TypeScript/JavaScript | ||
| 2. Contain special characters (e.g., "user-id", "order:id") | ||
|
|
||
| See: https://github.com/lancedb/lance/issues/3424 | ||
| """ | ||
|
|
||
| from pathlib import Path | ||
|
|
||
| import lance | ||
| import pyarrow as pa | ||
| import pytest | ||
| from lance.dataset import ColumnOrdering | ||
|
|
||
|
|
||
class TestMixedCaseColumnNames:
    """
    Exercise camelCase / PascalCase column names without backtick quoting.

    Column names such as ``userId`` or ``OrderId`` are common for users coming
    from TypeScript/JavaScript. Each test passes the name verbatim (no
    backticks) to one dataset operation: filtering, ordering, scalar indexing,
    altering, dropping, and merge-insert.
    """

    @pytest.fixture
    def mixed_case_table(self):
        """Return a 100-row Arrow table with three mixed-case columns."""
        columns = {
            "userId": range(100),
            "OrderId": range(100, 200),
            "itemName": [f"item_{i}" for i in range(100)],
        }
        return pa.table(columns)

    @pytest.fixture
    def mixed_case_dataset(self, tmp_path: Path, mixed_case_table):
        """Write ``mixed_case_table`` under ``tmp_path`` and return the dataset."""
        target = tmp_path / "mixed_case"
        return lance.write_dataset(mixed_case_table, target)

    def test_create_table_with_mixed_case(self, mixed_case_dataset):
        """The written dataset's schema keeps every column name as given."""
        # Collect the schema's field names once and check each expected column.
        column_names = [field.name for field in mixed_case_dataset.schema]
        assert "userId" in column_names
        assert "OrderId" in column_names
        assert "itemName" in column_names

    def test_filter_with_mixed_case(self, mixed_case_dataset):
        """Unquoted mixed-case names are accepted in filter expressions."""
        # (filter expression, expected number of matching rows); none of the
        # expressions uses backticks.
        expectations = [
            ("userId > 50", 49),
            ("OrderId >= 150", 50),
            ("itemName = 'item_25'", 1),
        ]
        for predicate, expected_rows in expectations:
            matched = mixed_case_dataset.to_table(filter=predicate)
            assert matched.num_rows == expected_rows

    def test_order_by_with_mixed_case(self, mixed_case_dataset):
        """``ColumnOrdering`` sorts on mixed-case column names."""
        # Per the original author's note, order_by takes column names or
        # ColumnOrdering objects and does not go through SQL parsing.
        descending = ColumnOrdering("userId", ascending=False)
        by_user = mixed_case_dataset.scanner(order_by=[descending]).to_table()
        assert by_user.num_rows == 100
        assert by_user["userId"][0].as_py() == 99

        # Same check in ascending order on a second mixed-case column.
        ascending = ColumnOrdering("OrderId", ascending=True)
        by_order = mixed_case_dataset.scanner(order_by=[ascending]).to_table()
        assert by_order["OrderId"][0].as_py() == 100

    def test_scalar_index_with_mixed_case(self, mixed_case_dataset):
        """A BTREE scalar index can be built and queried on a mixed-case column."""
        mixed_case_dataset.create_scalar_index("userId", index_type="BTREE")

        index_list = mixed_case_dataset.list_indices()
        assert len(index_list) == 1
        assert index_list[0]["fields"] == ["userId"]

        # Point lookup against the indexed column.
        predicate = "userId = 50"
        assert mixed_case_dataset.to_table(filter=predicate).num_rows == 1

        # The explained plan must mention the scalar index query node.
        query_plan = mixed_case_dataset.scanner(filter=predicate).explain_plan()
        assert "ScalarIndexQuery" in query_plan

    def test_alter_column_with_mixed_case(self, mixed_case_dataset):
        """A mixed-case column can be renamed through ``alter_columns``."""
        # Per the original author's note, alter_columns uses a direct schema
        # lookup rather than SQL parsing.
        rename = {"path": "userId", "name": "user_id"}
        mixed_case_dataset.alter_columns(rename)

        column_names = [field.name for field in mixed_case_dataset.schema]
        assert "user_id" in column_names
        assert "userId" not in column_names

    def test_drop_column_with_mixed_case(self, tmp_path: Path, mixed_case_table):
        """A mixed-case column can be removed through ``drop_columns``."""
        # Per the original author's note, drop_columns uses a direct schema
        # lookup rather than SQL parsing.
        dataset = lance.write_dataset(mixed_case_table, tmp_path / "drop_test")

        dataset.drop_columns(["OrderId"])

        remaining = [field.name for field in dataset.schema]
        assert "OrderId" not in remaining
        assert "userId" in remaining

    def test_merge_insert_with_mixed_case_key(self, tmp_path: Path, mixed_case_table):
        """Merge-insert accepts a mixed-case column as the join key."""
        dataset = lance.write_dataset(mixed_case_table, tmp_path / "merge_test")

        # Keys 50..149: the first half overlaps the existing 0..99 keys, the
        # second half is new.
        incoming = pa.table(
            {
                "userId": range(50, 150),
                "OrderId": range(1000, 1100),
                "itemName": [f"new_item_{i}" for i in range(100)],
            }
        )

        builder = dataset.merge_insert("userId")
        builder = builder.when_matched_update_all().when_not_matched_insert_all()
        builder.execute(incoming)

        merged = dataset.to_table()
        assert merged.num_rows == 150
|
|
||
|
|
||
class TestSpecialCharacterColumnNames:
    """
    Exercise column names that contain ``-`` and ``:``.

    Each test targets one dataset operation: filtering (with backtick
    quoting), ordering, scalar indexing, altering, dropping, and merge-insert.

    Note: per the original author, ``.`` is NOT allowed in top-level column
    names because it is the nested-field path separator, so these tests stick
    to ``-`` and ``:``.
    """

    @pytest.fixture
    def special_char_table(self):
        """Return a 100-row Arrow table with two special-character columns."""
        columns = {
            "user-id": range(100),
            "order:id": range(100, 200),
            "item_name": [f"item_{i}" for i in range(100)],
        }
        return pa.table(columns)

    @pytest.fixture
    def special_char_dataset(self, tmp_path: Path, special_char_table):
        """Write ``special_char_table`` under ``tmp_path`` and return the dataset."""
        target = tmp_path / "special_char"
        return lance.write_dataset(special_char_table, target)

    def test_create_table_with_special_chars(self, special_char_dataset):
        """The written dataset's schema keeps every column name as given."""
        column_names = [field.name for field in special_char_dataset.schema]
        assert "user-id" in column_names
        assert "order:id" in column_names
        assert "item_name" in column_names

    def test_filter_with_special_chars_using_backticks(self, special_char_dataset):
        """Backtick-quoted special-character names work in filter expressions."""
        # (filter expression, expected number of matching rows). The last
        # entry is an ordinary column kept for comparison.
        expectations = [
            ("`user-id` > 50", 49),
            ("`order:id` >= 150", 50),
            ("item_name = 'item_25'", 1),
        ]
        for predicate, expected_rows in expectations:
            matched = special_char_dataset.to_table(filter=predicate)
            assert matched.num_rows == expected_rows

    def test_order_by_with_special_chars(self, special_char_dataset):
        """``ColumnOrdering`` sorts on special-character column names."""
        # Per the original author's note, order_by uses the column name
        # directly rather than SQL parsing, so no backticks are needed here.
        descending = ColumnOrdering("user-id", ascending=False)
        by_user = special_char_dataset.scanner(order_by=[descending]).to_table()
        assert by_user.num_rows == 100
        assert by_user["user-id"][0].as_py() == 99

        ascending = ColumnOrdering("order:id", ascending=True)
        by_order = special_char_dataset.scanner(order_by=[ascending]).to_table()
        assert by_order["order:id"][0].as_py() == 100

    def test_scalar_index_with_special_chars(self, special_char_dataset):
        """A BTREE scalar index can be built and queried on ``user-id``."""
        # The raw column name is passed to index creation without backticks.
        special_char_dataset.create_scalar_index("user-id", index_type="BTREE")

        index_list = special_char_dataset.list_indices()
        assert len(index_list) == 1
        assert index_list[0]["fields"] == ["user-id"]

        # The filter expression, unlike index creation, needs backticks.
        predicate = "`user-id` = 50"
        assert special_char_dataset.to_table(filter=predicate).num_rows == 1

        # The explained plan must mention the scalar index query node.
        query_plan = special_char_dataset.scanner(filter=predicate).explain_plan()
        assert "ScalarIndexQuery" in query_plan

    def test_alter_column_with_special_chars(self, special_char_dataset):
        """A special-character column can be renamed through ``alter_columns``."""
        # Per the original author's note, alter_columns uses a direct schema
        # lookup.
        rename = {"path": "user-id", "name": "user_id"}
        special_char_dataset.alter_columns(rename)

        column_names = [field.name for field in special_char_dataset.schema]
        assert "user_id" in column_names
        assert "user-id" not in column_names

    def test_drop_column_with_special_chars(self, tmp_path: Path, special_char_table):
        """A special-character column can be removed through ``drop_columns``."""
        # Per the original author's note, drop_columns uses a direct schema
        # lookup.
        dataset = lance.write_dataset(special_char_table, tmp_path / "drop_test")

        dataset.drop_columns(["order:id"])

        remaining = [field.name for field in dataset.schema]
        assert "order:id" not in remaining
        assert "user-id" in remaining

    def test_merge_insert_with_special_char_key(
        self, tmp_path: Path, special_char_table
    ):
        """Merge-insert accepts a special-character column as the join key."""
        dataset = lance.write_dataset(special_char_table, tmp_path / "merge_test")

        # Keys 50..149: the first half overlaps the existing 0..99 keys, the
        # second half is new.
        incoming = pa.table(
            {
                "user-id": range(50, 150),
                "order:id": range(1000, 1100),
                "item_name": [f"new_item_{i}" for i in range(100)],
            }
        )

        builder = dataset.merge_insert("user-id")
        builder = builder.when_matched_update_all().when_not_matched_insert_all()
        builder.execute(incoming)

        merged = dataset.to_table()
        assert merged.num_rows == 150
|
|
||
|
|
||
class TestNestedFieldColumnNames:
    """
    Exercise mixed-case and special-character names inside struct fields.

    Nested field paths covered:
    - ``metadata.userId`` (mixed case in the child field)
    - ```meta-data`.`user-id``` (special characters in parent and child)
    """

    @pytest.fixture
    def nested_mixed_case_table(self):
        """Return a 100-row table with a struct column holding camelCase children."""
        struct_rows = [{"userId": i, "itemCount": i * 10} for i in range(100)]
        return pa.table({"id": range(100), "metadata": struct_rows})

    @pytest.fixture
    def nested_mixed_case_dataset(self, tmp_path: Path, nested_mixed_case_table):
        """Write ``nested_mixed_case_table`` under ``tmp_path``; return the dataset."""
        target = tmp_path / "nested_mixed_case"
        return lance.write_dataset(nested_mixed_case_table, target)

    def test_create_table_with_nested_mixed_case(self, nested_mixed_case_dataset):
        """Struct and child field names are kept as given in the schema."""
        schema = nested_mixed_case_dataset.schema
        top_level = [field.name for field in schema]
        assert "metadata" in top_level

        # Iterate the struct type to inspect its child fields.
        struct_type = schema.field("metadata").type
        child_names = [child.name for child in struct_type]
        assert "userId" in child_names
        assert "itemCount" in child_names

    def test_filter_with_nested_mixed_case(self, nested_mixed_case_dataset):
        """Unquoted dotted paths with mixed-case children work in filters."""
        # (filter expression, expected number of matching rows)
        expectations = [
            ("metadata.userId > 50", 49),
            ("metadata.itemCount >= 500", 50),
        ]
        for predicate, expected_rows in expectations:
            matched = nested_mixed_case_dataset.to_table(filter=predicate)
            assert matched.num_rows == expected_rows

    def test_scalar_index_with_nested_mixed_case(self, nested_mixed_case_dataset):
        """A BTREE scalar index can be built and queried on ``metadata.userId``."""
        field_path = "metadata.userId"
        nested_mixed_case_dataset.create_scalar_index(field_path, index_type="BTREE")

        index_list = nested_mixed_case_dataset.list_indices()
        assert len(index_list) == 1
        assert index_list[0]["fields"] == ["metadata.userId"]

        # Point lookup against the indexed nested column.
        predicate = "metadata.userId = 50"
        matched = nested_mixed_case_dataset.to_table(filter=predicate)
        assert matched.num_rows == 1

        # The explained plan must mention the scalar index query node.
        scanner = nested_mixed_case_dataset.scanner(filter=predicate)
        query_plan = scanner.explain_plan()
        assert "ScalarIndexQuery" in query_plan

    @pytest.fixture
    def nested_special_char_table(self):
        """Return a 100-row table whose struct and children use ``-`` and ``:``."""
        struct_rows = [{"user-id": i, "item:count": i * 10} for i in range(100)]
        return pa.table({"id": range(100), "meta-data": struct_rows})

    @pytest.fixture
    def nested_special_char_dataset(self, tmp_path: Path, nested_special_char_table):
        """Write ``nested_special_char_table`` under ``tmp_path``; return the dataset."""
        target = tmp_path / "nested_special_char"
        return lance.write_dataset(nested_special_char_table, target)

    def test_create_table_with_nested_special_chars(self, nested_special_char_dataset):
        """Struct and child names with special characters are kept as given."""
        schema = nested_special_char_dataset.schema
        top_level = [field.name for field in schema]
        assert "meta-data" in top_level

        # Iterate the struct type to inspect its child fields.
        struct_type = schema.field("meta-data").type
        child_names = [child.name for child in struct_type]
        assert "user-id" in child_names
        assert "item:count" in child_names

    def test_filter_with_nested_special_chars(self, nested_special_char_dataset):
        """Backtick-quoted nested paths work in filter expressions."""
        # Parent and child segments are each wrapped in backticks because
        # both contain special characters.
        expectations = [
            ("`meta-data`.`user-id` > 50", 49),
            ("`meta-data`.`item:count` >= 500", 50),
        ]
        for predicate, expected_rows in expectations:
            matched = nested_special_char_dataset.to_table(filter=predicate)
            assert matched.num_rows == expected_rows

    def test_scalar_index_with_nested_special_chars(self, nested_special_char_dataset):
        """A BTREE scalar index can be built on a backtick-quoted nested path."""
        # Index creation is given the backtick-quoted form of the nested path.
        quoted_path = "`meta-data`.`user-id`"
        nested_special_char_dataset.create_scalar_index(
            quoted_path, index_type="BTREE"
        )

        index_list = nested_special_char_dataset.list_indices()
        assert len(index_list) == 1
        # The listed field path is expected without the backticks.
        assert index_list[0]["fields"] == ["meta-data.user-id"]

        # The filter expression still needs the backticks.
        predicate = "`meta-data`.`user-id` = 50"
        matched = nested_special_char_dataset.to_table(filter=predicate)
        assert matched.num_rows == 1

        # The explained plan must mention the scalar index query node.
        scanner = nested_special_char_dataset.scanner(filter=predicate)
        query_plan = scanner.explain_plan()
        assert "ScalarIndexQuery" in query_plan
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This one is scary 😆 I wonder if we should limit the available character set for column names at all?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I actually did have a user who wanted this. I don't think there's any reason we shouldn't support it.