From 9f28f5ad94fd2f6a480b4cb788e730e57cc5c94c Mon Sep 17 00:00:00 2001 From: Georg Grob Date: Mon, 5 Aug 2024 10:15:41 +0100 Subject: [PATCH 1/3] Fix: accept empty arrays in struct field lookup Fixes #992. Empty `pyarrow` arrays are considered falsy, which caused a `ResolveError` for required fields during scan operations. --- pyiceberg/io/pyarrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py index 4175f5fecf..aefe86ac7a 100644 --- a/pyiceberg/io/pyarrow.py +++ b/pyiceberg/io/pyarrow.py @@ -1589,7 +1589,7 @@ def schema_partner(self, partner: Optional[pa.Array]) -> Optional[pa.Array]: return partner def field_partner(self, partner_struct: Optional[pa.Array], field_id: int, _: str) -> Optional[pa.Array]: - if partner_struct: + if partner_struct is not None: # use the field name from the file schema try: name = self.file_schema.find_field(field_id).name From f34c4afd7efaa93ab8dd77ef7b7cfeba05bb1092 Mon Sep 17 00:00:00 2001 From: Georg Grob Date: Tue, 6 Aug 2024 13:16:23 +0100 Subject: [PATCH 2/3] Integration test: empty scan on non-nullable ordered string column This covers the issue reported in #992 where empty scan queries yielded a `ResolveError`. Specifically, this occurred under the following conditions: - a table with an ordered, non-nullable string column - a scan filtering for a non-existing value _within_ the range of the values in that particular column --- dev/provision.py | 10 ++++++++++ tests/integration/test_reads.py | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/dev/provision.py b/dev/provision.py index 6c8fe366d7..53360748b6 100644 --- a/dev/provision.py +++ b/dev/provision.py @@ -389,3 +389,13 @@ VALUES (4) """ ) + + spark.sql( + f""" + CREATE OR REPLACE TABLE {catalog_name}.default.test_empty_scan_ordered_str (id string NOT NULL) + USING iceberg + TBLPROPERTIES ('format-version'='2') + """ + ) + spark.sql(f"ALTER TABLE {catalog_name}.default.test_empty_scan_ordered_str WRITE ORDERED BY id") + spark.sql(f"INSERT INTO {catalog_name}.default.test_empty_scan_ordered_str VALUES 'a', 'c'") diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 078abf406a..39f5ef5494 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -663,3 +663,10 @@ def another_task() -> None: table.transaction().set_properties(lock="xxx").commit_transaction() assert table.properties.get("lock") == "xxx" + +@pytest.mark.integration +@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) +def test_empty_scan_ordered_str(catalog: Catalog) -> None: + table_empty_scan_ordered_str = catalog.load_table("default.test_empty_scan_ordered_str") + arrow_table = table_empty_scan_ordered_str.scan(EqualTo("id", "b")).to_arrow() + assert len(arrow_table) == 0 From 50811149b5adfbf759ab59d104cca14dd10bd4b8 Mon Sep 17 00:00:00 2001 From: Georg Grob Date: Tue, 6 Aug 2024 16:31:27 +0100 Subject: [PATCH 3/3] Lint (add missing newline) --- tests/integration/test_reads.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py index 39f5ef5494..078ec163d4 100644 --- a/tests/integration/test_reads.py +++ b/tests/integration/test_reads.py @@ -664,6 +664,7 @@ def another_task() -> None: table.transaction().set_properties(lock="xxx").commit_transaction() assert table.properties.get("lock") == "xxx" + @pytest.mark.integration @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")]) def test_empty_scan_ordered_str(catalog: Catalog) -> None: