From 9f28f5ad94fd2f6a480b4cb788e730e57cc5c94c Mon Sep 17 00:00:00 2001
From: Georg Grob <grobgl@users.noreply.github.com>
Date: Mon, 5 Aug 2024 10:15:41 +0100
Subject: [PATCH 1/3] Fix: accept empty arrays in struct field lookup

Fixes #992.

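Empty `pyarrow` arrays are falsy, so the `if partner_struct:` truthiness check in `field_partner` treated an empty struct array as if it were missing. This caused a `ResolveError` for required fields during scan operations. Compare against `None` explicitly instead.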
---
 pyiceberg/io/pyarrow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
index 4175f5fecf..aefe86ac7a 100644
--- a/pyiceberg/io/pyarrow.py
+++ b/pyiceberg/io/pyarrow.py
@@ -1589,7 +1589,7 @@ def schema_partner(self, partner: Optional[pa.Array]) -> Optional[pa.Array]:
         return partner
 
     def field_partner(self, partner_struct: Optional[pa.Array], field_id: int, _: str) -> Optional[pa.Array]:
-        if partner_struct:
+        if partner_struct is not None:
             # use the field name from the file schema
             try:
                 name = self.file_schema.find_field(field_id).name

From f34c4afd7efaa93ab8dd77ef7b7cfeba05bb1092 Mon Sep 17 00:00:00 2001
From: Georg Grob <grobgl@users.noreply.github.com>
Date: Tue, 6 Aug 2024 13:16:23 +0100
Subject: [PATCH 2/3] Integration test: empty scan on non-nullable ordered
 string column

This covers the issue reported in #992, where scans that return no rows raised a `ResolveError`. Specifically, this occurred under the following conditions:
- a table with an ordered, non-nullable string column
- a scan filtering for a value that does not exist in that column but lies _within_ the range of its values (e.g. filtering for 'b' when the column holds 'a' and 'c')
---
 dev/provision.py                | 10 ++++++++++
 tests/integration/test_reads.py |  7 +++++++
 2 files changed, 17 insertions(+)

diff --git a/dev/provision.py b/dev/provision.py
index 6c8fe366d7..53360748b6 100644
--- a/dev/provision.py
+++ b/dev/provision.py
@@ -389,3 +389,13 @@
         VALUES (4)
         """
     )
+
+    spark.sql(
+        f"""
+        CREATE OR REPLACE TABLE {catalog_name}.default.test_empty_scan_ordered_str (id string NOT NULL)
+        USING iceberg
+        TBLPROPERTIES ('format-version'='2')
+        """
+    )
+    spark.sql(f"ALTER TABLE {catalog_name}.default.test_empty_scan_ordered_str WRITE ORDERED BY id")
+    spark.sql(f"INSERT INTO {catalog_name}.default.test_empty_scan_ordered_str VALUES 'a', 'c'")
diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py
index 078abf406a..39f5ef5494 100644
--- a/tests/integration/test_reads.py
+++ b/tests/integration/test_reads.py
@@ -663,3 +663,10 @@ def another_task() -> None:
 
         table.transaction().set_properties(lock="xxx").commit_transaction()
         assert table.properties.get("lock") == "xxx"
+
+@pytest.mark.integration
+@pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
+def test_empty_scan_ordered_str(catalog: Catalog) -> None:
+    table_empty_scan_ordered_str = catalog.load_table("default.test_empty_scan_ordered_str")
+    arrow_table = table_empty_scan_ordered_str.scan(EqualTo("id", "b")).to_arrow()
+    assert len(arrow_table) == 0

From 50811149b5adfbf759ab59d104cca14dd10bd4b8 Mon Sep 17 00:00:00 2001
From: Georg Grob <grobgl@users.noreply.github.com>
Date: Tue, 6 Aug 2024 16:31:27 +0100
Subject: [PATCH 3/3] Lint (add missing blank line before test function)

---
 tests/integration/test_reads.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/test_reads.py b/tests/integration/test_reads.py
index 39f5ef5494..078ec163d4 100644
--- a/tests/integration/test_reads.py
+++ b/tests/integration/test_reads.py
@@ -664,6 +664,7 @@ def another_task() -> None:
         table.transaction().set_properties(lock="xxx").commit_transaction()
         assert table.properties.get("lock") == "xxx"
 
+
 @pytest.mark.integration
 @pytest.mark.parametrize("catalog", [pytest.lazy_fixture("session_catalog_hive"), pytest.lazy_fixture("session_catalog")])
 def test_empty_scan_ordered_str(catalog: Catalog) -> None: