From 382a548817ff3fb294cb5a18a05a25bd2cbc7b25 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Thu, 31 Jul 2025 21:18:32 +0200 Subject: [PATCH 1/6] Avro: Fix tests and add missing `content` header While working on https://github.com/apache/iceberg-python/pull/2004 I've noticed some small discrepancies that I think would be good to address in a separate PR. --- pyiceberg/manifest.py | 1 + tests/conftest.py | 32 ++++++++++++++++++++++++++++++-- tests/utils/test_manifest.py | 17 +++++------------ 3 files changed, 36 insertions(+), 14 deletions(-) diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 362e9085df..4a65637d64 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -1279,6 +1279,7 @@ def __init__( "parent-snapshot-id": str(parent_snapshot_id) if parent_snapshot_id is not None else "null", "sequence-number": str(sequence_number), "format-version": "2", + "content": "data", AVRO_CODEC_KEY: compression, }, ) diff --git a/tests/conftest.py b/tests/conftest.py index 584b6c633a..3de871663b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -47,6 +47,7 @@ import boto3 import pytest from moto import mock_aws +from pydantic_core import to_json from pyiceberg.catalog import Catalog, load_catalog from pyiceberg.catalog.noop import NoopCatalog @@ -67,10 +68,12 @@ ) from pyiceberg.io.fsspec import FsspecFileIO from pyiceberg.manifest import DataFile, FileFormat +from pyiceberg.partitioning import PartitionField, PartitionSpec from pyiceberg.schema import Accessor, Schema from pyiceberg.serializers import ToOutputFile from pyiceberg.table import FileScanTask, Table from pyiceberg.table.metadata import TableMetadataV1, TableMetadataV2 +from pyiceberg.transforms import DayTransform, IdentityTransform from pyiceberg.types import ( BinaryType, BooleanType, @@ -1858,7 +1861,24 @@ def simple_map() -> MapType: @pytest.fixture(scope="session") -def generated_manifest_entry_file(avro_schema_manifest_entry: Dict[str, Any]) -> Generator[str, None, None]: +def test_schema() -> Schema: + return Schema( + NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", TimestampType(), False) + ) + + +@pytest.fixture(scope="session") +def test_partition_spec() -> Schema: + return PartitionSpec( + PartitionField(1, 1000, IdentityTransform(), "VendorID"), + PartitionField(2, 1001, DayTransform(), "tpep_pickup_datetime"), + ) + + +@pytest.fixture(scope="session") +def generated_manifest_entry_file( + avro_schema_manifest_entry: Dict[str, Any], test_schema: Schema, test_partition_spec: PartitionSpec +) -> Generator[str, None, None]: from fastavro import parse_schema, writer parsed_schema = parse_schema(avro_schema_manifest_entry) @@ -1866,7 +1886,15 @@ def generated_manifest_entry_file(avro_schema_manifest_entry: Dict[str, Any]) -> with TemporaryDirectory() as tmpdir: tmp_avro_file = tmpdir + "/manifest.avro" with open(tmp_avro_file, "wb") as out: - writer(out, parsed_schema, manifest_entry_records) + writer( + out, + parsed_schema, + manifest_entry_records, + metadata={ + "schema": test_schema.model_dump_json(), + "partition-spec": to_json(test_partition_spec.fields).decode("utf-8"), + }, + ) yield tmp_avro_file diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 825431c776..fc68a2fc9e 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -38,10 +38,9 @@ write_manifest, write_manifest_list, ) -from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionField, PartitionSpec +from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec from pyiceberg.schema import Schema from pyiceberg.table.snapshots import Operation, Snapshot, Summary -from pyiceberg.transforms import IdentityTransform from pyiceberg.typedef import Record, TableVersion from pyiceberg.types import IntegerType, NestedField @@ -363,6 +362,8 @@ def test_write_manifest( generated_manifest_file_file_v1: str, generated_manifest_file_file_v2: str, format_version: TableVersion, + test_schema: Schema, + test_partition_spec: PartitionSpec, compression: AvroCompressionCodec, ) -> None: io = load_file_io() @@ -376,20 +377,12 @@ def test_write_manifest( ) demo_manifest_file = snapshot.manifests(io)[0] manifest_entries = demo_manifest_file.fetch_manifest_entry(io) - test_schema = Schema( - NestedField(1, "VendorID", IntegerType(), False), NestedField(2, "tpep_pickup_datetime", IntegerType(), False) - ) - test_spec = PartitionSpec( - PartitionField(source_id=1, field_id=1, transform=IdentityTransform(), name="VendorID"), - PartitionField(source_id=2, field_id=2, transform=IdentityTransform(), name="tpep_pickup_datetime"), - spec_id=demo_manifest_file.partition_spec_id, - ) with TemporaryDirectory() as tmpdir: tmp_avro_file = tmpdir + "/test_write_manifest.avro" output = io.new_output(tmp_avro_file) with write_manifest( format_version=format_version, - spec=test_spec, + spec=test_partition_spec, schema=test_schema, output_file=output, snapshot_id=8744736658442914487, @@ -404,7 +397,7 @@ def test_write_manifest( expected_metadata = { "schema": test_schema.model_dump_json(), - "partition-spec": """[{"source-id":1,"field-id":1,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":2,"transform":"identity","name":"tpep_pickup_datetime"}]""", + "partition-spec": """[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_datetime"}]""", "partition-spec-id": str(demo_manifest_file.partition_spec_id), "format-version": str(format_version), } From 6db3e522da4c9f8a8d56a27ac700482df4637827 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Fri, 1 Aug 2025 22:26:54 +0200 Subject: [PATCH 2/6] Make the name more specific --- tests/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/conftest.py b/tests/conftest.py index 3de871663b..1cb11c70f3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1871,7 +1871,7 @@ def test_schema() -> Schema: def test_partition_spec() -> Schema: return PartitionSpec( PartitionField(1, 1000, IdentityTransform(), "VendorID"), - PartitionField(2, 1001, DayTransform(), "tpep_pickup_datetime"), + PartitionField(2, 1001, DayTransform(), "tpep_pickup_day"), ) From 63c4504087ccb1b21de78c110fb059f241453543 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Sun, 3 Aug 2025 21:03:40 +0200 Subject: [PATCH 3/6] Thanks Kevin! --- tests/conftest.py | 16 ++++++++-------- tests/utils/test_manifest.py | 18 +++++++++--------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 1cb11c70f3..e5572483ea 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1258,8 +1258,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: {"key": 15, "value": 0}, ], "lower_bounds": [ - {"key": 2, "value": b"2020-04-01 00:00"}, - {"key": 3, "value": b"2020-04-01 00:12"}, + {"key": 2, "value": b"\x01\x00\x00\x00"}, + {"key": 3, "value": b"\x01\x00\x00\x00"}, {"key": 7, "value": b"\x03\x00\x00\x00"}, {"key": 8, "value": b"\x01\x00\x00\x00"}, {"key": 10, "value": b"\xf6(\\\x8f\xc2\x05S\xc0"}, @@ -1273,8 +1273,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: {"key": 19, "value": b"\x00\x00\x00\x00\x00\x00\x04\xc0"}, ], "upper_bounds": [ - {"key": 2, "value": b"2020-04-30 23:5:"}, - {"key": 3, "value": b"2020-05-01 00:41"}, + {"key": 2, "value": b"\x06\x00\x00\x00"}, + {"key": 3, "value": b"\x06\x00\x00\x00"}, {"key": 7, "value": b"\t\x01\x00\x00"}, {"key": 8, "value": b"\t\x01\x00\x00"}, {"key": 10, "value": b"\xcd\xcc\xcc\xcc\xcc,_@"}, @@ -1379,8 +1379,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: ], "lower_bounds": [ {"key": 1, "value": b"\x01\x00\x00\x00"}, - {"key": 2, "value": b"2020-04-01 00:00"}, - {"key": 3, "value": b"2020-04-01 00:03"}, + {"key": 2, "value": b"\x01\x00\x00\x00"}, + {"key": 3, "value": b"\x01\x00\x00\x00"}, {"key": 4, "value": b"\x00\x00\x00\x00"}, {"key": 5, "value": b"\x01\x00\x00\x00"}, {"key": 6, "value": b"N"}, @@ -1399,8 +1399,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: ], "upper_bounds": [ {"key": 1, "value": b"\x01\x00\x00\x00"}, - {"key": 2, "value": b"2020-04-30 23:5:"}, - {"key": 3, "value": b"2020-05-01 00:1:"}, + {"key": 2, "value": b"\x06\x00\x00\x00"}, + {"key": 3, "value": b"\x06\x00\x00\x00"}, {"key": 4, "value": b"\x06\x00\x00\x00"}, {"key": 5, "value": b"c\x00\x00\x00"}, {"key": 6, "value": b"Y"}, diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index fc68a2fc9e..210256343e 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -153,8 +153,8 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: } assert data_file.nan_value_counts == {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0} assert data_file.lower_bounds == { - 2: b"2020-04-01 00:00", - 3: b"2020-04-01 00:12", + 2: b"\x01\x00\x00\x00", + 3: b"\x01\x00\x00\x00", 7: b"\x03\x00\x00\x00", 8: b"\x01\x00\x00\x00", 10: b"\xf6(\\\x8f\xc2\x05S\xc0", @@ -168,8 +168,8 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0", } assert data_file.upper_bounds == { - 2: b"2020-04-30 23:5:", - 3: b"2020-05-01 00:41", + 2: b"\x06\x00\x00\x00", + 3: b"\x06\x00\x00\x00", 7: b"\t\x01\x00\x00", 8: b"\t\x01\x00\x00", 10: b"\xcd\xcc\xcc\xcc\xcc,_@", @@ -397,7 +397,7 @@ def test_write_manifest( expected_metadata = { "schema": test_schema.model_dump_json(), - "partition-spec": """[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_datetime"}]""", + "partition-spec": """[{"source-id":1,"field-id":1000,"transform":"identity","name":"VendorID"},{"source-id":2,"field-id":1001,"transform":"day","name":"tpep_pickup_day"}]""", "partition-spec-id": str(demo_manifest_file.partition_spec_id), "format-version": str(format_version), } @@ -490,8 +490,8 @@ def test_write_manifest( } assert data_file.nan_value_counts == {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0} assert data_file.lower_bounds == { - 2: b"2020-04-01 00:00", - 3: b"2020-04-01 00:12", + 2: b"\x01\x00\x00\x00", + 3: b"\x01\x00\x00\x00", 7: b"\x03\x00\x00\x00", 8: b"\x01\x00\x00\x00", 10: b"\xf6(\\\x8f\xc2\x05S\xc0", @@ -505,8 +505,8 @@ def test_write_manifest( 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0", } assert data_file.upper_bounds == { - 2: b"2020-04-30 23:5:", - 3: b"2020-05-01 00:41", + 2: b"\x06\x00\x00\x00", + 3: b"\x06\x00\x00\x00", 7: b"\t\x01\x00\x00", 8: b"\t\x01\x00\x00", 10: b"\xcd\xcc\xcc\xcc\xcc,_@", From 0c58bdad21a57a847e690be8d83eb9e28d01eaa8 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 4 Aug 2025 10:58:14 -0700 Subject: [PATCH 4/6] 8 bytes --- tests/conftest.py | 16 ++++++++-------- tests/utils/test_manifest.py | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e5572483ea..72ece557b3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1258,8 +1258,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: {"key": 15, "value": 0}, ], "lower_bounds": [ - {"key": 2, "value": b"\x01\x00\x00\x00"}, - {"key": 3, "value": b"\x01\x00\x00\x00"}, + {"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, {"key": 7, "value": b"\x03\x00\x00\x00"}, {"key": 8, "value": b"\x01\x00\x00\x00"}, {"key": 10, "value": b"\xf6(\\\x8f\xc2\x05S\xc0"}, @@ -1273,8 +1273,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: {"key": 19, "value": b"\x00\x00\x00\x00\x00\x00\x04\xc0"}, ], "upper_bounds": [ - {"key": 2, "value": b"\x06\x00\x00\x00"}, - {"key": 3, "value": b"\x06\x00\x00\x00"}, + {"key": 2, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 3, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"}, {"key": 7, "value": b"\t\x01\x00\x00"}, {"key": 8, "value": b"\t\x01\x00\x00"}, {"key": 10, "value": b"\xcd\xcc\xcc\xcc\xcc,_@"}, @@ -1379,8 +1379,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: ], "lower_bounds": [ {"key": 1, "value": b"\x01\x00\x00\x00"}, - {"key": 2, "value": b"\x01\x00\x00\x00"}, - {"key": 3, "value": b"\x01\x00\x00\x00"}, + {"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, {"key": 4, "value": b"\x00\x00\x00\x00"}, {"key": 5, "value": b"\x01\x00\x00\x00"}, {"key": 6, "value": b"N"}, @@ -1399,8 +1399,8 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: ], "upper_bounds": [ {"key": 1, "value": b"\x01\x00\x00\x00"}, - {"key": 2, "value": b"\x06\x00\x00\x00"}, - {"key": 3, "value": b"\x06\x00\x00\x00"}, + {"key": 2, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 3, "value": b"\x06\x00\x00\x00\x00\x00\x00\x00"}, {"key": 4, "value": b"\x06\x00\x00\x00"}, {"key": 5, "value": b"c\x00\x00\x00"}, {"key": 6, "value": b"Y"}, diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 210256343e..7c62b9564c 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -153,8 +153,8 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: } assert data_file.nan_value_counts == {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0} assert data_file.lower_bounds == { - 2: b"\x01\x00\x00\x00", - 3: b"\x01\x00\x00\x00", + 2: b"\x01\x00\x00\x00\x00\x00\x00\x00", + 3: b"\x01\x00\x00\x00\x00\x00\x00\x00", 7: b"\x03\x00\x00\x00", 8: b"\x01\x00\x00\x00", 10: b"\xf6(\\\x8f\xc2\x05S\xc0", @@ -168,8 +168,8 @@ def test_read_manifest_entry(generated_manifest_entry_file: str) -> None: 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0", } assert data_file.upper_bounds == { - 2: b"\x06\x00\x00\x00", - 3: b"\x06\x00\x00\x00", + 2: b"\x06\x00\x00\x00\x00\x00\x00\x00", + 3: b"\x06\x00\x00\x00\x00\x00\x00\x00", 7: b"\t\x01\x00\x00", 8: b"\t\x01\x00\x00", 10: b"\xcd\xcc\xcc\xcc\xcc,_@", @@ -490,8 +490,8 @@ def test_write_manifest( } assert data_file.nan_value_counts == {16: 0, 17: 0, 18: 0, 19: 0, 10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0} assert data_file.lower_bounds == { - 2: b"\x01\x00\x00\x00", - 3: b"\x01\x00\x00\x00", + 2: b"\x01\x00\x00\x00\x00\x00\x00\x00", + 3: b"\x01\x00\x00\x00\x00\x00\x00\x00", 7: b"\x03\x00\x00\x00", 8: b"\x01\x00\x00\x00", 10: b"\xf6(\\\x8f\xc2\x05S\xc0", @@ -505,8 +505,8 @@ def test_write_manifest( 19: b"\x00\x00\x00\x00\x00\x00\x04\xc0", } assert data_file.upper_bounds == { - 2: b"\x06\x00\x00\x00", - 3: b"\x06\x00\x00\x00", + 2: b"\x06\x00\x00\x00\x00\x00\x00\x00", + 3: b"\x06\x00\x00\x00\x00\x00\x00\x00", 7: b"\t\x01\x00\x00", 8: b"\t\x01\x00\x00", 10: b"\xcd\xcc\xcc\xcc\xcc,_@", From f808e4f7308ddd545eddf2b5a71c6992ea3c1cb6 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 4 Aug 2025 11:09:51 -0700 Subject: [PATCH 5/6] typo --- tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 72ece557b3..79560bc532 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1259,7 +1259,7 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: ], "lower_bounds": [ {"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, - {"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 3, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, {"key": 7, "value": b"\x03\x00\x00\x00"}, {"key": 8, "value": b"\x01\x00\x00\x00"}, {"key": 10, "value": b"\xf6(\\\x8f\xc2\x05S\xc0"}, @@ -1380,7 +1380,7 @@ def metadata_location_gz(tmp_path_factory: pytest.TempPathFactory) -> str: "lower_bounds": [ {"key": 1, "value": b"\x01\x00\x00\x00"}, {"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, - {"key": 2, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, + {"key": 3, "value": b"\x01\x00\x00\x00\x00\x00\x00\x00"}, {"key": 4, "value": b"\x00\x00\x00\x00"}, {"key": 5, "value": b"\x01\x00\x00\x00"}, {"key": 6, "value": b"N"}, From 81291d6e9b1797f4aefae19bf5ec587391813e86 Mon Sep 17 00:00:00 2001 From: Fokko Driesprong Date: Mon, 4 Aug 2025 20:26:26 +0200 Subject: [PATCH 6/6] Wrong place --- pyiceberg/manifest.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pyiceberg/manifest.py b/pyiceberg/manifest.py index 4a65637d64..362e9085df 100644 --- a/pyiceberg/manifest.py +++ b/pyiceberg/manifest.py @@ -1279,7 +1279,6 @@ def __init__( "parent-snapshot-id": str(parent_snapshot_id) if parent_snapshot_id is not None else "null", "sequence-number": str(sequence_number), "format-version": "2", - "content": "data", AVRO_CODEC_KEY: compression, }, )