Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python: Add support for initial default #7699

Merged
merged 3 commits into from
Jun 2, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions python/pyiceberg/avro/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,19 @@ def skip(self, decoder: BinaryDecoder) -> None:
return None


class DefaultReader(Reader):
default_value: Any

def __init__(self, default_value: Any) -> None:
self.default_value = default_value

def read(self, _: BinaryDecoder) -> Any:
return self.default_value

def skip(self, decoder: BinaryDecoder) -> None:
return None
Fokko marked this conversation as resolved.
Show resolved Hide resolved


class BooleanReader(Reader):
def read(self, decoder: BinaryDecoder) -> bool:
return decoder.read_boolean()
Expand Down
14 changes: 10 additions & 4 deletions python/pyiceberg/avro/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
BooleanReader,
DateReader,
DecimalReader,
DefaultReader,
DoubleReader,
FixedReader,
FloatReader,
Expand Down Expand Up @@ -77,6 +78,8 @@
UUIDType,
)

STRUCT_ROOT = -1


def construct_reader(
file_schema: Union[Schema, IcebergType], read_types: Dict[int, Callable[..., StructProtocol]] = EMPTY_DICT
Expand Down Expand Up @@ -128,8 +131,7 @@ def after_field(self, field: NestedField, field_partner: Optional[NestedField])
self.context.pop()

def struct(self, struct: StructType, expected_struct: Optional[IcebergType], field_readers: List[Reader]) -> Reader:
# -1 indicates the struct root
read_struct_id = self.context[-1] if len(self.context) > 0 else -1
read_struct_id = self.context[STRUCT_ROOT] if len(self.context) > 0 else STRUCT_ROOT
struct_callable = self.read_types.get(read_struct_id, Record)

if not expected_struct:
Expand All @@ -150,8 +152,12 @@ def struct(self, struct: StructType, expected_struct: Optional[IcebergType], fie
if read_field.field_id not in file_fields:
if read_field.required:
raise ResolveError(f"{read_field} is non-optional, and not part of the file schema")
# Just set the new field to None
results.append((pos, NoneReader()))
rdblue marked this conversation as resolved.
Show resolved Hide resolved
if isinstance(read_field, NestedField) and read_field.initial_default is not None:
# The field is not in the file, but there is a default value
results.append((pos, DefaultReader(read_field.initial_default)))
else:
# Just set the new field to None
results.append((pos, NoneReader()))

return StructReader(tuple(results), struct_callable, expected_struct)

Expand Down
6 changes: 3 additions & 3 deletions python/pyiceberg/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,9 +229,9 @@ def __init__(self, *data: Any, **named_data: Any) -> None:
NestedField(500, "manifest_path", StringType(), required=True, doc="Location URI with FS scheme"),
NestedField(501, "manifest_length", LongType(), required=True),
NestedField(502, "partition_spec_id", IntegerType(), required=True),
NestedField(517, "content", IntegerType(), required=False),
NestedField(515, "sequence_number", LongType(), required=False),
NestedField(516, "min_sequence_number", LongType(), required=False),
NestedField(517, "content", IntegerType(), required=False, initial_default=0),
NestedField(515, "sequence_number", LongType(), required=False, initial_default=0),
NestedField(516, "min_sequence_number", LongType(), required=False, initial_default=0),
NestedField(503, "added_snapshot_id", LongType(), required=False),
NestedField(504, "added_files_count", IntegerType(), required=False),
NestedField(505, "existing_files_count", IntegerType(), required=False),
Expand Down
3 changes: 3 additions & 0 deletions python/pyiceberg/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,7 @@ class NestedField(IcebergType):
field_type: IcebergType = Field(alias="type")
required: bool = Field(default=True)
doc: Optional[str] = Field(default=None, repr=False)
initial_default: Any = Field(alias="initial-default", repr=False)

def __init__(
self,
Expand All @@ -227,6 +228,7 @@ def __init__(
field_type: Optional[IcebergType] = None,
required: bool = True,
doc: Optional[str] = None,
initial_default: Optional[Any] = None,
**data: Any,
):
# We need an init when we want to use positional arguments, but
Expand All @@ -236,6 +238,7 @@ def __init__(
data["field_type"] = data["type"] if "type" in data else field_type
data["required"] = required
data["doc"] = doc
data["initial_default"] = initial_default
super().__init__(**data)

def __str__(self) -> str:
Expand Down
21 changes: 21 additions & 0 deletions python/tests/avro/test_resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from pyiceberg.avro.file import AvroFile
from pyiceberg.avro.reader import (
DecimalReader,
DefaultReader,
DoubleReader,
FloatReader,
IntegerReader,
Expand Down Expand Up @@ -280,3 +281,23 @@ class Ints(Record):
records = list(reader)

assert repr(records) == "[Ints[c=3, d=None]]"


def test_resolver_initial_value() -> None:
write_schema = Schema(
NestedField(1, "name", StringType()),
schema_id=1,
)
read_schema = Schema(
NestedField(2, "something", StringType(), required=False, initial_default="vo"),
schema_id=1,
rdblue marked this conversation as resolved.
Show resolved Hide resolved
)

assert resolve(write_schema, read_schema) == StructReader(
(
(None, StringReader()), # The one we skip
(0, DefaultReader("vo")),
),
Record,
read_schema.as_struct(),
)
8 changes: 4 additions & 4 deletions python/tests/utils/test_manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def test_read_manifest_list(generated_manifest_file_file: str) -> None:


def test_read_manifest(generated_manifest_file_file: str) -> None:
io = load_file_io({})
io = load_file_io()

snapshot = Snapshot(
snapshot_id=25,
Expand All @@ -191,9 +191,9 @@ def test_read_manifest(generated_manifest_file_file: str) -> None:

assert manifest_list.manifest_length == 7989
assert manifest_list.partition_spec_id == 0
assert manifest_list.content is None
assert manifest_list.sequence_number is None
assert manifest_list.min_sequence_number is None
assert manifest_list.content == 0
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a blocker, but we should have an enum for content and use it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will go well for a long time:

python git:(fd-initial-default) ✗ python3
Python 3.11.3 (main, Apr  7 2023, 20:13:31) [Clang 14.0.0 (clang-1400.0.29.202)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> from pyiceberg.manifest import ManifestContent
>>> ManifestContent.DATA == 0
True
>>> ManifestContent.DATA == 1
False
>>> 1 == ManifestContent.DATA
False

But you're right and this should be an enum, updated.

assert manifest_list.sequence_number == 0
assert manifest_list.min_sequence_number == 0
assert manifest_list.added_snapshot_id == 9182715666859759686
assert manifest_list.added_files_count == 3
assert manifest_list.existing_files_count == 0
Expand Down