Partitioned Append on Identity Transform #555
First file in the diff:
@@ -16,13 +16,13 @@
# under the License.
from __future__ import annotations

import datetime
import itertools
import uuid
import warnings
from abc import ABC, abstractmethod
from copy import copy
from dataclasses import dataclass
from datetime import datetime
from enum import Enum
from functools import cached_property, singledispatch
from itertools import chain

@@ -77,6 +77,8 @@
    INITIAL_PARTITION_SPEC_ID,
    PARTITION_FIELD_ID_START,
    PartitionField,
    PartitionFieldValue,
    PartitionKey,
    PartitionSpec,
    _PartitionNameGenerator,
    _visit_partition_field,

@@ -716,7 +718,7 @@ def _(update: SetSnapshotRefUpdate, base_metadata: TableMetadata, context: _Tabl
    if update.ref_name == MAIN_BRANCH:
        metadata_updates["current_snapshot_id"] = snapshot_ref.snapshot_id
        if "last_updated_ms" not in metadata_updates:
            metadata_updates["last_updated_ms"] = datetime_to_millis(datetime.datetime.now().astimezone())
            metadata_updates["last_updated_ms"] = datetime_to_millis(datetime.now().astimezone())

    metadata_updates["snapshot_log"] = base_metadata.snapshot_log + [
        SnapshotLogEntry(

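For context, `datetime_to_millis` converts a timezone-aware datetime to epoch milliseconds; a minimal sketch of what the updated call computes, assuming the helper lives in `pyiceberg.utils.datetime` (the printed value is illustrative):

```python
from datetime import datetime

from pyiceberg.utils.datetime import datetime_to_millis

# datetime.now().astimezone() yields a timezone-aware local timestamp,
# which datetime_to_millis turns into milliseconds since the Unix epoch.
now = datetime.now().astimezone()
print(datetime_to_millis(now))  # e.g. 1709312700123
```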
@@ -1131,8 +1133,11 @@ def append(self, df: pa.Table, snapshot_properties: Dict[str, str] = EMPTY_DICT)
    if not isinstance(df, pa.Table):
        raise ValueError(f"Expected PyArrow table, got: {df}")

    if len(self.spec().fields) > 0:
        raise ValueError("Cannot write to partitioned tables")
    supported = {IdentityTransform}

Review comment: Nit (a small change was suggested here).

    if not all(type(field.transform) in supported for field in self.metadata.spec().fields):
        raise ValueError(
            f"Not all transforms are supported, expected: {supported}, but got: {[str(field) for field in self.metadata.spec().fields if type(field.transform) not in supported]}."
        )

    _check_schema_compatible(self.schema(), other_schema=df.schema)
    # cast if the two schemas are compatible but not equal

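With this change, `append` accepts tables whose partition spec uses only identity transforms. A minimal usage sketch (the catalog name, namespace, and field values are illustrative, not part of the PR):

```python
import pyarrow as pa

from pyiceberg.catalog import load_catalog
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.transforms import IdentityTransform
from pyiceberg.types import NestedField, StringType

catalog = load_catalog("default")  # assumes a catalog named "default" is configured

schema = Schema(NestedField(field_id=1, name="region", field_type=StringType(), required=False))
spec = PartitionSpec(
    PartitionField(source_id=1, field_id=1000, transform=IdentityTransform(), name="region")
)
table = catalog.create_table("db.events", schema=schema, partition_spec=spec)

# Identity-partitioned: allowed after this PR. A bucket/truncate/temporal
# transform in the spec would still raise the ValueError above.
table.append(pa.table({"region": ["eu", "us", "eu"]}))
```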
@@ -2492,16 +2497,28 @@ def _add_and_move_fields(
class WriteTask:
    write_uuid: uuid.UUID
    task_id: int
    schema: Schema
    record_batches: List[pa.RecordBatch]
    sort_order_id: Optional[int] = None
    partition_key: Optional[PartitionKey] = None

    # Later to be extended with partition information
    def generate_data_file_partition_path(self) -> str:
        if self.partition_key is None:
            raise ValueError("Cannot generate partition path based on non-partitioned WriteTask")
        return self.partition_key.to_path()

Review comment: Nit: This function looks redundant. The check is being done in …

    def generate_data_file_filename(self, extension: str) -> str:
        # Mimics the behavior in the Java API:
        # https://github.com/apache/iceberg/blob/a582968975dd30ff4917fbbe999f1be903efac02/core/src/main/java/org/apache/iceberg/io/OutputFileFactory.java#L92-L101
        return f"00000-{self.task_id}-{self.write_uuid}.{extension}"

    def generate_data_file_path(self, extension: str) -> str:
        if self.partition_key:
            file_path = f"{self.generate_data_file_partition_path()}/{self.generate_data_file_filename(extension)}"
            return file_path
        else:
            return self.generate_data_file_filename(extension)


@dataclass(frozen=True)
class AddFileTask:

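To make the path construction concrete, here is a tiny standalone sketch of the same logic, with a stand-in for `PartitionKey` (only its `to_path()` matters here; all values are made up):

```python
import uuid
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class FakePartitionKey:
    """Stand-in for pyiceberg's PartitionKey; to_path() yields Hive-style `col=value` segments."""

    path: str

    def to_path(self) -> str:
        return self.path


def data_file_path(task_id: int, write_uuid: uuid.UUID, extension: str, key: Optional[FakePartitionKey]) -> str:
    # Same shape as WriteTask.generate_data_file_path: partition prefix only when a key is present.
    filename = f"00000-{task_id}-{write_uuid}.{extension}"
    return f"{key.to_path()}/{filename}" if key else filename


u = uuid.uuid4()
print(data_file_path(3, u, "parquet", FakePartitionKey("region=eu")))  # region=eu/00000-3-<uuid>.parquet
print(data_file_path(3, u, "parquet", None))                           # 00000-3-<uuid>.parquet
```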
@@ -2529,25 +2546,44 @@ def _dataframe_to_data_files(
    """
    from pyiceberg.io.pyarrow import bin_pack_arrow_table, write_file

    if len([spec for spec in table_metadata.partition_specs if spec.spec_id != 0]) > 0:
        raise ValueError("Cannot write to partitioned tables")

    counter = itertools.count(0)
    write_uuid = write_uuid or uuid.uuid4()

    target_file_size = PropertyUtil.property_as_int(
        properties=table_metadata.properties,
        property_name=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
        default=TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT,
    )
    if target_file_size is None:
        raise ValueError(
            "Failed to get TableProperties.WRITE_TARGET_FILE_SIZE_BYTES or WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT for writing target data file."
        )

Review comment: I have mixed feelings about this exception check, because we are setting the default value of … I understand why we are doing it though. If we run into more of these type-checking redundancies in the code base, where we are using property values that are always expected to have a non-null default value, maybe we should refactor …
Reply: property_as_int_with_default sounds better to me, because all the exceptions raised due to a missing default property could be centralized in that function. How do you feel about it?
Reply: I like that as well, the …
Reply: I just find the default value itself could be None: the original code for this target_file_size check just …
(A sketch of such a helper appears after this hunk.)

    # This is an iter, so we don't have to materialize everything every time
    # This will be more relevant when we start doing partitioned writes
    yield from write_file(
        io=io,
        table_metadata=table_metadata,
        tasks=iter([WriteTask(write_uuid, next(counter), batches) for batches in bin_pack_arrow_table(df, target_file_size)]),  # type: ignore
    )
    if len(table_metadata.spec().fields) > 0:
        partitions = partition(spec=table_metadata.spec(), schema=table_metadata.schema(), arrow_table=df)
        yield from write_file(
            io=io,
            table_metadata=table_metadata,
            tasks=iter([
                WriteTask(
                    write_uuid=write_uuid,
                    task_id=next(counter),
                    record_batches=batches,
                    partition_key=partition.partition_key,
                    schema=table_metadata.schema(),
                )
                for partition in partitions
                for batches in bin_pack_arrow_table(partition.arrow_table_partition, target_file_size)
            ]),
        )

Review comment: This looks very nice!

    else:
        yield from write_file(
            io=io,
            table_metadata=table_metadata,
            tasks=iter([
                WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=table_metadata.schema())
                for batches in bin_pack_arrow_table(df, target_file_size)
            ]),
        )


def _parquet_files_to_data_files(table_metadata: TableMetadata, file_paths: List[str], io: FileIO) -> Iterable[DataFile]:

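The helper discussed in the thread above is not part of this PR; a rough sketch of what a `property_as_int_with_default` could look like, assuming it would live next to `PropertyUtil.property_as_int` (property names and values below are illustrative):

```python
from typing import Dict, Optional


def property_as_int(properties: Dict[str, str], property_name: str, default: Optional[int] = None) -> Optional[int]:
    """Rough equivalent of PropertyUtil.property_as_int: Optional default, Optional result."""
    value = properties.get(property_name)
    return int(value) if value is not None else default


def property_as_int_with_default(properties: Dict[str, str], property_name: str, default: int) -> int:
    """The default is required and non-Optional, so the result is always an int.

    Call sites such as the target_file_size lookup above would no longer need
    their own `is None` guard and ad-hoc ValueError.
    """
    value = properties.get(property_name)
    return int(value) if value is not None else default


# Illustrative call, mirroring the write path above:
target_file_size = property_as_int_with_default(
    properties={"write.target-file-size-bytes": "134217728"},
    property_name="write.target-file-size-bytes",
    default=512 * 1024 * 1024,
)
```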
@@ -3099,7 +3135,7 @@ def snapshots(self) -> "pa.Table":
        additional_properties = None

        snapshots.append({
            'committed_at': datetime.datetime.utcfromtimestamp(snapshot.timestamp_ms / 1000.0),
            'committed_at': datetime.utcfromtimestamp(snapshot.timestamp_ms / 1000.0),
            'snapshot_id': snapshot.snapshot_id,
            'parent_id': snapshot.parent_snapshot_id,
            'operation': str(operation),

@@ -3111,3 +3147,112 @@ def snapshots(self) -> "pa.Table":
        snapshots,
        schema=snapshots_schema,
    )


@dataclass(frozen=True)
class TablePartition:
    partition_key: PartitionKey
    arrow_table_partition: pa.Table


def _get_partition_sort_order(partition_columns: list[str], reverse: bool = False) -> dict[str, Any]:
    order = 'ascending' if not reverse else 'descending'
    null_placement = 'at_start' if reverse else 'at_end'
    return {'sort_keys': [(column_name, order) for column_name in partition_columns], 'null_placement': null_placement}


def group_by_partition_scheme(arrow_table: pa.Table, partition_columns: list[str]) -> pa.Table:
    """Given a table, sort it by current partition scheme."""
    # only works for identity for now
    sort_options = _get_partition_sort_order(partition_columns, reverse=False)
    sorted_arrow_table = arrow_table.sort_by(sorting=sort_options['sort_keys'], null_placement=sort_options['null_placement'])
    return sorted_arrow_table


def get_partition_columns(
    spec: PartitionSpec,
    schema: Schema,
) -> list[str]:
    partition_cols = []
    for partition_field in spec.fields:
        column_name = schema.find_column_name(partition_field.source_id)
        if not column_name:
            raise ValueError(f"{partition_field=} could not be found in {schema}.")
        partition_cols.append(column_name)
    return partition_cols


def _get_table_partitions(
    arrow_table: pa.Table,
    partition_spec: PartitionSpec,
    schema: Schema,
    slice_instructions: list[dict[str, Any]],
) -> list[TablePartition]:
    sorted_slice_instructions = sorted(slice_instructions, key=lambda x: x['offset'])

    partition_fields = partition_spec.fields

    offsets = [inst["offset"] for inst in sorted_slice_instructions]
    projected_and_filtered = {
        partition_field.source_id: arrow_table[schema.find_field(name_or_id=partition_field.source_id).name]
        .take(offsets)
        .to_pylist()
        for partition_field in partition_fields
    }

    table_partitions = []
    for idx, inst in enumerate(sorted_slice_instructions):
        partition_slice = arrow_table.slice(**inst)
        fieldvalues = [
            PartitionFieldValue(partition_field, projected_and_filtered[partition_field.source_id][idx])
            for partition_field in partition_fields
        ]
        partition_key = PartitionKey(raw_partition_field_values=fieldvalues, partition_spec=partition_spec, schema=schema)
        table_partitions.append(TablePartition(partition_key=partition_key, arrow_table_partition=partition_slice))
    return table_partitions
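For readers less familiar with the PyArrow calls used here, a small illustration of `take` (one projected key value per slice offset) and `slice` (a zero-copy row window); the data is made up:

```python
import pyarrow as pa

t = pa.table({"year": [2020, 2021, 2022, 2022], "n_legs": [2, 4, 4, 4]})

# Project the partition column at the offsets where each partition starts.
print(t["year"].take([0, 1, 2]).to_pylist())  # [2020, 2021, 2022]

# Cut out one partition's rows; slice(offset, length) is a zero-copy view.
print(t.slice(offset=2, length=2).to_pydict())  # {'year': [2022, 2022], 'n_legs': [4, 4]}
```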


def partition(spec: PartitionSpec, schema: Schema, arrow_table: pa.Table) -> Iterable[TablePartition]:

Review comment: It would be good to have a bit more lengthy names. I also think we should hide this from the outside user (a rename was suggested). I think we can also return a list, so folks know that it is already materialized.

    """Based on the Iceberg table partition spec, slice the arrow table into partitions with their keys.

    Example:
        Input:
        An arrow table with partition key of ['n_legs', 'year'] and with data of
        {'year': [2020, 2022, 2022, 2021, 2022, 2022, 2022, 2019, 2021],
         'n_legs': [2, 2, 2, 4, 4, 4, 4, 5, 100],
         'animal': ["Flamingo", "Parrot", "Parrot", "Dog", "Horse", "Horse", "Horse", "Brittle stars", "Centipede"]}.
        The algorithm:
        First we group the rows into partitions by sorting with sort order [('n_legs', 'ascending'), ('year', 'ascending')]
        and null_placement of "at_end". For this input that yields the same table as the raw input.
        Then we compute sort_indices using the reversed order [('n_legs', 'descending'), ('year', 'descending')]
        and null_placement of "at_start".
        This gives:
        [8, 7, 4, 5, 6, 3, 1, 2, 0]
        Based on this we get partition groups of indices:
        [{'offset': 8, 'length': 1}, {'offset': 7, 'length': 1}, {'offset': 4, 'length': 3}, {'offset': 3, 'length': 1}, {'offset': 1, 'length': 2}, {'offset': 0, 'length': 1}]
        We then retrieve the partition keys by offsets,
        and slice the arrow table by the offset and length of each partition.
    """
    import pyarrow as pa

    partition_columns = get_partition_columns(spec=spec, schema=schema)
    arrow_table = group_by_partition_scheme(arrow_table, partition_columns)

    reversing_sort_order_options = _get_partition_sort_order(partition_columns, reverse=True)
    reversed_indices = pa.compute.sort_indices(arrow_table, **reversing_sort_order_options).to_pylist()

    slice_instructions: list[dict[str, Any]] = []
    last = len(reversed_indices)
    reversed_indices_size = len(reversed_indices)
    ptr = 0
    while ptr < reversed_indices_size:
        group_size = last - reversed_indices[ptr]
        offset = reversed_indices[ptr]
        slice_instructions.append({"offset": offset, "length": group_size})
        last = reversed_indices[ptr]
        ptr = ptr + group_size

    table_partitions: list[TablePartition] = _get_table_partitions(arrow_table, spec, schema, slice_instructions)

    return table_partitions
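To see the docstring's numbers come out of the actual PyArrow calls, here is a standalone trace of the grouping-and-slicing steps on that sample data (pyarrow only; the partition columns are assumed to be ['n_legs', 'year']):

```python
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({
    "year":   [2020, 2022, 2022, 2021, 2022, 2022, 2022, 2019, 2021],
    "n_legs": [2, 2, 2, 4, 4, 4, 4, 5, 100],
    "animal": ["Flamingo", "Parrot", "Parrot", "Dog", "Horse", "Horse", "Horse", "Brittle stars", "Centipede"],
})
partition_columns = ["n_legs", "year"]

# Step 1: sort so that rows belonging to the same partition become contiguous
# (for this sample the table is already in that order).
table = table.sort_by([(c, "ascending") for c in partition_columns], null_placement="at_end")

# Step 2: sort_indices with the reversed (descending, nulls first) order; walking it
# lands on the starting offset of each partition group.
reversed_indices = pc.sort_indices(
    table,
    sort_keys=[(c, "descending") for c in partition_columns],
    null_placement="at_start",
).to_pylist()
print(reversed_indices)  # [8, 7, 4, 5, 6, 3, 1, 2, 0]

# Step 3: derive one {"offset", "length"} instruction per partition.
slice_instructions, last, ptr = [], len(reversed_indices), 0
while ptr < len(reversed_indices):
    offset = reversed_indices[ptr]
    length = last - offset
    slice_instructions.append({"offset": offset, "length": length})
    last, ptr = offset, ptr + length
print(slice_instructions)
# [{'offset': 8, 'length': 1}, {'offset': 7, 'length': 1}, {'offset': 4, 'length': 3},
#  {'offset': 3, 'length': 1}, {'offset': 1, 'length': 2}, {'offset': 0, 'length': 1}]

# Each instruction is then cut out with table.slice(**inst) and paired with its PartitionKey.
```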
Second file in the diff:
@@ -30,7 +30,7 @@
import socket
import string
import uuid
from datetime import date, datetime
from datetime import date, datetime, timezone
from pathlib import Path
from random import choice
from tempfile import TemporaryDirectory

@@ -2000,7 +2000,11 @@ def spark() -> "SparkSession":
    'float': [0.0, None, 0.9],
    'double': [0.0, None, 0.9],
    'timestamp': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
    'timestamptz': [datetime(2023, 1, 1, 19, 25, 00), None, datetime(2023, 3, 1, 19, 25, 00)],
    'timestamptz': [
        datetime(2023, 1, 1, 19, 25, 00, tzinfo=timezone.utc),
        None,
        datetime(2023, 3, 1, 19, 25, 00, tzinfo=timezone.utc),
    ],

Review comment: Nice one!

    'date': [date(2023, 1, 1), None, date(2023, 3, 1)],
    # Not supported by Spark
    # 'time': [time(1, 22, 0), None, time(19, 25, 0)],

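A quick illustration of why the `tzinfo` matters here (assumed PyArrow type inference): a naive datetime maps to a plain timestamp, while an aware one carries a timezone, matching Iceberg's `timestamp` versus `timestamptz`.

```python
from datetime import datetime, timezone

import pyarrow as pa

naive = pa.array([datetime(2023, 1, 1, 19, 25)])
aware = pa.array([datetime(2023, 1, 1, 19, 25, tzinfo=timezone.utc)])
print(naive.type)  # timestamp[us]          -> Iceberg `timestamp`
print(aware.type)  # timestamp[us, tz=UTC]  -> Iceberg `timestamptz`
```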
@@ -2045,3 +2049,19 @@ def arrow_table_with_null(pa_schema: "pa.Schema") -> "pa.Table":

    """PyArrow table with all kinds of columns"""
    return pa.Table.from_pydict(TEST_DATA_WITH_NULL, schema=pa_schema)


@pytest.fixture(scope="session")
def arrow_table_without_data(pa_schema: "pa.Schema") -> "pa.Table":
    import pyarrow as pa

    """PyArrow table with all kinds of columns."""
    return pa.Table.from_pylist([], schema=pa_schema)

Review comment: (a change was suggested here).

@pytest.fixture(scope="session")
def arrow_table_with_only_nulls(pa_schema: "pa.Schema") -> "pa.Table":
    import pyarrow as pa

    """PyArrow table with all kinds of columns."""
    return pa.Table.from_pylist([{}, {}], schema=pa_schema)

Review comment: (a change was suggested here).
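The two new fixtures differ in shape rather than schema; a small sketch of the distinction (the schema is illustrative, not the one from the test suite):

```python
import pyarrow as pa

schema = pa.schema([("animal", pa.string())])

no_rows = pa.Table.from_pylist([], schema=schema)           # like arrow_table_without_data: 0 rows
only_nulls = pa.Table.from_pylist([{}, {}], schema=schema)  # like arrow_table_with_only_nulls: 2 rows, all values null

print(no_rows.num_rows, only_nulls.num_rows)   # 0 2
print(only_nulls.column("animal").null_count)  # 2
```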
Review comment: This single-dispatch is there only for the TimeType, it seems. Probably we should also convert those into a native type.
Reply: Fixed in the commit 82dd3ad.
Reply: Beautiful, thanks 👍