Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,12 @@
from setuptools import setup

setup(
name='py-iceberg',
name="py-iceberg",
install_requires=[],
extras_require={
"dev": [
"tox-travis==0.12",
"pytest",
],
}
},
)
46 changes: 35 additions & 11 deletions python/src/iceberg/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
# specific language governing permissions and limitations
# under the License.


class Type(object):
def __init__(self, type_string: str, repr_string: str, is_primitive=False):
self._type_string = type_string
Expand All @@ -34,7 +35,9 @@ def is_primitive(self) -> bool:

class FixedType(Type):
def __init__(self, length: int):
super().__init__(f"fixed[{length}]", f"FixedType(length={length})", is_primitive=True)
super().__init__(
f"fixed[{length}]", f"FixedType(length={length})", is_primitive=True
)
self._length = length

@property
Expand All @@ -44,8 +47,11 @@ def length(self) -> int:

class DecimalType(Type):
def __init__(self, precision: int, scale: int):
super().__init__(f"decimal({precision}, {scale})",
f"DecimalType(precision={precision}, scale={scale})", is_primitive=True)
super().__init__(
f"decimal({precision}, {scale})",
f"DecimalType(precision={precision}, scale={scale})",
is_primitive=True,
)
self._precision = precision
self._scale = scale

Expand All @@ -59,7 +65,14 @@ def scale(self) -> int:


class NestedField(object):
def __init__(self, is_optional: bool, field_id: int, name: str, field_type: Type, doc: str = None):
def __init__(
self,
is_optional: bool,
field_id: int,
name: str,
field_type: Type,
doc: str = None,
):
self._is_optional = is_optional
self._id = field_id
self._name = name
Expand Down Expand Up @@ -87,17 +100,26 @@ def type(self) -> Type:
return self._type

def __repr__(self):
return (f"NestedField(is_optional={self._is_optional}, field_id={self._id}, "
f"name={repr(self._name)}, field_type={repr(self._type)}, doc={repr(self._doc)})")
return (
f"NestedField(is_optional={self._is_optional}, field_id={self._id}, "
f"name={repr(self._name)}, field_type={repr(self._type)}, doc={repr(self._doc)})"
)

def __str__(self):
return (f"{self._id}: {self._name}: {'optional' if self._is_optional else 'required'} {self._type}"
"" if self._doc is None else f" ({self._doc})")
return (
f"{self._id}: {self._name}: {'optional' if self._is_optional else 'required'} {self._type}"
""
if self._doc is None
else f" ({self._doc})"
)


class StructType(Type):
def __init__(self, fields: list):
super().__init__(f"struct<{', '.join(map(str, fields))}>", f"StructType(fields={repr(fields)})")
super().__init__(
f"struct<{', '.join(map(str, fields))}>",
f"StructType(fields={repr(fields)})",
)
self._fields = fields

@property
Expand All @@ -117,8 +139,10 @@ def element(self) -> NestedField:

class MapType(Type):
def __init__(self, key: NestedField, value: NestedField):
super().__init__(f"map<{key.type}, {value.type}>",
f"MapType(key={repr(key)}, value={repr(value)})")
super().__init__(
f"map<{key.type}, {value.type}>",
f"MapType(key={repr(key)}, value={repr(value)})",
)
self._key_field = key
self._value_field = value

Expand Down
5 changes: 4 additions & 1 deletion python/src/iceberg/utils/bin_packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@
# specific language governing permissions and limitations
# under the License.


class PackingIterator:
def __init__(self, items, target_weight, lookback, weight_func, largest_bin_first=False):
def __init__(
self, items, target_weight, lookback, weight_func, largest_bin_first=False
):
self.items = iter(items)
self.target_weight = target_weight
self.lookback = lookback
Expand Down
120 changes: 97 additions & 23 deletions python/tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,47 @@
# specific language governing permissions and limitations
# under the License.

from iceberg.types import (BinaryType, BooleanType, DateType, DecimalType, DoubleType, FixedType,
FloatType, IntegerType, ListType, LongType, MapType, NestedField, StringType,
StructType, TimestampType, TimestamptzType, TimeType, UUIDType)
import pytest

from iceberg.types import (
BinaryType,
BooleanType,
DateType,
DecimalType,
DoubleType,
FixedType,
FloatType,
IntegerType,
ListType,
LongType,
MapType,
NestedField,
StringType,
StructType,
TimestampType,
TimestamptzType,
TimeType,
UUIDType,
)

@pytest.mark.parametrize("input_type",
[BooleanType, IntegerType, LongType, FloatType, DoubleType, DateType, TimeType,
TimestampType, TimestamptzType, StringType, UUIDType, BinaryType])

@pytest.mark.parametrize(
"input_type",
[
BooleanType,
IntegerType,
LongType,
FloatType,
DoubleType,
DateType,
TimeType,
TimestampType,
TimestamptzType,
StringType,
UUIDType,
BinaryType,
],
)
def test_repr_primitive_types(input_type):
assert input_type == eval(repr(input_type))

Expand All @@ -40,34 +72,58 @@ def test_decimal_type():
type_var = DecimalType(precision=9, scale=2)
assert type_var.precision == 9
assert type_var.scale == 2
assert str(type_var) == 'decimal(9, 2)'
assert str(type_var) == "decimal(9, 2)"
assert repr(type_var) == "DecimalType(precision=9, scale=2)"
assert str(type_var) == str(eval(repr(type_var)))


def test_struct_type():
type_var = StructType([NestedField(True, 1, "optional_field", IntegerType),
NestedField(False, 2, "required_field", FixedType(5)),
NestedField(False, 3, "required_field", StructType([
NestedField(True, 4, "optional_field", DecimalType(8, 2)),
NestedField(False, 5, "required_field", LongType)]))])
type_var = StructType(
[
NestedField(True, 1, "optional_field", IntegerType),
NestedField(False, 2, "required_field", FixedType(5)),
NestedField(
False,
3,
"required_field",
StructType(
[
NestedField(True, 4, "optional_field", DecimalType(8, 2)),
NestedField(False, 5, "required_field", LongType),
]
),
),
]
)
assert len(type_var.fields) == 3
assert str(type_var) == str(eval(repr(type_var)))


def test_list_type():
type_var = ListType(NestedField(False, 1, "required_field", StructType([
NestedField(True, 2, "optional_field", DecimalType(8, 2)),
NestedField(False, 3, "required_field", LongType)])))
type_var = ListType(
NestedField(
False,
1,
"required_field",
StructType(
[
NestedField(True, 2, "optional_field", DecimalType(8, 2)),
NestedField(False, 3, "required_field", LongType),
]
),
)
)
assert isinstance(type_var.element.type, StructType)
assert len(type_var.element.type.fields) == 2
assert type_var.element.field_id == 1
assert str(type_var) == str(eval(repr(type_var)))


def test_map_type():
type_var = MapType(NestedField(True, 1, "optional_field", DoubleType),
NestedField(False, 2, "required_field", UUIDType))
type_var = MapType(
NestedField(True, 1, "optional_field", DoubleType),
NestedField(False, 2, "required_field", UUIDType),
)
assert type_var.key.type == DoubleType
assert type_var.key.field_id == 1
assert type_var.value.type == UUIDType
Expand All @@ -76,12 +132,30 @@ def test_map_type():


def test_nested_field():
field_var = NestedField(True, 1, "optional_field1", StructType([
NestedField(True, 2, "optional_field2", ListType(
NestedField(False, 3, "required_field3", DoubleType))),
NestedField(False, 4, "required_field4", MapType(
NestedField(True, 5, "optional_field5", TimeType),
NestedField(False, 6, "required_field6", UUIDType)))]))
field_var = NestedField(
True,
1,
"optional_field1",
StructType(
[
NestedField(
True,
2,
"optional_field2",
ListType(NestedField(False, 3, "required_field3", DoubleType)),
),
NestedField(
False,
4,
"required_field4",
MapType(
NestedField(True, 5, "optional_field5", TimeType),
NestedField(False, 6, "required_field6", UUIDType),
),
),
]
),
)
assert field_var.is_optional
assert not field_var.is_required
assert field_var.field_id == 1
Expand Down
78 changes: 60 additions & 18 deletions python/tests/utils/test_bin_packing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,34 +17,76 @@

import random

from iceberg.utils.bin_packing import PackingIterator
import pytest

from iceberg.utils.bin_packing import PackingIterator


@pytest.mark.parametrize("splits, lookback, split_size, open_cost", [
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am usually hesitated to add any auto-formatting functionality to a project, because sometimes the formatting is quite unnecessary, such as here. But I don't know what's the convention on python side, maybe this is desirable. Any thoughts? @jun-he

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The consistency is nice because individual styles in python can vary wildly. The auto sorting of imports is also super helpful since it guarantees imports to be neatly organized as stdlib, third party, and package imports, each section separated by a single space. Since we recommend auto formatters on the java side and have style checks, has that been a net positive?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool, I am good with that then as long as it follows the general guideline of the language!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah black is definitely the standard in the python community these days. I'm also generally usually opposed but in this case, I think it makes sense.

Admittedly, the choices it makes are definitely kind of strange to me at times. But I do recognize that it's the standard formatter for the language and is generally used as a best practice.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👌 I feel it is good to have it here to help us to keep the code consistent. We can tune the settings to ensure it does the right thing.

([random.randint(0, 64) for x in range(200)], 20, 128, 4), # random splits
([], 20, 128, 4), # no splits
([0] * 100 + [random.randint(0, 64) in range(10)] + [0] * 100, 20, 128, 4) # sparse
])
@pytest.mark.parametrize(
"splits, lookback, split_size, open_cost",
[
([random.randint(0, 64) for x in range(200)], 20, 128, 4), # random splits
([], 20, 128, 4), # no splits
(
[0] * 100 + [random.randint(0, 64) in range(10)] + [0] * 100,
20,
128,
4,
), # sparse
],
)
def test_bin_packing(splits, lookback, split_size, open_cost):

def weight_func(x):
return max(x, open_cost)

item_list_sums = [sum(item)
for item in PackingIterator(splits, split_size, lookback, weight_func)]
item_list_sums = [
sum(item) for item in PackingIterator(splits, split_size, lookback, weight_func)
]
assert all([split_size >= item_sum >= 0 for item_sum in item_list_sums])


@pytest.mark.parametrize("splits, target_weight, lookback, largest_bin_first, expected_lists", [
([36, 36, 36, 36, 73, 110, 128], 128, 2, True, [[110], [128], [36, 73], [36, 36, 36]]),
([36, 36, 36, 36, 73, 110, 128], 128, 2, False, [[36, 36, 36], [36, 73], [110], [128]]),
([64, 64, 128, 32, 32, 32, 32], 128, 1, True, [[64, 64], [128], [32, 32, 32, 32]]),
([64, 64, 128, 32, 32, 32, 32], 128, 1, False, [[64, 64], [128], [32, 32, 32, 32]]),
])
def test_bin_packing_lookback(splits, target_weight, lookback, largest_bin_first, expected_lists):
@pytest.mark.parametrize(
"splits, target_weight, lookback, largest_bin_first, expected_lists",
[
(
[36, 36, 36, 36, 73, 110, 128],
128,
2,
True,
[[110], [128], [36, 73], [36, 36, 36]],
),
(
[36, 36, 36, 36, 73, 110, 128],
128,
2,
False,
[[36, 36, 36], [36, 73], [110], [128]],
),
(
[64, 64, 128, 32, 32, 32, 32],
128,
1,
True,
[[64, 64], [128], [32, 32, 32, 32]],
),
(
[64, 64, 128, 32, 32, 32, 32],
128,
1,
False,
[[64, 64], [128], [32, 32, 32, 32]],
),
],
)
def test_bin_packing_lookback(
splits, target_weight, lookback, largest_bin_first, expected_lists
):
def weight_func(x):
return x

assert [item for item in PackingIterator(
splits, target_weight, lookback, weight_func, largest_bin_first)] == expected_lists
assert [
item
for item in PackingIterator(
splits, target_weight, lookback, weight_func, largest_bin_first
)
] == expected_lists
Loading