Skip to content

Commit

Permalink
refactor(datatype): remove Category type and related APIs
Browse files Browse the repository at this point in the history
The category type doesn't map well to most databases, and is also a
level of abstraction below ibis. Whether something is stored using
dictionary encoding is unrelated to expression APIs.

BREAKING CHANGE: `Category`, `CategoryValue`, `CategoryColumn`, and `CategoryScalar` are removed. Use string types instead.
  • Loading branch information
cpcloud authored and kszucs committed Feb 6, 2023
1 parent b532c63 commit bb0ee78
Show file tree
Hide file tree
Showing 27 changed files with 95 additions and 195 deletions.
3 changes: 0 additions & 3 deletions ibis/backends/base/sql/alchemy/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,9 +166,6 @@ def _cast(t, op):

sa_arg = t.translate(arg)

if arg_dtype.is_category() and typ.is_int32():
return sa_arg

# specialize going from an integer type to a timestamp
if arg_dtype.is_integer() and typ.is_timestamp():
return t.integer_to_timestamp(sa_arg)
Expand Down
18 changes: 0 additions & 18 deletions ibis/backends/base/sql/compiler/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,24 +326,6 @@ def _bucket(op):
return result.op()


@rewrites(ops.CategoryLabel)
def _category_label(op):
# TODO(kszucs): avoid the expression roundtrip
expr = op.to_expr()
stmt = op.args[0].to_expr().case()
for i, label in enumerate(op.labels):
stmt = stmt.when(i, label)

if op.nulls is not None:
stmt = stmt.else_(op.nulls)

result = stmt.end()
if expr.has_name():
result = result.name(expr.get_name())

return result.op()


@rewrites(ops.Any)
def _any_expand(op):
return ops.Max(op.arg)
Expand Down
2 changes: 0 additions & 2 deletions ibis/backends/base/sql/registry/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,6 @@ def log(translator, op):
def cast(translator, op):
arg_formatted = translator.translate(op.arg)

if op.arg.output_dtype.is_category() and op.to.is_int32():
return arg_formatted
if op.arg.output_dtype.is_temporal() and op.to.is_int64():
return f'1000000 * unix_timestamp({arg_formatted})'
else:
Expand Down
11 changes: 0 additions & 11 deletions ibis/backends/dask/tests/execution/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,14 +158,3 @@ def test_cast_to_decimal(t, df, type):
1 <= len(element.as_tuple().digits) <= type.precision
for element in result.compute().values
)


@pytest.mark.parametrize(
'column',
['plain_int64', 'dup_strings', 'dup_ints', 'strings_with_nulls'],
)
def test_cast_to_category(t, df, column):
test = t[column].cast('category').compile()
tm.assert_series_equal(
test.compute(), df[column].astype('category').compute(), check_index=False
)
4 changes: 2 additions & 2 deletions ibis/backends/dask/tests/test_datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def test_infer_dtype(value, expected_dtype):
DatetimeTZDtype(tz='US/Eastern', unit='ns'),
dt.Timestamp('US/Eastern'),
),
(CategoricalDtype(), dt.Category()),
(CategoricalDtype(), dt.String()),
],
)
def test_dask_dtype(dask_dtype, ibis_dtype):
Expand Down Expand Up @@ -95,7 +95,7 @@ def test_series_to_ibis_literal(core_client):
"interval('ns')",
),
(['foo', 'bar', 'hello'], "string"),
(pd.Series(['a', 'b', 'c', 'a']).astype('category'), dt.Category()),
(pd.Series(['a', 'b', 'c', 'a']).astype('category'), dt.String()),
],
)
def test_schema_infer(col_data, schema_type):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
CASE
CAST(CASE
WHEN `f` < 10 THEN 0
WHEN 10 <= `f` THEN 1
ELSE CAST(NULL AS tinyint)
END
END AS int)
11 changes: 1 addition & 10 deletions ibis/backends/impala/tests/test_exprs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from ibis import literal as L
from ibis.backends.impala.compiler import ImpalaCompiler
from ibis.expr import api
from ibis.expr.datatypes import Category


def test_embedded_identifier_quoting(alltypes):
Expand Down Expand Up @@ -168,18 +167,10 @@ def _check_impala_output_types_match(con, table):
query = ImpalaCompiler.to_sql(table)
t = con.sql(query)

def _clean_type(x):
if isinstance(x, Category):
x = x.to_integer_type()
return x

left_schema, right_schema = t.schema(), table.schema()
for n, left_type, right_type in zip(
for n, left_ty, right_ty in zip(
left_schema.names, left_schema.types, right_schema.types
):
left_ty = _clean_type(left_type)
right_ty = _clean_type(right_type)

assert (
left_ty == right_ty
), f'Value for {n} had left type {left_ty} and right type {right_ty}\nquery:\n{query}'
Expand Down
4 changes: 1 addition & 3 deletions ibis/backends/pandas/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ def from_pandas_tzdtype(value):

@dt.dtype.register(CategoricalDtype)
def from_pandas_categorical(_):
return dt.Category()
return dt.String()


@dt.dtype.register(pd.core.dtypes.base.ExtensionDtype)
Expand Down Expand Up @@ -114,8 +114,6 @@ def ibis_dtype_to_pandas(ibis_dtype: dt.DataType):
return DatetimeTZDtype('ns', ibis_dtype.timezone)
elif ibis_dtype.is_interval():
return np.dtype(f'timedelta64[{ibis_dtype.unit}]')
elif ibis_dtype.is_category():
return CategoricalDtype()
else:
return _ibis_dtypes.get(type(ibis_dtype), np.dtype(np.object_))

Expand Down
1 change: 0 additions & 1 deletion ibis/backends/pandas/execution/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@
dt.string: str,
dt.timestamp: 'datetime64[ns]',
dt.boolean: np.bool_,
dt.category: 'category',
dt.json: str,
}

Expand Down
9 changes: 0 additions & 9 deletions ibis/backends/pandas/tests/execution/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,12 +176,3 @@ def test_cast_to_decimal(t, df, type):
1 <= len(element.as_tuple().digits) <= type.precision
for element in result.values
)


@pytest.mark.parametrize(
'column',
['plain_int64', 'dup_strings', 'dup_ints', 'strings_with_nulls'],
)
def test_cast_to_category(t, df, column):
test = t[column].cast('category').execute()
tm.assert_series_equal(test, df[column].astype('category'))
4 changes: 2 additions & 2 deletions ibis/backends/pandas/tests/test_datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def test_numpy_dtype_timedelta():
DatetimeTZDtype(tz='US/Eastern', unit='ns'),
dt.Timestamp('US/Eastern'),
),
(CategoricalDtype(), dt.Category()),
(CategoricalDtype(), dt.String()),
(pd.Series([], dtype="string").dtype, dt.String()),
],
)
Expand Down Expand Up @@ -171,7 +171,7 @@ def test_pandas_dtype(pandas_dtype, ibis_dtype):
"interval('ns')",
),
(['foo', 'bar', 'hello'], "string"),
(pd.Series(['a', 'b', 'c', 'a']).astype('category'), dt.Category()),
(pd.Series(['a', 'b', 'c', 'a']).astype('category'), dt.String()),
(pd.Series([b'1', b'2', b'3']), dt.string),
# mixed-integer
(pd.Series([1, 2, '3']), dt.binary),
Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/polars/datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,11 +61,6 @@ def from_ibis_struct(dtype):
return pl.Struct(fields)


@to_polars_type.register(dt.Category)
def from_ibis_category(_):
return pl.Categorical


@to_polars_type.register(dt.Array)
def from_ibis_array(dtype):
return pl.List(to_polars_type(dtype.value_type))
Expand Down
14 changes: 10 additions & 4 deletions ibis/backends/postgres/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -466,12 +466,16 @@ def test_ifelse(alltypes, df, op, pandas_op):
# tier and histogram
param(
lambda d: d.bucket([0, 10, 25, 50, 100]),
lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False),
lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False).astype(
"int8"
),
id='include_over_false',
),
param(
lambda d: d.bucket([0, 10, 25, 50], include_over=True),
lambda s: pd.cut(s, [0, 10, 25, 50, np.inf], right=False, labels=False),
lambda s: pd.cut(
s, [0, 10, 25, 50, np.inf], right=False, labels=False
).astype("int8"),
id='include_over_true',
),
param(
Expand All @@ -492,15 +496,17 @@ def test_ifelse(alltypes, df, op, pandas_op):
),
param(
lambda d: d.bucket([10, 25, 50, 100], include_under=True),
lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False),
lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False).astype(
"int8"
),
id='include_under_true',
),
],
)
def test_bucket(alltypes, df, func, pandas_func):
expr = func(alltypes.double_col)
result = expr.execute()
expected = pandas_func(df.double_col).astype('category')
expected = pandas_func(df.double_col)
tm.assert_series_equal(result, expected, check_names=False)


Expand Down
5 changes: 0 additions & 5 deletions ibis/backends/sqlite/registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,6 @@ def _value_to_temporal(arg, from_, to, **_):
raise com.UnsupportedOperationError(type(arg))


@sqlite_cast.register(object, dt.Category, dt.Int32)
def _category_to_int(arg, from_, to, **_):
return arg


@sqlite_cast.register(object, dt.DataType, dt.DataType)
def _default_cast_impl(arg, from_, to, translator=None):
assert translator is not None, "translator is None"
Expand Down
29 changes: 20 additions & 9 deletions ibis/backends/sqlite/tests/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas.testing as tm
import pytest
from packaging.version import parse
from pytest import param

import ibis
import ibis.expr.datatypes as dt
Expand Down Expand Up @@ -324,19 +325,26 @@ def test_ifelse(alltypes, df, func, expected_func):
('func', 'expected_func'),
[
# tier and histogram
(
param(
lambda d: d.bucket([0, 10, 25, 50, 100]),
lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False),
lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False).astype(
"int8"
),
id="default",
),
(
param(
lambda d: d.bucket([0, 10, 25, 50], include_over=True),
lambda s: pd.cut(s, [0, 10, 25, 50, np.inf], right=False, labels=False),
lambda s: pd.cut(
s, [0, 10, 25, 50, np.inf], right=False, labels=False
).astype("int8"),
id="include_over",
),
(
param(
lambda d: d.bucket([0, 10, 25, 50], close_extreme=False),
lambda s: pd.cut(s, [0, 10, 25, 50], right=False, labels=False),
id="no_close_extreme",
),
(
param(
lambda d: d.bucket([0, 10, 25, 50], closed='right', close_extreme=False),
lambda s: pd.cut(
s,
Expand All @@ -345,18 +353,21 @@ def test_ifelse(alltypes, df, func, expected_func):
right=True,
labels=False,
),
id="closed_right_no_close_extreme",
),
(
param(
lambda d: d.bucket([10, 25, 50, 100], include_under=True),
lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False),
lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False).astype(
"int8"
),
id="include_under",
),
],
)
def test_bucket(alltypes, df, func, expected_func):
expr = func(alltypes.double_col)
result = expr.execute()
expected = expected_func(df.double_col)
expected = pd.Series(pd.Categorical(expected))

tm.assert_series_equal(result, expected, check_names=False)

Expand Down
2 changes: 1 addition & 1 deletion ibis/expr/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1286,7 +1286,7 @@ def trailing_range_window(preceding, order_by, group_by=None):
coalesce = _deferred(ir.Value.coalesce)
greatest = _deferred(ir.Value.greatest)
least = _deferred(ir.Value.least)
category_label = _deferred(ir.CategoryValue.label)
category_label = _deferred(ir.IntegerColumn.label)

aggregate = ir.Table.aggregate
cross_join = ir.Table.cross_join
Expand Down
1 change: 0 additions & 1 deletion ibis/expr/datatypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,6 @@ def can_cast_subtype(source: dt.DataType, target: dt.DataType, **kwargs) -> bool
return isinstance(target, source.__class__)


@castable.register(dt.Integer, dt.Category)
@castable.register(dt.Integer, (dt.Floating, dt.Decimal))
@castable.register(dt.Floating, dt.Decimal)
@castable.register((dt.Date, dt.Timestamp), (dt.Date, dt.Timestamp))
Expand Down
28 changes: 0 additions & 28 deletions ibis/expr/datatypes/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,6 @@ def is_binary(self) -> bool:
def is_boolean(self) -> bool:
return isinstance(self, Boolean)

def is_category(self) -> bool:
return isinstance(self, Category)

def is_date(self) -> bool:
return isinstance(self, Date)

Expand Down Expand Up @@ -672,29 +669,6 @@ def _pretty_piece(self) -> str:
return f"({self.unit!r})"


@public
class Category(Parametric):
cardinality = optional(instance_of(int))

scalar = ir.CategoryScalar
column = ir.CategoryColumn

def __repr__(self):
if self.cardinality is not None:
cardinality = repr(self.cardinality)
else:
cardinality = "unknown"
return f"{self.name}(cardinality={cardinality})"

def to_integer_type(self):
from ibis.expr.datatypes.value import infer

if self.cardinality is None:
return int64
else:
return infer(self.cardinality)


@public
class Struct(Parametric, Mapping):
"""Structured values."""
Expand Down Expand Up @@ -926,7 +900,6 @@ class INET(String):
time = Time()
timestamp = Timestamp()
interval = Interval()
category = Category()
# geo spatial data type
geometry = GeoSpatial(geotype="geometry")
geography = GeoSpatial(geotype="geography")
Expand Down Expand Up @@ -1031,7 +1004,6 @@ def from_numpy_dtype(value):
timestamp=timestamp,
dtype=dtype,
interval=interval,
category=category,
geometry=geometry,
geography=geography,
point=point,
Expand Down
1 change: 0 additions & 1 deletion ibis/expr/datatypes/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,6 @@ def parser():
| spaceless_string("timestamp").result(dt.Timestamp())
| spaceless_string("time").result(dt.time)
| spaceless_string("date").result(dt.date)
| spaceless_string("category").result(dt.category)
| spaceless_string("geometry").result(dt.GeoSpatial(geotype='geometry'))
| spaceless_string("geography").result(dt.GeoSpatial(geotype='geography'))
| spaceless_string("null").result(dt.null)
Expand Down
1 change: 0 additions & 1 deletion ibis/expr/datatypes/value.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,6 @@ def _infer_object_array_dtype(x):
'mixed-integer-float': dt.float64,
'decimal': dt.float64,
'complex': dt.binary,
'categorical': dt.category,
'boolean': dt.boolean,
'datetime64': dt.timestamp,
'datetime': dt.timestamp,
Expand Down
Loading

0 comments on commit bb0ee78

Please sign in to comment.