refactor(datatype): remove Category type and related APIs

The category type doesn't map well to most databases, and is also a level of abstraction below ibis. Whether something is stored using dictionary encoding is unrelated to expression APIs. BREAKING CHANGE: `Category`, `CategoryValue`, `CategoryColumn`, and `CategoryScalar` are removed. Use string types instead.
ibis-project · Feb 6, 2023 · bb0ee78 · bb0ee78
1 parent b532c63
commit bb0ee78
Show file tree

Hide file tree

Showing 27 changed files with 95 additions and 195 deletions.
diff --git a/ibis/backends/base/sql/alchemy/registry.py b/ibis/backends/base/sql/alchemy/registry.py
@@ -166,9 +166,6 @@ def _cast(t, op):
 
     sa_arg = t.translate(arg)
 
-    if arg_dtype.is_category() and typ.is_int32():
-        return sa_arg
-
     # specialize going from an integer type to a timestamp
     if arg_dtype.is_integer() and typ.is_timestamp():
         return t.integer_to_timestamp(sa_arg)

diff --git a/ibis/backends/base/sql/compiler/translator.py b/ibis/backends/base/sql/compiler/translator.py
@@ -326,24 +326,6 @@ def _bucket(op):
     return result.op()
 
 
-@rewrites(ops.CategoryLabel)
-def _category_label(op):
-    # TODO(kszucs): avoid the expression roundtrip
-    expr = op.to_expr()
-    stmt = op.args[0].to_expr().case()
-    for i, label in enumerate(op.labels):
-        stmt = stmt.when(i, label)
-
-    if op.nulls is not None:
-        stmt = stmt.else_(op.nulls)
-
-    result = stmt.end()
-    if expr.has_name():
-        result = result.name(expr.get_name())
-
-    return result.op()
-
-
 @rewrites(ops.Any)
 def _any_expand(op):
     return ops.Max(op.arg)

diff --git a/ibis/backends/base/sql/registry/main.py b/ibis/backends/base/sql/registry/main.py
@@ -114,8 +114,6 @@ def log(translator, op):
 def cast(translator, op):
     arg_formatted = translator.translate(op.arg)
 
-    if op.arg.output_dtype.is_category() and op.to.is_int32():
-        return arg_formatted
     if op.arg.output_dtype.is_temporal() and op.to.is_int64():
         return f'1000000 * unix_timestamp({arg_formatted})'
     else:

diff --git a/ibis/backends/dask/tests/execution/test_cast.py b/ibis/backends/dask/tests/execution/test_cast.py
@@ -158,14 +158,3 @@ def test_cast_to_decimal(t, df, type):
         1 <= len(element.as_tuple().digits) <= type.precision
         for element in result.compute().values
     )
-
-
-@pytest.mark.parametrize(
-    'column',
-    ['plain_int64', 'dup_strings', 'dup_ints', 'strings_with_nulls'],
-)
-def test_cast_to_category(t, df, column):
-    test = t[column].cast('category').compile()
-    tm.assert_series_equal(
-        test.compute(), df[column].astype('category').compute(), check_index=False
-    )
diff --git a/ibis/backends/dask/tests/test_datatypes.py b/ibis/backends/dask/tests/test_datatypes.py
@@ -46,7 +46,7 @@ def test_infer_dtype(value, expected_dtype):
             DatetimeTZDtype(tz='US/Eastern', unit='ns'),
             dt.Timestamp('US/Eastern'),
         ),
-        (CategoricalDtype(), dt.Category()),
+        (CategoricalDtype(), dt.String()),
     ],
 )
 def test_dask_dtype(dask_dtype, ibis_dtype):
@@ -95,7 +95,7 @@ def test_series_to_ibis_literal(core_client):
             "interval('ns')",
         ),
         (['foo', 'bar', 'hello'], "string"),
-        (pd.Series(['a', 'b', 'c', 'a']).astype('category'), dt.Category()),
+        (pd.Series(['a', 'b', 'c', 'a']).astype('category'), dt.String()),
     ],
 )
 def test_schema_infer(col_data, schema_type):

diff --git a/...s/snapshots/test_bucket_histogram/test_bucket_to_case/include_over_include_under1/out.sql b/...s/snapshots/test_bucket_histogram/test_bucket_to_case/include_over_include_under1/out.sql
@@ -1,5 +1,5 @@
-CASE
+CAST(CASE
   WHEN `f` < 10 THEN 0
   WHEN 10 <= `f` THEN 1
   ELSE CAST(NULL AS tinyint)
-END
+END AS int)
diff --git a/ibis/backends/impala/tests/test_exprs.py b/ibis/backends/impala/tests/test_exprs.py
@@ -9,7 +9,6 @@
 from ibis import literal as L
 from ibis.backends.impala.compiler import ImpalaCompiler
 from ibis.expr import api
-from ibis.expr.datatypes import Category
 
 
 def test_embedded_identifier_quoting(alltypes):
@@ -168,18 +167,10 @@ def _check_impala_output_types_match(con, table):
     query = ImpalaCompiler.to_sql(table)
     t = con.sql(query)
 
-    def _clean_type(x):
-        if isinstance(x, Category):
-            x = x.to_integer_type()
-        return x
-
     left_schema, right_schema = t.schema(), table.schema()
-    for n, left_type, right_type in zip(
+    for n, left_ty, right_ty in zip(
         left_schema.names, left_schema.types, right_schema.types
     ):
-        left_ty = _clean_type(left_type)
-        right_ty = _clean_type(right_type)
-
         assert (
             left_ty == right_ty
         ), f'Value for {n} had left type {left_ty} and right type {right_ty}\nquery:\n{query}'

diff --git a/ibis/backends/pandas/client.py b/ibis/backends/pandas/client.py
@@ -58,7 +58,7 @@ def from_pandas_tzdtype(value):
 
 @dt.dtype.register(CategoricalDtype)
 def from_pandas_categorical(_):
-    return dt.Category()
+    return dt.String()
 
 
 @dt.dtype.register(pd.core.dtypes.base.ExtensionDtype)
@@ -114,8 +114,6 @@ def ibis_dtype_to_pandas(ibis_dtype: dt.DataType):
         return DatetimeTZDtype('ns', ibis_dtype.timezone)
     elif ibis_dtype.is_interval():
         return np.dtype(f'timedelta64[{ibis_dtype.unit}]')
-    elif ibis_dtype.is_category():
-        return CategoricalDtype()
     else:
         return _ibis_dtypes.get(type(ibis_dtype), np.dtype(np.object_))
 

diff --git a/ibis/backends/pandas/execution/constants.py b/ibis/backends/pandas/execution/constants.py
@@ -41,7 +41,6 @@
     dt.string: str,
     dt.timestamp: 'datetime64[ns]',
     dt.boolean: np.bool_,
-    dt.category: 'category',
     dt.json: str,
 }
 

diff --git a/ibis/backends/pandas/tests/execution/test_cast.py b/ibis/backends/pandas/tests/execution/test_cast.py
@@ -176,12 +176,3 @@ def test_cast_to_decimal(t, df, type):
         1 <= len(element.as_tuple().digits) <= type.precision
         for element in result.values
     )
-
-
-@pytest.mark.parametrize(
-    'column',
-    ['plain_int64', 'dup_strings', 'dup_ints', 'strings_with_nulls'],
-)
-def test_cast_to_category(t, df, column):
-    test = t[column].cast('category').execute()
-    tm.assert_series_equal(test, df[column].astype('category'))
diff --git a/ibis/backends/pandas/tests/test_datatypes.py b/ibis/backends/pandas/tests/test_datatypes.py
@@ -131,7 +131,7 @@ def test_numpy_dtype_timedelta():
             DatetimeTZDtype(tz='US/Eastern', unit='ns'),
             dt.Timestamp('US/Eastern'),
         ),
-        (CategoricalDtype(), dt.Category()),
+        (CategoricalDtype(), dt.String()),
         (pd.Series([], dtype="string").dtype, dt.String()),
     ],
 )
@@ -171,7 +171,7 @@ def test_pandas_dtype(pandas_dtype, ibis_dtype):
             "interval('ns')",
         ),
         (['foo', 'bar', 'hello'], "string"),
-        (pd.Series(['a', 'b', 'c', 'a']).astype('category'), dt.Category()),
+        (pd.Series(['a', 'b', 'c', 'a']).astype('category'), dt.String()),
         (pd.Series([b'1', b'2', b'3']), dt.string),
         # mixed-integer
         (pd.Series([1, 2, '3']), dt.binary),

diff --git a/ibis/backends/polars/datatypes.py b/ibis/backends/polars/datatypes.py
@@ -61,11 +61,6 @@ def from_ibis_struct(dtype):
     return pl.Struct(fields)
 
 
-@to_polars_type.register(dt.Category)
-def from_ibis_category(_):
-    return pl.Categorical
-
-
 @to_polars_type.register(dt.Array)
 def from_ibis_array(dtype):
     return pl.List(to_polars_type(dtype.value_type))

diff --git a/ibis/backends/postgres/tests/test_functions.py b/ibis/backends/postgres/tests/test_functions.py
@@ -466,12 +466,16 @@ def test_ifelse(alltypes, df, op, pandas_op):
         # tier and histogram
         param(
             lambda d: d.bucket([0, 10, 25, 50, 100]),
-            lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False),
+            lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False).astype(
+                "int8"
+            ),
             id='include_over_false',
         ),
         param(
             lambda d: d.bucket([0, 10, 25, 50], include_over=True),
-            lambda s: pd.cut(s, [0, 10, 25, 50, np.inf], right=False, labels=False),
+            lambda s: pd.cut(
+                s, [0, 10, 25, 50, np.inf], right=False, labels=False
+            ).astype("int8"),
             id='include_over_true',
         ),
         param(
@@ -492,15 +496,17 @@ def test_ifelse(alltypes, df, op, pandas_op):
         ),
         param(
             lambda d: d.bucket([10, 25, 50, 100], include_under=True),
-            lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False),
+            lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False).astype(
+                "int8"
+            ),
             id='include_under_true',
         ),
     ],
 )
 def test_bucket(alltypes, df, func, pandas_func):
     expr = func(alltypes.double_col)
     result = expr.execute()
-    expected = pandas_func(df.double_col).astype('category')
+    expected = pandas_func(df.double_col)
     tm.assert_series_equal(result, expected, check_names=False)
 
 

diff --git a/ibis/backends/sqlite/registry.py b/ibis/backends/sqlite/registry.py
@@ -55,11 +55,6 @@ def _value_to_temporal(arg, from_, to, **_):
     raise com.UnsupportedOperationError(type(arg))
 
 
-@sqlite_cast.register(object, dt.Category, dt.Int32)
-def _category_to_int(arg, from_, to, **_):
-    return arg
-
-
 @sqlite_cast.register(object, dt.DataType, dt.DataType)
 def _default_cast_impl(arg, from_, to, translator=None):
     assert translator is not None, "translator is None"

diff --git a/ibis/backends/sqlite/tests/test_functions.py b/ibis/backends/sqlite/tests/test_functions.py
@@ -8,6 +8,7 @@
 import pandas.testing as tm
 import pytest
 from packaging.version import parse
+from pytest import param
 
 import ibis
 import ibis.expr.datatypes as dt
@@ -324,19 +325,26 @@ def test_ifelse(alltypes, df, func, expected_func):
     ('func', 'expected_func'),
     [
         # tier and histogram
-        (
+        param(
             lambda d: d.bucket([0, 10, 25, 50, 100]),
-            lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False),
+            lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False).astype(
+                "int8"
+            ),
+            id="default",
         ),
-        (
+        param(
             lambda d: d.bucket([0, 10, 25, 50], include_over=True),
-            lambda s: pd.cut(s, [0, 10, 25, 50, np.inf], right=False, labels=False),
+            lambda s: pd.cut(
+                s, [0, 10, 25, 50, np.inf], right=False, labels=False
+            ).astype("int8"),
+            id="include_over",
         ),
-        (
+        param(
             lambda d: d.bucket([0, 10, 25, 50], close_extreme=False),
             lambda s: pd.cut(s, [0, 10, 25, 50], right=False, labels=False),
+            id="no_close_extreme",
         ),
-        (
+        param(
             lambda d: d.bucket([0, 10, 25, 50], closed='right', close_extreme=False),
             lambda s: pd.cut(
                 s,
@@ -345,18 +353,21 @@ def test_ifelse(alltypes, df, func, expected_func):
                 right=True,
                 labels=False,
             ),
+            id="closed_right_no_close_extreme",
         ),
-        (
+        param(
             lambda d: d.bucket([10, 25, 50, 100], include_under=True),
-            lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False),
+            lambda s: pd.cut(s, [0, 10, 25, 50, 100], right=False, labels=False).astype(
+                "int8"
+            ),
+            id="include_under",
         ),
     ],
 )
 def test_bucket(alltypes, df, func, expected_func):
     expr = func(alltypes.double_col)
     result = expr.execute()
     expected = expected_func(df.double_col)
-    expected = pd.Series(pd.Categorical(expected))
 
     tm.assert_series_equal(result, expected, check_names=False)
 

diff --git a/ibis/expr/api.py b/ibis/expr/api.py
@@ -1286,7 +1286,7 @@ def trailing_range_window(preceding, order_by, group_by=None):
 coalesce = _deferred(ir.Value.coalesce)
 greatest = _deferred(ir.Value.greatest)
 least = _deferred(ir.Value.least)
-category_label = _deferred(ir.CategoryValue.label)
+category_label = _deferred(ir.IntegerColumn.label)
 
 aggregate = ir.Table.aggregate
 cross_join = ir.Table.cross_join

diff --git a/ibis/expr/datatypes/cast.py b/ibis/expr/datatypes/cast.py
@@ -51,7 +51,6 @@ def can_cast_subtype(source: dt.DataType, target: dt.DataType, **kwargs) -> bool
     return isinstance(target, source.__class__)
 
 
-@castable.register(dt.Integer, dt.Category)
 @castable.register(dt.Integer, (dt.Floating, dt.Decimal))
 @castable.register(dt.Floating, dt.Decimal)
 @castable.register((dt.Date, dt.Timestamp), (dt.Date, dt.Timestamp))

diff --git a/ibis/expr/datatypes/core.py b/ibis/expr/datatypes/core.py
@@ -149,9 +149,6 @@ def is_binary(self) -> bool:
     def is_boolean(self) -> bool:
         return isinstance(self, Boolean)
 
-    def is_category(self) -> bool:
-        return isinstance(self, Category)
-
     def is_date(self) -> bool:
         return isinstance(self, Date)
 
@@ -672,29 +669,6 @@ def _pretty_piece(self) -> str:
         return f"({self.unit!r})"
 
 
-@public
-class Category(Parametric):
-    cardinality = optional(instance_of(int))
-
-    scalar = ir.CategoryScalar
-    column = ir.CategoryColumn
-
-    def __repr__(self):
-        if self.cardinality is not None:
-            cardinality = repr(self.cardinality)
-        else:
-            cardinality = "unknown"
-        return f"{self.name}(cardinality={cardinality})"
-
-    def to_integer_type(self):
-        from ibis.expr.datatypes.value import infer
-
-        if self.cardinality is None:
-            return int64
-        else:
-            return infer(self.cardinality)
-
-
 @public
 class Struct(Parametric, Mapping):
     """Structured values."""
@@ -926,7 +900,6 @@ class INET(String):
 time = Time()
 timestamp = Timestamp()
 interval = Interval()
-category = Category()
 # geo spatial data type
 geometry = GeoSpatial(geotype="geometry")
 geography = GeoSpatial(geotype="geography")
@@ -1031,7 +1004,6 @@ def from_numpy_dtype(value):
     timestamp=timestamp,
     dtype=dtype,
     interval=interval,
-    category=category,
     geometry=geometry,
     geography=geography,
     point=point,

diff --git a/ibis/expr/datatypes/parse.py b/ibis/expr/datatypes/parse.py
@@ -117,7 +117,6 @@ def parser():
         | spaceless_string("timestamp").result(dt.Timestamp())
         | spaceless_string("time").result(dt.time)
         | spaceless_string("date").result(dt.date)
-        | spaceless_string("category").result(dt.category)
         | spaceless_string("geometry").result(dt.GeoSpatial(geotype='geometry'))
         | spaceless_string("geography").result(dt.GeoSpatial(geotype='geography'))
         | spaceless_string("null").result(dt.null)

diff --git a/ibis/expr/datatypes/value.py b/ibis/expr/datatypes/value.py
@@ -199,7 +199,6 @@ def _infer_object_array_dtype(x):
             'mixed-integer-float': dt.float64,
             'decimal': dt.float64,
             'complex': dt.binary,
-            'categorical': dt.category,
             'boolean': dt.boolean,
             'datetime64': dt.timestamp,
             'datetime': dt.timestamp,