Commit 0d2cef0

Support ExtensionDtypes as type arguments. (#2106)
Support `ExtensionDtype`s as type arguments by reusing `NameTypeHolder` for DataFrame's type annotation. Also support inferring the Spark DataType from a return type annotation that uses `ExtensionDtype`s.

Before:

```py
>>> ks.Series[pd.Int32Dtype()]
Traceback (most recent call last):
...
TypeError: Parameters to generic types must be types. Got Int32Dtype().
```

After:

```py
>>> ks.Series[pd.Int32Dtype()]
databricks.koalas.typedef.typehints.SeriesType[databricks.koalas.series.NameType]
>>> def a() -> ks.Series[pd.Int32Dtype()]:
...     pass
...
>>> infer_return_type(a)
SeriesType[IntegerType]
```
1 parent a595bfb commit 0d2cef0
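
For orientation, here is a minimal, self-contained sketch (not part of the commit) that exercises the new behavior end to end; it assumes a working Koalas installation and uses only the public names shown in the examples above:

```py
# Sketch only: exercises the new ExtensionDtype support end to end.
import pandas as pd
import databricks.koalas as ks
from databricks.koalas.typedef import infer_return_type

# A nullable integer extension dtype is now a legal type argument ...
def to_nullable_int() -> ks.Series[pd.Int32Dtype()]:
    pass

# ... and the return annotation maps to the corresponding Spark type.
print(infer_return_type(to_nullable_int))  # SeriesType[IntegerType]
```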

File tree

4 files changed (+128 −30 lines changed)

databricks/koalas/frame.py (+17 −2)

```diff
@@ -48,6 +48,7 @@
 import numpy as np
 import pandas as pd
 from pandas.api.types import is_list_like, is_dict_like, is_scalar
+from pandas.api.extensions import ExtensionDtype
 
 if TYPE_CHECKING:
     from pandas.io.formats.style import Styler
@@ -333,6 +334,12 @@
 
 
 def _create_tuple_for_frame_type(params):
+    """
+    This is a workaround to support variadic generic in DataFrame.
+
+    See https://github.com/python/typing/issues/193
+    We always wrap the given type hints in a tuple to mimic the variadic generic.
+    """
     from databricks.koalas.typedef import NameTypeHolder
 
     if isinstance(params, zip):
@@ -365,8 +372,16 @@ def _create_tuple_for_frame_type(params):
 
     if not isinstance(params, Iterable):
         params = [params]
-    params = [param.type if isinstance(param, np.dtype) else param for param in params]
-    return Tuple[tuple(params)]
+
+    new_params = []
+    for param in params:
+        if isinstance(param, ExtensionDtype):
+            new_class = type("NameType", (NameTypeHolder,), {})
+            new_class.tpe = param
+            new_params.append(new_class)
+        else:
+            new_params.append(param.type if isinstance(param, np.dtype) else param)
+    return Tuple[tuple(new_params)]
 
 
 if (3, 5) <= sys.version_info < (3, 7):
```
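
The heart of this change is the dynamic subclassing trick: `typing` only accepts classes as type arguments, so a dtype *instance* like `pd.Int32Dtype()` cannot be passed to `Tuple[...]` directly. A standalone sketch of the pattern (the `Holder` base and `wrap` helper are illustrative stand-ins, not Koalas API):

```py
from typing import Tuple

import pandas as pd

class Holder:
    """Illustrative stand-in for Koalas' NameTypeHolder."""
    tpe = None

def wrap(dtype):
    # Mint a fresh class per parameter and let the dtype instance
    # ride along as a class attribute, which typing never inspects.
    carrier = type("NameType", (Holder,), {})
    carrier.tpe = dtype
    return carrier

params = [wrap(pd.Int32Dtype()), wrap(pd.CategoricalDtype(["a", "b"]))]
hint = Tuple[tuple(params)]          # accepted: every parameter is a class
print(hint.__args__[0].tpe)          # Int32Dtype() is recoverable later
```

Minting a fresh class per parameter matters: reusing a single class would make every column share the same `tpe` attribute.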

databricks/koalas/series.py (+35 −3)

```diff
@@ -31,6 +31,7 @@
 from pandas.core.accessor import CachedAccessor
 from pandas.io.formats.printing import pprint_thing
 from pandas.api.types import is_list_like, is_hashable
+from pandas.api.extensions import ExtensionDtype
 import pyspark
 from pyspark import sql as spark
 from pyspark.sql import functions as F, Column
@@ -86,9 +87,9 @@
 from databricks.koalas.typedef import (
     infer_return_type,
     spark_type_to_pandas_dtype,
-    SeriesType,
     ScalarType,
     Scalar,
+    SeriesType,
 )
 
 
@@ -320,6 +321,32 @@
 str_type = str
 
 
+def _create_type_for_series_type(param):
+    from databricks.koalas.typedef import NameTypeHolder
+
+    if isinstance(param, ExtensionDtype):
+        new_class = type("NameType", (NameTypeHolder,), {})
+        new_class.tpe = param
+    else:
+        new_class = param.type if isinstance(param, np.dtype) else param
+
+    return SeriesType[new_class]
+
+
+if (3, 5) <= sys.version_info < (3, 7):
+    from typing import GenericMeta  # type: ignore
+
+    old_getitem = GenericMeta.__getitem__  # type: ignore
+
+    def new_getitem(self, params):
+        if hasattr(self, "is_series"):
+            return old_getitem(self, _create_type_for_series_type(params))
+        else:
+            return old_getitem(self, params)
+
+    GenericMeta.__getitem__ = new_getitem  # type: ignore
+
+
 class Series(Frame, IndexOpsMixin, Generic[T]):
     """
     Koalas Series that corresponds to pandas Series logically. This holds Spark Column
@@ -5978,8 +6005,13 @@ def __iter__(self):
 
     if sys.version_info >= (3, 7):
         # In order to support the type hints such as Series[...]. See DataFrame.__class_getitem__.
-        def __class_getitem__(cls, tpe):
-            return SeriesType[tpe]
+        def __class_getitem__(cls, params):
+            return _create_type_for_series_type(params)
+
+    elif (3, 5) <= sys.version_info < (3, 7):
+        # The implementation is in its metaclass so this flag is needed to distinguish
+        # Koalas Series.
+        is_series = None
 
 
 def unpack_scalar(sdf):
```
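
The split between `__class_getitem__` and the `GenericMeta` patch exists because PEP 560 only took effect in Python 3.7. A rough sketch of the 3.7+ path, using illustrative names rather than Koalas code:

```py
import pandas as pd

class FakeSeriesType:
    """Illustrative stand-in for Koalas' SeriesType wrapper."""
    def __init__(self, tpe):
        self.tpe = tpe
    def __repr__(self):
        return "SeriesType[%r]" % (self.tpe,)

class FakeSeries:
    # PEP 560 (Python 3.7+): subscription reaches this hook before any
    # typing-level validation, so dtype instances arrive untouched.
    def __class_getitem__(cls, params):
        return FakeSeriesType(params)

print(FakeSeries[pd.Int32Dtype()])   # SeriesType[Int32Dtype()]
```

On 3.5/3.6 there is no such hook, which is why the diff patches `typing.GenericMeta.__getitem__` globally and uses the `is_series` marker attribute to restrict the interception to Koalas' own `Series`.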

databricks/koalas/tests/test_typedef.py (+40)

```diff
@@ -21,6 +21,7 @@
 
 import pandas
 import pandas as pd
+from pandas.api.types import CategoricalDtype
 import numpy as np
 from pyspark.sql.types import (
     ArrayType,
@@ -103,6 +104,19 @@ def func() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
         expected = StructType([StructField("c0", LongType()), StructField("c1", LongType())])
         self.assertEqual(infer_return_type(func).tpe, expected)
 
+        pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical(["a", "b", "c"])})
+
+        def func() -> pd.Series[pdf.b.dtype]:  # type: ignore
+            pass
+
+        self.assertEqual(infer_return_type(func).tpe, LongType())
+
+        def func() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
+            pass
+
+        expected = StructType([StructField("c0", LongType()), StructField("c1", LongType())])
+        self.assertEqual(infer_return_type(func).tpe, expected)
+
     def test_if_pandas_implements_class_getitem(self):
         # the current type hint implementation of pandas DataFrame assumes pandas doesn't
         # implement '__class_getitem__'. This test case is to make sure pandas
@@ -145,6 +159,14 @@ def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
         )
         self.assertEqual(infer_return_type(func).tpe, expected)
 
+        pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical(["a", "b", "c"])})
+
+        def func() -> pd.DataFrame[zip(pdf.columns, pdf.dtypes)]:
+            pass
+
+        expected = StructType([StructField("a", LongType()), StructField("b", LongType())])
+        self.assertEqual(infer_return_type(func).tpe, expected)
+
     @unittest.skipIf(
         sys.version_info < (3, 7),
         "Type inference from pandas instances is supported with Python 3.7+",
@@ -188,6 +210,14 @@ def f() -> pd.DataFrame[pdf.dtypes]:  # type: ignore
 
         self.assertRaisesRegex(TypeError, "object.*not understood", try_infer_return_type)
 
+        def try_infer_return_type():
+            def f() -> pd.Series[pdf.a.dtype]:  # type: ignore
+                pass
+
+            infer_return_type(f)
+
+        self.assertRaisesRegex(TypeError, "object.*not understood", try_infer_return_type)
+
     def test_infer_schema_with_names_negative(self):
         def try_infer_return_type():
             def f() -> 'ks.DataFrame["a" : np.float : 1, "b":str:2]':  # noqa: F821
@@ -227,6 +257,14 @@ def f() -> ks.DataFrame[pdf.dtypes]:  # type: ignore
 
         self.assertRaisesRegex(TypeError, "object.*not understood", try_infer_return_type)
 
+        def try_infer_return_type():
+            def f() -> ks.Series[pdf.a.dtype]:  # type: ignore
+                pass
+
+            infer_return_type(f)
+
+        self.assertRaisesRegex(TypeError, "object.*not understood", try_infer_return_type)
+
     def test_as_spark_type(self):
         type_mapper = {
             # binary
@@ -286,6 +324,8 @@ def test_as_spark_type(self):
             List[np.unicode_]: ArrayType(StringType()),
             List[datetime.datetime]: ArrayType(TimestampType()),
             List[np.datetime64]: ArrayType(TimestampType()),
+            # CategoricalDtype
+            CategoricalDtype(categories=["a", "b", "c"]): LongType(),
         }
 
         for numpy_or_python_type, spark_type in type_mapper.items():
```

databricks/koalas/typedef/typehints.py (+36 −25)

```diff
@@ -20,7 +20,7 @@
 import typing
 import datetime
 import decimal
-from inspect import getfullargspec, isclass
+from inspect import getfullargspec
 
 import numpy as np
 import pandas as pd
@@ -85,18 +85,15 @@ def __repr__(self):
 
 
 class DataFrameType(object):
-    def __init__(self, tpe, names=None):
+    def __init__(self, tpe, names):
         from databricks.koalas.utils import name_like_string
 
-        if names is None:
-            # Default names `c0, c1, ... cn`.
-            self.tpe = types.StructType(
-                [types.StructField("c%s" % i, tpe[i]) for i in range(len(tpe))]
-            )  # type: types.StructType
-        else:
-            self.tpe = types.StructType(
-                [types.StructField(name_like_string(n), t) for n, t in zip(names, tpe)]
-            )  # type: types.StructType
+        self.tpe = types.StructType(
+            [
+                types.StructField(name_like_string(n) if n is not None else ("c%s" % i), t)
+                for i, (n, t) in enumerate(zip(names, tpe))
+            ]
+        )  # type: types.StructType
 
     def __repr__(self):
         return "DataFrameType[{}]".format(self.tpe)
@@ -346,6 +343,22 @@ def infer_return_type(f) -> typing.Union[SeriesType, DataFrameType, ScalarType,
     ...     pass
     >>> infer_return_type(func).tpe
     StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))
+
+    >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
+    >>> def func() -> ks.DataFrame[pdf.dtypes]:
+    ...     pass
+    >>> infer_return_type(func).tpe
+    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))
+
+    >>> def func() -> ks.DataFrame[zip(pdf.columns, pdf.dtypes)]:
+    ...     pass
+    >>> infer_return_type(func).tpe
+    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))
+
+    >>> def func() -> ks.Series[pdf.b.dtype]:
+    ...     pass
+    >>> infer_return_type(func).tpe
+    LongType
     """
     # We should re-import to make sure the class 'SeriesType' is not treated as a class
     # within this module locally. See Series.__class_getitem__ which imports this class
@@ -357,17 +370,19 @@ def infer_return_type(f) -> typing.Union[SeriesType, DataFrameType, ScalarType,
     if isinstance(tpe, str):
         # This type hint can happen when given hints are string to avoid forward reference.
         tpe = resolve_string_type_hint(tpe)
+
     if hasattr(tpe, "__origin__") and (
-        issubclass(tpe.__origin__, SeriesType) or tpe.__origin__ == ks.Series
+        tpe.__origin__ == ks.DataFrame or tpe.__origin__ == ks.Series
     ):
-        # TODO: remove "tpe.__origin__ == ks.Series" when we drop Python 3.5 and 3.6.
-        inner = as_spark_type(tpe.__args__[0])
-        return SeriesType(inner)
+        # When Python version is lower than 3.7, unwrap it to a Tuple/SeriesType type hint.
+        tpe = tpe.__args__[0]
 
-    if hasattr(tpe, "__origin__") and tpe.__origin__ == ks.DataFrame:
-        # When Python version is lower then 3.7. Unwrap it to a Tuple type
-        # hints.
+    if hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, SeriesType):
         tpe = tpe.__args__[0]
+        if issubclass(tpe, NameTypeHolder):
+            tpe = tpe.tpe
+        inner = as_spark_type(tpe)
+        return SeriesType(inner)
 
     # Note that, DataFrame type hints will create a Tuple.
     # Python 3.6 has `__name__`. Python 3.7 and 3.8 have `_name`.
@@ -381,13 +396,9 @@ def infer_return_type(f) -> typing.Union[SeriesType, DataFrameType, ScalarType,
             parameters = getattr(tuple_type, "__tuple_params__")
         else:
             parameters = getattr(tuple_type, "__args__")
-        if len(parameters) > 0 and all(
-            isclass(p) and issubclass(p, NameTypeHolder) for p in parameters
-        ):
-            names = [p.name for p in parameters if issubclass(p, NameTypeHolder)]
-            types = [p.tpe for p in parameters if issubclass(p, NameTypeHolder)]
-            return DataFrameType([as_spark_type(t) for t in types], names)
-        return DataFrameType([as_spark_type(t) for t in parameters])
+        names = [p.name if issubclass(p, NameTypeHolder) else None for p in parameters]
+        types = [p.tpe if issubclass(p, NameTypeHolder) else p for p in parameters]
+        return DataFrameType([as_spark_type(t) for t in types], names)
     inner = as_spark_type(tpe)
     if inner is None:
         return UnknownType(tpe)
```
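
The rewritten Tuple branch treats named and anonymous parameters uniformly instead of the old all-or-nothing check. A distilled, standalone sketch (`Holder` and `make_holder` are illustrative stand-ins) of how names and types are pulled apart, and how missing names fall back to positional defaults inside `DataFrameType`:

```py
class Holder:
    """Illustrative stand-in for NameTypeHolder."""
    name = None
    tpe = None

def make_holder(name, tpe):
    h = type("NameType", (Holder,), {})
    h.name, h.tpe = name, tpe
    return h

# Mixed parameters: one named holder, one plain type.
parameters = [make_holder("a", int), float]

names = [p.name if issubclass(p, Holder) else None for p in parameters]
types = [p.tpe if issubclass(p, Holder) else p for p in parameters]

# DataFrameType now applies the "c0, c1, ..." default only per missing name.
fields = [(n if n is not None else "c%s" % i, t)
          for i, (n, t) in enumerate(zip(names, types))]
print(fields)  # [('a', <class 'int'>), ('c1', <class 'float'>)]
```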
