Skip to content

Commit 1f552bf

Browse files
ueshin authored and HyukjinKwon committed
Handle TimestampType separately when converting to pandas' dtype. (#798)
When the initialization of pandas in pyarrow is not done yet, it can't convert `pa.TimestampType` to pandas' dtype. In that case, the following example raises an error: ```py from datetime import datetime import databricks.koalas as ks kdf = ks.DataFrame({'t': [datetime(2019, 1, 1, 0, 0, 0), datetime(2019, 1, 2, 0, 0, 0), datetime(2019, 1, 3, 0, 0, 0)]}) kdf[kdf['t'] != kdf['t']] ``` ```py >>> kdf[kdf['t'] != kdf['t']] Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/Users/ueshin/workspace/databricks-koalas/master/databricks/koalas/frame.py", line 6646, in __repr__ pdf = self.head(max_display_count + 1)._to_internal_pandas() File "/Users/ueshin/workspace/databricks-koalas/master/databricks/koalas/frame.py", line 6639, in _to_internal_pandas return self._internal.pandas_df File "/Users/ueshin/workspace/databricks-koalas/master/databricks/koalas/utils.py", line 338, in _lazy_property setattr(self, attr_name, fn(self)) File "/Users/ueshin/workspace/databricks-koalas/master/databricks/koalas/internal.py", line 638, in pandas_df for field in sdf.schema}) File "/Users/ueshin/workspace/databricks-koalas/master/databricks/koalas/internal.py", line 638, in <dictcomp> for field in sdf.schema}) File "pyarrow/types.pxi", line 404, in pyarrow.lib.TimestampType.to_pandas_dtype File "/Users/ueshin/workspace/databricks-koalas/miniconda/envs/databricks-koalas_3.6/lib/python3.6/site-packages/pyarrow/pandas_compat.py", line 625, in make_datetimetz return _pandas_api.datetimetz_type('ns', tz=tz) TypeError: 'NoneType' object is not callable ``` We know the dtype should be `np.dtype('datetime64[ns]')`, so we don't need to rely on pyarrow's implementation. ```py >>> kdf[kdf['t'] != kdf['t']] Empty DataFrame Columns: [t] Index: [] ``` Resolves #772.
1 parent 52685f7 commit 1f552bf

File tree

4 files changed

+22
-9
lines changed

4 files changed

+22
-9
lines changed

databricks/koalas/base.py

+3-7
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,11 @@
2626
from pandas.api.types import is_list_like
2727
from pyspark import sql as spark
2828
from pyspark.sql import functions as F, Window
29-
from pyspark.sql.types import DoubleType, FloatType, LongType, StringType, TimestampType, \
30-
to_arrow_type
29+
from pyspark.sql.types import DoubleType, FloatType, LongType, StringType, TimestampType
3130

3231
from databricks import koalas as ks # For running doctests and reference resolution in PyCharm.
3332
from databricks.koalas.internal import _InternalFrame
34-
from databricks.koalas.typedef import pandas_wraps
33+
from databricks.koalas.typedef import pandas_wraps, spark_type_to_pandas_dtype
3534
from databricks.koalas.utils import align_diff_series, scol_for
3635

3736

@@ -219,10 +218,7 @@ def dtype(self):
219218
>>> s.rename("a").to_frame().set_index("a").index.dtype
220219
dtype('<M8[ns]')
221220
"""
222-
if type(self.spark_type) == TimestampType:
223-
return np.dtype('datetime64[ns]')
224-
else:
225-
return np.dtype(to_arrow_type(self.spark_type).to_pandas_dtype())
221+
return spark_type_to_pandas_dtype(self.spark_type)
226222

227223
@property
228224
def empty(self):

databricks/koalas/internal.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232

3333
from databricks import koalas as ks # For running doctests and reference resolution in PyCharm.
3434
from databricks.koalas.config import get_option
35-
from databricks.koalas.typedef import infer_pd_series_spark_type
35+
from databricks.koalas.typedef import infer_pd_series_spark_type, spark_type_to_pandas_dtype
3636
from databricks.koalas.utils import column_index_level, default_session, lazy_property, scol_for
3737

3838

@@ -634,7 +634,7 @@ def pandas_df(self):
634634
sdf = self.spark_internal_df
635635
pdf = sdf.toPandas()
636636
if len(pdf) == 0 and len(sdf.schema) > 0:
637-
pdf = pdf.astype({field.name: to_arrow_type(field.dataType).to_pandas_dtype()
637+
pdf = pdf.astype({field.name: spark_type_to_pandas_dtype(field.dataType)
638638
for field in sdf.schema})
639639

640640
index_columns = self.index_columns

databricks/koalas/tests/test_dataframe.py

+9
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
# limitations under the License.
1515
#
1616

17+
from datetime import date, datetime
1718
import inspect
1819

1920
import numpy as np
@@ -1642,3 +1643,11 @@ def test_transform(self):
16421643

16431644
with self.assertRaisesRegex(AssertionError, "the first argument should be a callable"):
16441645
kdf.transform(1)
1646+
1647+
def test_empty_timestamp(self):
    """Regression test for #772: an empty timestamp column must keep its
    datetime64[ns] dtype instead of failing dtype conversion."""
    timestamps = [datetime(2019, 1, day) for day in (1, 2, 3)]
    pdf = pd.DataFrame({'t': timestamps})
    kdf = ks.from_pandas(pdf)
    # 't != t' is always False, producing an empty frame whose dtypes
    # must still round-trip through the Spark schema.
    empty_pdf = pdf[pdf['t'] != pdf['t']]
    empty_kdf = kdf[kdf['t'] != kdf['t']]
    self.assert_eq(empty_kdf, empty_pdf)
    self.assert_eq(empty_kdf.dtypes, empty_pdf.dtypes)

databricks/koalas/typedef.py

+8
Original file line numberDiff line numberDiff line change
@@ -145,6 +145,14 @@ def as_spark_type(tpe) -> types.DataType:
145145
return _known_types.get(tpe, None)
146146

147147

148+
def spark_type_to_pandas_dtype(spark_type):
    """Return the pandas dtype corresponding to the given Spark DataType.

    TimestampType is handled explicitly rather than delegated to pyarrow:
    ``pa.TimestampType.to_pandas_dtype`` raises ``TypeError`` when pyarrow's
    pandas shim has not been initialized yet, and the result for a
    (timezone-naive) Spark timestamp is always ``datetime64[ns]`` anyway.

    :param spark_type: a ``pyspark.sql.types.DataType`` instance.
    :return: the equivalent ``numpy.dtype``.
    """
    if isinstance(spark_type, types.TimestampType):
        return np.dtype('datetime64[ns]')
    else:
        # Fall back to pyarrow's mapping for all other Spark types.
        return np.dtype(types.to_arrow_type(spark_type).to_pandas_dtype())
154+
155+
148156
def as_python_type(spark_tpe):
149157
return _py_conversions.get(spark_tpe, None)
150158

0 commit comments

Comments
 (0)