Skip to content

Commit

Permalink
Adjust data when all the values in a column are null. (#2004)
Browse files Browse the repository at this point in the history
For Spark < 3.0, when all the values in a column are null, every value is converted to `None` in the resulting pandas DataFrame, regardless of the column's data type.

```py
>>> pdf = pd.DataFrame(
...             {
...                 "a": [None, None, None, "a"],
...                 "b": [None, None, None, 1],
...                 "c": [None, None, None] + list(np.arange(1, 2).astype("i1")),
...                 "d": [None, None, None, 1.0],
...                 "e": [None, None, None, True],
...                 "f": [None, None, None] + list(pd.date_range("20130101", periods=1)),
...             },
...         )
>>>
>>> kdf = ks.from_pandas(pdf)
>>> kdf.iloc[:-1]
      a     b     c     d     e     f
0  None  None  None  None  None  None
1  None  None  None  None  None  None
2  None  None  None  None  None  None
```

whereas for pandas:

```py
>>> pdf.iloc[:-1]
      a   b   c   d     e   f
0  None NaN NaN NaN  None NaT
1  None NaN NaN NaN  None NaT
2  None NaN NaN NaN  None NaT
```

With Spark >= 3.0, the conversion already matches pandas:

```py
>>> kdf.iloc[:-1]
      a   b   c   d     e   f
0  None NaN NaN NaN  None NaT
1  None NaN NaN NaN  None NaT
2  None NaN NaN NaN  None NaT
```
  • Loading branch information
ueshin authored Jan 13, 2021
1 parent 3ce2d87 commit 3cde582
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 9 deletions.
15 changes: 14 additions & 1 deletion databricks/koalas/internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"""
An internal immutable DataFrame with some metadata to manage indexes.
"""
from distutils.version import LooseVersion
import re
from typing import List, Optional, Tuple, Union, TYPE_CHECKING
from itertools import accumulate
Expand All @@ -25,11 +26,12 @@
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
import pyspark
from pyspark import sql as spark
from pyspark._globals import _NoValue, _NoValueType
from pyspark.sql import functions as F, Window
from pyspark.sql.functions import PandasUDFType, pandas_udf
from pyspark.sql.types import BooleanType, DataType, StructField, StructType, LongType
from pyspark.sql.types import BooleanType, DataType, IntegralType, StructField, StructType, LongType

try:
from pyspark.sql.types import to_arrow_type
Expand Down Expand Up @@ -818,6 +820,17 @@ def to_pandas_frame(self) -> pd.DataFrame:
pdf = pdf.astype(
{field.name: spark_type_to_pandas_dtype(field.dataType) for field in sdf.schema}
)
elif LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
for field in sdf.schema:
if field.nullable and pdf[field.name].isnull().all():
if isinstance(field.dataType, BooleanType):
pdf[field.name] = pdf[field.name].astype(np.object)
elif isinstance(field.dataType, IntegralType):
pdf[field.name] = pdf[field.name].astype(np.float64)
else:
pdf[field.name] = pdf[field.name].astype(
spark_type_to_pandas_dtype(field.dataType)
)

column_names = []
for i, (label, spark_column, column_name) in enumerate(
Expand Down
31 changes: 23 additions & 8 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,13 +410,28 @@ def test_empty_dataframe(self):
self.assertRaises(ValueError, lambda: ks.from_pandas(pdf))

def test_all_null_dataframe(self):
pdf = pd.DataFrame(
{
"a": [None, None, None, "a"],
"b": [None, None, None, 1],
"c": [None, None, None] + list(np.arange(1, 2).astype("i1")),
"d": [None, None, None, 1.0],
"e": [None, None, None, True],
"f": [None, None, None] + list(pd.date_range("20130101", periods=1)),
},
)
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.iloc[:-1], pdf.iloc[:-1])

with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
self.assert_eq(kdf.iloc[:-1], pdf.iloc[:-1])

pdf = pd.DataFrame(
{
"a": pd.Series([None, None, None], dtype="float64"),
"b": pd.Series([None, None, None], dtype="str"),
},
index=np.random.rand(3),
)

self.assertRaises(ValueError, lambda: ks.from_pandas(pdf))
Expand All @@ -427,14 +442,14 @@ def test_all_null_dataframe(self):
def test_nullable_object(self):
pdf = pd.DataFrame(
{
"a": list("abc") + [np.nan],
"b": list(range(1, 4)) + [np.nan],
"c": list(np.arange(3, 6).astype("i1")) + [np.nan],
"d": list(np.arange(4.0, 7.0, dtype="float64")) + [np.nan],
"e": [True, False, True, np.nan],
"f": list(pd.date_range("20130101", periods=3)) + [np.nan],
"a": list("abc") + [np.nan, None],
"b": list(range(1, 4)) + [np.nan, None],
"c": list(np.arange(3, 6).astype("i1")) + [np.nan, None],
"d": list(np.arange(4.0, 7.0, dtype="float64")) + [np.nan, None],
"e": [True, False, True, np.nan, None],
"f": list(pd.date_range("20130101", periods=3)) + [np.nan, None],
},
index=np.random.rand(4),
index=np.random.rand(5),
)

kdf = ks.from_pandas(pdf)
Expand Down

0 comments on commit 3cde582

Please sign in to comment.