Skip to content

Commit

Permalink
Adjust data when all the values in a column are null. (#2004)
Browse files Browse the repository at this point in the history
For Spark < 3.0, when all the values in a column are null, every value is converted to `None` in the resulting pandas DataFrame, regardless of the column's data type.

```py
>>> pdf = pd.DataFrame(
...             {
...                 "a": [None, None, None, "a"],
...                 "b": [None, None, None, 1],
...                 "c": [None, None, None] + list(np.arange(1, 2).astype("i1")),
...                 "d": [None, None, None, 1.0],
...                 "e": [None, None, None, True],
...                 "f": [None, None, None] + list(pd.date_range("20130101", periods=1)),
...             },
...         )
>>>
>>> kdf = ks.from_pandas(pdf)
>>> kdf.iloc[:-1]
      a     b     c     d     e     f
0  None  None  None  None  None  None
1  None  None  None  None  None  None
2  None  None  None  None  None  None
```

whereas for pandas:

```py
>>> pdf.iloc[:-1]
      a   b   c   d     e   f
0  None NaN NaN NaN  None NaT
1  None NaN NaN NaN  None NaT
2  None NaN NaN NaN  None NaT
```

With Spark >= 3.0, the conversion already matches pandas:

```py
>>> kdf.iloc[:-1]
      a   b   c   d     e   f
0  None NaN NaN NaN  None NaT
1  None NaN NaN NaN  None NaT
2  None NaN NaN NaN  None NaT
```
  • Loading branch information
ueshin authored Jan 13, 2021
1 parent 3ce2d87 commit 3cde582
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 9 deletions.
15 changes: 14 additions & 1 deletion databricks/koalas/internal.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
"""
An internal immutable DataFrame with some metadata to manage indexes.
"""
from distutils.version import LooseVersion
import re
from typing import List, Optional, Tuple, Union, TYPE_CHECKING
from itertools import accumulate
Expand All @@ -25,11 +26,12 @@
import numpy as np
import pandas as pd
from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
import pyspark
from pyspark import sql as spark
from pyspark._globals import _NoValue, _NoValueType
from pyspark.sql import functions as F, Window
from pyspark.sql.functions import PandasUDFType, pandas_udf
from pyspark.sql.types import BooleanType, DataType, StructField, StructType, LongType
from pyspark.sql.types import BooleanType, DataType, IntegralType, StructField, StructType, LongType

try:
from pyspark.sql.types import to_arrow_type
Expand Down Expand Up @@ -818,6 +820,17 @@ def to_pandas_frame(self) -> pd.DataFrame:
pdf = pdf.astype(
{field.name: spark_type_to_pandas_dtype(field.dataType) for field in sdf.schema}
)
elif LooseVersion(pyspark.__version__) < LooseVersion("3.0"):
for field in sdf.schema:
if field.nullable and pdf[field.name].isnull().all():
if isinstance(field.dataType, BooleanType):
pdf[field.name] = pdf[field.name].astype(np.object)
elif isinstance(field.dataType, IntegralType):
pdf[field.name] = pdf[field.name].astype(np.float64)
else:
pdf[field.name] = pdf[field.name].astype(
spark_type_to_pandas_dtype(field.dataType)
)

column_names = []
for i, (label, spark_column, column_name) in enumerate(
Expand Down
31 changes: 23 additions & 8 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,13 +410,28 @@ def test_empty_dataframe(self):
self.assertRaises(ValueError, lambda: ks.from_pandas(pdf))

def test_all_null_dataframe(self):
pdf = pd.DataFrame(
{
"a": [None, None, None, "a"],
"b": [None, None, None, 1],
"c": [None, None, None] + list(np.arange(1, 2).astype("i1")),
"d": [None, None, None, 1.0],
"e": [None, None, None, True],
"f": [None, None, None] + list(pd.date_range("20130101", periods=1)),
},
)
kdf = ks.from_pandas(pdf)

self.assert_eq(kdf.iloc[:-1], pdf.iloc[:-1])

with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
self.assert_eq(kdf.iloc[:-1], pdf.iloc[:-1])

pdf = pd.DataFrame(
{
"a": pd.Series([None, None, None], dtype="float64"),
"b": pd.Series([None, None, None], dtype="str"),
},
index=np.random.rand(3),
)

self.assertRaises(ValueError, lambda: ks.from_pandas(pdf))
Expand All @@ -427,14 +442,14 @@ def test_all_null_dataframe(self):
def test_nullable_object(self):
pdf = pd.DataFrame(
{
"a": list("abc") + [np.nan],
"b": list(range(1, 4)) + [np.nan],
"c": list(np.arange(3, 6).astype("i1")) + [np.nan],
"d": list(np.arange(4.0, 7.0, dtype="float64")) + [np.nan],
"e": [True, False, True, np.nan],
"f": list(pd.date_range("20130101", periods=3)) + [np.nan],
"a": list("abc") + [np.nan, None],
"b": list(range(1, 4)) + [np.nan, None],
"c": list(np.arange(3, 6).astype("i1")) + [np.nan, None],
"d": list(np.arange(4.0, 7.0, dtype="float64")) + [np.nan, None],
"e": [True, False, True, np.nan, None],
"f": list(pd.date_range("20130101", periods=3)) + [np.nan, None],
},
index=np.random.rand(4),
index=np.random.rand(5),
)

kdf = ks.from_pandas(pdf)
Expand Down

0 comments on commit 3cde582

Please sign in to comment.