From c907b5f2729d7016fb0d887e0004a362a79d253a Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 29 Jul 2020 14:59:48 +0900 Subject: [PATCH 1/6] Bug fixing for hasnans with BooleanType --- databricks/koalas/base.py | 2 +- databricks/koalas/tests/test_indexes.py | 9 +++++++++ databricks/koalas/tests/test_series.py | 9 +++++++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index 072eb86fc3..523e8d0763 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -391,7 +391,7 @@ def hasnans(self): sdf = self._internal.spark_frame scol = self.spark.column - return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0] + return sdf.select(F.max(scol.isNull())).collect()[0][0] @property def is_monotonic(self): diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index c862f3718f..80118dcdcb 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -1364,3 +1364,12 @@ def test_abs(self): kidx = ks.MultiIndex.from_tuples([(1, 2)], names=["level1", "level2"]) with self.assertRaisesRegex(TypeError, "perform __abs__ with this index"): abs(kidx) + + def test_hasnans(self): + pidx = pd.Index([True, False, True, True]) + kidx = ks.from_pandas(pidx) + self.assert_eq(pidx.hasnans, kidx.hasnans) + + pidx = pd.Index([True, False, np.nan, True]) + kidx = ks.from_pandas(pidx) + self.assert_eq(pidx.hasnans, kidx.hasnans) diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py index 82d03d8027..39572c33fe 100644 --- a/databricks/koalas/tests/test_series.py +++ b/databricks/koalas/tests/test_series.py @@ -1894,3 +1894,12 @@ def test_tail(self): self.assert_eq(pser.tail(-1001), kser.tail(-1001)) with self.assertRaisesRegex(TypeError, "bad operand type for unary -: 'str'"): kser.tail("10") + + def test_hasnans(self): + pser = pd.Series([True, False, True, True]) + kser = 
ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) + + pser = pd.Series([True, False, np.nan, True]) + kser = ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) From 029a4c369b25cc92e56d26c904c9ffdb2ceb1a06 Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 29 Jul 2020 17:22:25 +0900 Subject: [PATCH 2/6] Skip the isnan check only for BooleanType in hasnans --- databricks/koalas/base.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index 523e8d0763..e279836696 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -29,7 +29,15 @@ from pandas.core.accessor import CachedAccessor from pyspark import sql as spark from pyspark.sql import functions as F, Window, Column -from pyspark.sql.types import DateType, DoubleType, FloatType, LongType, StringType, TimestampType +from pyspark.sql.types import ( + DateType, + DoubleType, + FloatType, + LongType, + StringType, + TimestampType, + BooleanType, +) from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. from databricks.koalas import numpy_compat @@ -391,7 +399,11 @@ def hasnans(self): sdf = self._internal.spark_frame scol = self.spark.column - return sdf.select(F.max(scol.isNull())).collect()[0][0] + if isinstance(self.spark.data_type, BooleanType): + # `BooleanType` cannot contain `np.nan`. 
+ return sdf.select(F.max(scol.isNull())).collect()[0][0] + else: + return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0] @property def is_monotonic(self): From 9394b2c584808057d254516cae2dba4a42d556cd Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 30 Jul 2020 12:39:22 +0900 Subject: [PATCH 3/6] Check isnan only for DoubleType in hasnans --- databricks/koalas/base.py | 8 ++++---- databricks/koalas/tests/test_series.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index e279836696..cac30bd33d 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -399,11 +399,11 @@ def hasnans(self): sdf = self._internal.spark_frame scol = self.spark.column - if isinstance(self.spark.data_type, BooleanType): - # `BooleanType` cannot contain `np.nan`. - return sdf.select(F.max(scol.isNull())).collect()[0][0] - else: + if isinstance(self.spark.data_type, DoubleType): + # Check `isnan` for the DoubleType since the type of np.nan is DoubleType in PySpark. 
+ return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0] + else: + return sdf.select(F.max(scol.isNull())).collect()[0][0] @property def is_monotonic(self): diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py index 39572c33fe..10f475250d 100644 --- a/databricks/koalas/tests/test_series.py +++ b/databricks/koalas/tests/test_series.py @@ -1896,6 +1896,7 @@ def test_tail(self): kser.tail("10") def test_hasnans(self): + # BooleanType pser = pd.Series([True, False, True, True]) kser = ks.from_pandas(pser) self.assert_eq(pser.hasnans, kser.hasnans) @@ -1903,3 +1904,12 @@ def test_hasnans(self): pser = pd.Series([True, False, np.nan, True]) kser = ks.from_pandas(pser) self.assert_eq(pser.hasnans, kser.hasnans) + + # TimestampType + pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)]) + kser = ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) + + pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")]) + kser = ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) From 5ac27639bb0722bf71235c669aeb724b168d9255 Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 30 Jul 2020 12:40:02 +0900 Subject: [PATCH 4/6] Remove the BooleanType import --- databricks/koalas/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index cac30bd33d..c6f069a672 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -36,7 +36,6 @@ LongType, StringType, TimestampType, - BooleanType, ) from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. 
From 6014358c5cb436f1404944cb3d07adf4adc47ad1 Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 30 Jul 2020 12:42:51 +0900 Subject: [PATCH 5/6] Timestamp test for Index --- databricks/koalas/tests/test_indexes.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/databricks/koalas/tests/test_indexes.py b/databricks/koalas/tests/test_indexes.py index 80118dcdcb..443d5e8f2f 100644 --- a/databricks/koalas/tests/test_indexes.py +++ b/databricks/koalas/tests/test_indexes.py @@ -1366,6 +1366,7 @@ def test_abs(self): abs(kidx) def test_hasnans(self): + # BooleanType pidx = pd.Index([True, False, True, True]) kidx = ks.from_pandas(pidx) self.assert_eq(pidx.hasnans, kidx.hasnans) @@ -1373,3 +1374,12 @@ def test_hasnans(self): pidx = pd.Index([True, False, np.nan, True]) kidx = ks.from_pandas(pidx) self.assert_eq(pidx.hasnans, kidx.hasnans) + + # TimestampType + pser = pd.Series([pd.Timestamp("2020-07-30") for _ in range(3)]) + kser = ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) + + pser = pd.Series([pd.Timestamp("2020-07-30"), np.nan, pd.Timestamp("2020-07-30")]) + kser = ks.from_pandas(pser) + self.assert_eq(pser.hasnans, kser.hasnans) From a65b5c42456eb63e2fe1be2640a02991f5736b2b Mon Sep 17 00:00:00 2001 From: itholic Date: Fri, 31 Jul 2020 16:06:34 +0900 Subject: [PATCH 6/6] also check FloatType --- databricks/koalas/base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/databricks/koalas/base.py b/databricks/koalas/base.py index c6f069a672..7a9a6caf26 100644 --- a/databricks/koalas/base.py +++ b/databricks/koalas/base.py @@ -398,8 +398,7 @@ def hasnans(self): sdf = self._internal.spark_frame scol = self.spark.column - if isinstance(self.spark.data_type, DoubleType): - # Check `isnan` for the DoubleType since the type of np.nan is DoubleType in PySpark. 
+ if isinstance(self.spark.data_type, (DoubleType, FloatType)): return sdf.select(F.max(scol.isNull() | F.isnan(scol))).collect()[0][0] else: return sdf.select(F.max(scol.isNull())).collect()[0][0]