diff --git a/python/pyspark/pandas/frame.py b/python/pyspark/pandas/frame.py
index 8fbe1b8f926af..41ab03a5c0b4d 100644
--- a/python/pyspark/pandas/frame.py
+++ b/python/pyspark/pandas/frame.py
@@ -83,6 +83,7 @@
     DecimalType,
     TimestampType,
     TimestampNTZType,
+    NullType,
 )
 from pyspark.sql.window import Window
@@ -797,7 +798,7 @@ def _reduce_for_stat_function(
                 new_column_labels.append(label)
 
         if len(exprs) == 1:
-            return Series([])
+            return Series([], dtype="float64")
 
         sdf = self._internal.spark_frame.select(*exprs)
@@ -12128,11 +12129,6 @@ def quantile(
         0.50  3.0  7.0
         0.75  4.0  8.0
         """
-        warnings.warn(
-            "Default value of `numeric_only` will be changed to `False` "
-            "instead of `True` in 4.0.0.",
-            FutureWarning,
-        )
         axis = validate_axis(axis)
         if axis != 0:
             raise NotImplementedError('axis should be either 0 or "index" currently.')
@@ -12155,7 +12151,7 @@ def quantile(
         def quantile(psser: "Series") -> PySparkColumn:
             spark_type = psser.spark.data_type
             spark_column = psser.spark.column
-            if isinstance(spark_type, (BooleanType, NumericType)):
+            if isinstance(spark_type, (BooleanType, NumericType, NullType)):
                 return F.percentile_approx(spark_column.cast(DoubleType()), qq, accuracy)
             else:
                 raise TypeError(
diff --git a/python/pyspark/pandas/generic.py b/python/pyspark/pandas/generic.py
index c346889863b34..b540045f88f4a 100644
--- a/python/pyspark/pandas/generic.py
+++ b/python/pyspark/pandas/generic.py
@@ -1419,11 +1419,6 @@ def product(
         nan
         """
         axis = validate_axis(axis)
-        warnings.warn(
-            "Default value of `numeric_only` will be changed to `False` "
-            "instead of `None` in 4.0.0.",
-            FutureWarning,
-        )
         if numeric_only is None and axis == 0:
             numeric_only = True
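The quantile hunks in frame.py (and the matching ones in series.py below) add NullType to the accepted Spark types, so an all-null column is cast to double and fed to percentile_approx instead of raising; the `Series([], dtype="float64")` change likewise pins a dtype that pandas 2.0 no longer infers for empty construction. A minimal sketch of the user-visible effect, assuming an all-None column maps to Spark's NullType (the frame literal and the comments are illustrative, not from the patch):

    import pyspark.pandas as ps

    psdf = ps.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [None] * 4})

    # Before this change a NullType column made quantile raise TypeError;
    # now it is cast to double and aggregated, so "b" simply comes back NaN.
    psdf.quantile(0.5)
    psdf.quantile([0.25, 0.75])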
diff --git a/python/pyspark/pandas/groupby.py b/python/pyspark/pandas/groupby.py
index df671d71eecac..e66d08400b46b 100644
--- a/python/pyspark/pandas/groupby.py
+++ b/python/pyspark/pandas/groupby.py
@@ -614,9 +614,10 @@ def mean(self, numeric_only: Optional[bool] = True) -> FrameLike:
         Parameters
         ----------
-        numeric_only : bool, default False
+        numeric_only : bool, default True
             Include only float, int, boolean columns. If None, will attempt to use
-            everything, then use only numeric data.
+            everything, then use only numeric data. False is not supported.
+            This parameter is mainly for pandas compatibility.
 
             .. versionadded:: 3.4.0
@@ -646,11 +647,6 @@ def mean(self, numeric_only: Optional[bool] = True) -> FrameLike:
         2  4.0  1.500000  1.000000
         """
         self._validate_agg_columns(numeric_only=numeric_only, function_name="median")
-        warnings.warn(
-            "Default value of `numeric_only` will be changed to `False` "
-            "instead of `True` in 4.0.0.",
-            FutureWarning,
-        )
 
         return self._reduce_for_stat_function(
             F.mean, accepted_spark_types=(NumericType,), bool_to_numeric=True
@@ -920,7 +916,7 @@ def sum(self, numeric_only: Optional[bool] = True, min_count: int = 0) -> FrameL
         )
 
     # TODO: sync the doc.
-    def var(self, ddof: int = 1) -> FrameLike:
+    def var(self, ddof: int = 1, numeric_only: Optional[bool] = True) -> FrameLike:
         """
         Compute variance of groups, excluding missing values.
@@ -935,6 +931,13 @@ def var(self, ddof: int = 1) -> FrameLike:
             .. versionchanged:: 3.4.0
                 Supported including arbitrary integers.
 
+        numeric_only : bool, default True
+            Include only float, int, boolean columns. If None, will attempt to use
+            everything, then use only numeric data. False is not supported.
+            This parameter is mainly for pandas compatibility.
+
+            .. versionadded:: 4.0.0
+
         Examples
         --------
         >>> df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],
@@ -961,6 +964,7 @@ def var(col: Column) -> Column:
             var,
             accepted_spark_types=(NumericType,),
             bool_to_numeric=True,
+            numeric_only=numeric_only,
         )
 
     def skew(self) -> FrameLike:
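`GroupBy.var` gains the same `numeric_only` keyword that `mean` and `sum` already take, forwarding it to `_reduce_for_stat_function` so booleans are coerced and non-numeric columns are dropped. A sketch of the new call surface, reusing the docstring's DataFrame (columns "C" and "D" are invented for illustration):

    import pyspark.pandas as ps

    df = ps.DataFrame({"A": [1, 2, 1, 2], "B": [True, False, False, True],
                       "C": [3, 4, 3, 4], "D": ["a", "b", "b", "a"]})

    df.groupby("A").var(numeric_only=True)          # new keyword; True is the default
    df.groupby("A").var(ddof=0, numeric_only=True)  # composes with the existing ddof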
diff --git a/python/pyspark/pandas/series.py b/python/pyspark/pandas/series.py
index d1b1d7631795c..7fa08c6d9b24c 100644
--- a/python/pyspark/pandas/series.py
+++ b/python/pyspark/pandas/series.py
@@ -67,6 +67,7 @@
     Row,
     StructType,
     TimestampType,
+    NullType,
 )
 from pyspark.sql.window import Window
 from pyspark.sql.utils import get_column_class, get_window_class
@@ -4024,7 +4025,7 @@ def quantile(
         def quantile(psser: Series) -> PySparkColumn:
             spark_type = psser.spark.data_type
             spark_column = psser.spark.column
-            if isinstance(spark_type, (BooleanType, NumericType)):
+            if isinstance(spark_type, (BooleanType, NumericType, NullType)):
                 return F.percentile_approx(spark_column.cast(DoubleType()), q_float, accuracy)
             else:
                 raise TypeError(
@@ -4059,7 +4060,8 @@ def rank(
         ascending : boolean, default True
             False for ranks by high (1) to low (N)
         numeric_only : bool, optional
-            If set to True, rank numeric Series, or return an empty Series for non-numeric Series
+            If set to True, rank numeric Series, or raise TypeError for non-numeric Series.
+            False is not supported. This parameter is mainly for pandas compatibility.
 
         Returns
         -------
@@ -4127,18 +4129,10 @@ def rank(
         y    b
         z    c
         Name: A, dtype: object
-
-        >>> s.rank(numeric_only=True)
-        Series([], Name: A, dtype: float64)
         """
-        warnings.warn(
-            "Default value of `numeric_only` will be changed to `False` "
-            "instead of `None` in 4.0.0.",
-            FutureWarning,
-        )
         is_numeric = isinstance(self.spark.data_type, (NumericType, BooleanType))
         if numeric_only and not is_numeric:
-            return ps.Series([], dtype="float64", name=self.name)
+            raise TypeError("Series.rank does not allow numeric_only=True with non-numeric dtype.")
         else:
             return self._rank(method, ascending).spark.analyzed
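The rank change is behavioral rather than cosmetic: `numeric_only=True` on a non-numeric Series used to return an empty float64 Series and now raises, matching the updated docstring. A short sketch of both sides of the new contract:

    import pyspark.pandas as ps

    ps.Series([3, 1, 2], name="x").rank(numeric_only=True)  # numeric dtype: ranks as before

    try:
        ps.Series(["a", "c", "b"], name="x").rank(numeric_only=True)
    except TypeError as e:
        print(e)  # Series.rank does not allow numeric_only=True with non-numeric dtype.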
diff --git a/python/pyspark/pandas/tests/computation/test_any_all.py b/python/pyspark/pandas/tests/computation/test_any_all.py
index 3574254d1dbf0..64f293c48d64a 100644
--- a/python/pyspark/pandas/tests/computation/test_any_all.py
+++ b/python/pyspark/pandas/tests/computation/test_any_all.py
@@ -39,10 +39,6 @@ def df_pair(self):
         psdf = ps.from_pandas(pdf)
         return pdf, psdf
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43812): Enable DataFrameTests.test_all for pandas 2.0.0.",
-    )
     def test_all(self):
         pdf = pd.DataFrame(
             {
@@ -105,9 +101,15 @@ def test_all(self):
         self.assert_eq(psdf.all(skipna=True), pdf.all(skipna=True))
         self.assert_eq(psdf.all(), pdf.all())
         self.assert_eq(
-            ps.DataFrame([np.nan]).all(skipna=False), pd.DataFrame([np.nan]).all(skipna=False)
+            ps.DataFrame([np.nan]).all(skipna=False),
+            pd.DataFrame([np.nan]).all(skipna=False),
+            almost=True,
+        )
+        self.assert_eq(
+            ps.DataFrame([None]).all(skipna=True),
+            pd.DataFrame([None]).all(skipna=True),
+            almost=True,
         )
-        self.assert_eq(ps.DataFrame([None]).all(skipna=True), pd.DataFrame([None]).all(skipna=True))
 
     def test_any(self):
         pdf = pd.DataFrame(
diff --git a/python/pyspark/pandas/tests/computation/test_compute.py b/python/pyspark/pandas/tests/computation/test_compute.py
index d4b49f2ac8b01..9a29cb236a8d6 100644
--- a/python/pyspark/pandas/tests/computation/test_compute.py
+++ b/python/pyspark/pandas/tests/computation/test_compute.py
@@ -283,10 +283,6 @@ def test_nunique(self):
         self.assert_eq(psdf.nunique(), pdf.nunique())
         self.assert_eq(psdf.nunique(dropna=False), pdf.nunique(dropna=False))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43810): Enable DataFrameSlowTests.test_quantile for pandas 2.0.0.",
-    )
     def test_quantile(self):
         pdf, psdf = self.df_pair
 
@@ -332,59 +328,57 @@ def test_quantile(self):
         pdf = pd.DataFrame({"x": ["a", "b", "c"]})
         psdf = ps.from_pandas(pdf)
 
-        self.assert_eq(psdf.quantile(0.5), pdf.quantile(0.5))
-        self.assert_eq(psdf.quantile([0.25, 0.5, 0.75]), pdf.quantile([0.25, 0.5, 0.75]))
+        self.assert_eq(psdf.quantile(0.5), pdf.quantile(0.5, numeric_only=True))
+        self.assert_eq(
+            psdf.quantile([0.25, 0.5, 0.75]), pdf.quantile([0.25, 0.5, 0.75], numeric_only=True)
+        )
 
         with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
             psdf.quantile(0.5, numeric_only=False)
         with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
             psdf.quantile([0.25, 0.5, 0.75], numeric_only=False)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43558): Enable DataFrameSlowTests.test_product for pandas 2.0.0.",
-    )
     def test_product(self):
         pdf = pd.DataFrame(
             {"A": [1, 2, 3, 4, 5], "B": [10, 20, 30, 40, 50], "C": ["a", "b", "c", "d", "e"]}
         )
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index())
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index())
 
         # Named columns
         pdf.columns.name = "Koalas"
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index())
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index())
 
         # MultiIndex columns
         pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index())
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index())
 
         # Named MultiIndex columns
         pdf.columns.names = ["Hello", "Koalas"]
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index())
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index())
 
         # No numeric columns
         pdf = pd.DataFrame({"key": ["a", "b", "c"], "val": ["x", "y", "z"]})
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index())
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index())
 
         # No numeric named columns
         pdf.columns.name = "Koalas"
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index(), almost=True)
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index(), almost=True)
 
         # No numeric MultiIndex columns
         pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y")])
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index(), almost=True)
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index(), almost=True)
 
         # No numeric named MultiIndex columns
         pdf.columns.names = ["Hello", "Koalas"]
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index(), almost=True)
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index(), almost=True)
 
         # All NaN columns
         pdf = pd.DataFrame(
@@ -395,22 +389,22 @@ def test_product(self):
             }
         )
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index(), check_exact=False)
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index(), check_exact=False)
 
         # All NaN named columns
         pdf.columns.name = "Koalas"
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index(), check_exact=False)
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index(), check_exact=False)
 
         # All NaN MultiIndex columns
         pdf.columns = pd.MultiIndex.from_tuples([("a", "x"), ("b", "y"), ("c", "z")])
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index(), check_exact=False)
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index(), check_exact=False)
 
         # All NaN named MultiIndex columns
         pdf.columns.names = ["Hello", "Koalas"]
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(pdf.prod(), psdf.prod().sort_index(), check_exact=False)
+        self.assert_eq(pdf.prod(numeric_only=True), psdf.prod().sort_index(), check_exact=False)
 
 
 class FrameComputeTests(FrameComputeMixin, ComparisonTestBase, SQLTestUtils):
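All of the test rewrites above compensate for one pandas 2.0 change: reductions no longer default to silently dropping non-numeric columns, so the pandas side of each comparison must now opt in with `numeric_only=True` to match what pandas-on-Spark still does by default. A pandas-only sketch of the asymmetry (frame literal invented):

    import pandas as pd

    pdf = pd.DataFrame({"A": [1, 2, 3], "C": ["a", "b", "c"]})

    pdf.prod(numeric_only=True)  # drops "C", matching what psdf.prod() still does
    # pdf.prod()                 # pandas >= 2.0 raises TypeError on the string column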
diff --git a/python/pyspark/pandas/tests/groupby/test_stat.py b/python/pyspark/pandas/tests/groupby/test_stat.py
index 8a5096942e689..44bb3b0070914 100644
--- a/python/pyspark/pandas/tests/groupby/test_stat.py
+++ b/python/pyspark/pandas/tests/groupby/test_stat.py
@@ -58,12 +58,10 @@ def _test_stat_func(self, func, check_exact=True):
             check_exact=check_exact,
         )
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43554): Enable GroupByTests.test_basic_stat_funcs for pandas 2.0.0.",
-    )
     def test_basic_stat_funcs(self):
-        self._test_stat_func(lambda groupby_obj: groupby_obj.var(), check_exact=False)
+        self._test_stat_func(
+            lambda groupby_obj: groupby_obj.var(numeric_only=True), check_exact=False
+        )
 
         pdf, psdf = self.pdf, self.psdf
 
@@ -102,12 +100,12 @@ def test_basic_stat_funcs(self):
         self.assert_eq(
             psdf.groupby("A").std().sort_index(),
-            pdf.groupby("A").std().sort_index(),
+            pdf.groupby("A").std(numeric_only=True).sort_index(),
             check_exact=False,
         )
         self.assert_eq(
             psdf.groupby("A").sem().sort_index(),
-            pdf.groupby("A").sem().sort_index(),
+            pdf.groupby("A").sem(numeric_only=True).sort_index(),
             check_exact=False,
         )
 
@@ -115,17 +113,11 @@ def test_basic_stat_funcs(self):
         # self._test_stat_func(lambda groupby_obj: groupby_obj.sum(), check_exact=False)
         self.assert_eq(
             psdf.groupby("A").sum().sort_index(),
-            pdf.groupby("A").sum().sort_index(),
+            pdf.groupby("A").sum(numeric_only=True).sort_index(),
             check_exact=False,
         )
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43706): Enable GroupByTests.test_mean " "for pandas 2.0.0.",
-    )
     def test_mean(self):
-        self._test_stat_func(lambda groupby_obj: groupby_obj.mean())
-        self._test_stat_func(lambda groupby_obj: groupby_obj.mean(numeric_only=None))
         self._test_stat_func(lambda groupby_obj: groupby_obj.mean(numeric_only=True))
         psdf = self.psdf
         with self.assertRaises(TypeError):
@@ -267,10 +259,6 @@ def test_nth(self):
         with self.assertRaisesRegex(TypeError, "Invalid index"):
             self.psdf.groupby("B").nth("x")
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43551): Enable GroupByTests.test_prod for pandas 2.0.0.",
-    )
     def test_prod(self):
         pdf = pd.DataFrame(
             {
@@ -286,19 +274,12 @@ def test_prod(self):
         psdf = ps.from_pandas(pdf)
 
         for n in [0, 1, 2, 128, -1, -2, -128]:
-            self._test_stat_func(
-                lambda groupby_obj: groupby_obj.prod(min_count=n), check_exact=False
-            )
-            self._test_stat_func(
-                lambda groupby_obj: groupby_obj.prod(numeric_only=None, min_count=n),
-                check_exact=False,
-            )
             self._test_stat_func(
                 lambda groupby_obj: groupby_obj.prod(numeric_only=True, min_count=n),
                 check_exact=False,
             )
             self.assert_eq(
-                pdf.groupby("A").prod(min_count=n).sort_index(),
+                pdf.groupby("A").prod(min_count=n, numeric_only=True).sort_index(),
                 psdf.groupby("A").prod(min_count=n).sort_index(),
                 almost=True,
             )
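The groupby assertions follow the same rule per group: under pandas 2.0, `std`/`sem`/`var` raise on non-numeric aggregation columns unless `numeric_only=True`, and `sum` would concatenate strings rather than drop them, so every pandas-side call now opts in. Sketch (frame literal invented):

    import pandas as pd

    pdf = pd.DataFrame({"A": [1, 1, 2], "B": [3.5, 4.5, 5.5], "C": ["x", "y", "z"]})

    pdf.groupby("A").sum(numeric_only=True)  # aggregates "B" only, like the psdf side
    pdf.groupby("A").std(numeric_only=True)  # without the flag, pandas 2.0 raises on "C"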
diff --git a/python/pyspark/pandas/tests/series/test_stat.py b/python/pyspark/pandas/tests/series/test_stat.py
index 2c25e21954d71..b54357bede440 100644
--- a/python/pyspark/pandas/tests/series/test_stat.py
+++ b/python/pyspark/pandas/tests/series/test_stat.py
@@ -321,10 +321,6 @@ def test_median(self):
         with self.assertRaisesRegex(TypeError, "accuracy must be an integer; however"):
             ps.Series([24.0, 21.0, 25.0, 33.0, 26.0]).median(accuracy="a")
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43463): Enable SeriesTests.test_rank for pandas 2.0.0.",
-    )
     def test_rank(self):
         pser = pd.Series([1, 2, 3, 1], name="x")
         psser = ps.from_pandas(pser)
@@ -338,10 +334,6 @@ def test_rank(self):
         non_numeric_pser = pd.Series(["a", "c", "b", "d"], name="x", index=[10, 11, 12, 13])
         non_numeric_psser = ps.from_pandas(non_numeric_pser)
-        self.assert_eq(
-            non_numeric_pser.rank(numeric_only=True),
-            non_numeric_psser.rank(numeric_only=True),
-        )
         self.assert_eq(
             non_numeric_pser.rank(numeric_only=None),
             non_numeric_psser.rank(numeric_only=None).sort_index(),
@@ -350,10 +342,14 @@ def test_rank(self):
             non_numeric_pser.rank(numeric_only=False),
             non_numeric_psser.rank(numeric_only=False).sort_index(),
         )
-        self.assert_eq(
-            (non_numeric_pser + "x").rank(numeric_only=True),
-            (non_numeric_psser + "x").rank(numeric_only=True),
-        )
+
+        msg = "Series.rank does not allow numeric_only=True with non-numeric dtype."
+        with self.assertRaisesRegex(TypeError, msg):
+            non_numeric_psser.rank(numeric_only=True)
+
+        msg = "Series.rank does not allow numeric_only=True with non-numeric dtype."
+        with self.assertRaisesRegex(TypeError, msg):
+            (non_numeric_psser + "x").rank(numeric_only=True)
 
         msg = "method must be one of 'average', 'min', 'max', 'first', 'dense'"
         with self.assertRaisesRegex(ValueError, msg):
@@ -378,10 +374,6 @@ def test_round(self):
         with self.assertRaisesRegex(TypeError, msg):
             psser.round(1.5)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43469): Enable SeriesTests.test_quantile for pandas 2.0.0.",
-    )
     def test_quantile(self):
         pser = pd.Series([])
         psser = ps.from_pandas(pser)
@@ -520,10 +512,6 @@ def test_div_zero_and_nan(self):
         self.assert_eq(pser // 0, psser // 0)
         self.assert_eq(pser.floordiv(np.nan), psser.floordiv(np.nan))
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43481): Enable SeriesTests.test_product for pandas 2.0.0.",
-    )
     def test_product(self):
         pser = pd.Series([10, 20, 30, 40, 50])
         psser = ps.from_pandas(pser)
@@ -539,23 +527,18 @@ def test_product(self):
         psser = ps.from_pandas(pser)
         self.assert_eq(pser.prod(), psser.prod())
 
-        # Empty Series
-        pser = pd.Series([])
-        psser = ps.from_pandas(pser)
-        self.assert_eq(pser.prod(), psser.prod())
-
         # Boolean Series
         pser = pd.Series([True, True, True])
         psser = ps.from_pandas(pser)
-        self.assert_eq(pser.prod(), psser.prod())
+        self.assert_eq(pser.prod(numeric_only=True), psser.prod())
 
         pser = pd.Series([False, False, False])
         psser = ps.from_pandas(pser)
-        self.assert_eq(pser.prod(), psser.prod())
+        self.assert_eq(pser.prod(numeric_only=True), psser.prod())
 
         pser = pd.Series([True, False, True])
         psser = ps.from_pandas(pser)
-        self.assert_eq(pser.prod(), psser.prod())
+        self.assert_eq(pser.prod(numeric_only=True), psser.prod())
 
         # With `min_count` parameter
         pser = pd.Series([10, 20, 30, 40, 50])
@@ -572,10 +555,10 @@ def test_product(self):
         psser = ps.from_pandas(pser)
         self.assert_eq(pser.prod(min_count=1), psser.prod(min_count=1))
 
-        pser = pd.Series([])
-        psser = ps.from_pandas(pser)
-        self.assert_eq(pser.prod(min_count=1), psser.prod(min_count=1))
-
+        with self.assertRaisesRegex(TypeError, "Could not convert object \\(void\\) to numeric"):
+            ps.Series([]).prod(numeric_only=True)
+        with self.assertRaisesRegex(TypeError, "Could not convert object \\(void\\) to numeric"):
+            ps.Series([]).prod(min_count=1)
         with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
             ps.Series(["a", "b", "c"]).prod()
         with self.assertRaisesRegex(
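The empty-Series cases move from equality checks to expected failures: an empty `ps.Series([])` maps to Spark's void (null) type, which the tightened numeric check in `prod` now rejects instead of returning the neutral element. Sketch:

    import pyspark.pandas as ps

    try:
        ps.Series([]).prod(min_count=1)
    except TypeError as e:
        print(e)  # Could not convert object (void) to numeric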
diff --git a/python/pyspark/pandas/tests/test_stats.py b/python/pyspark/pandas/tests/test_stats.py
index ec56fa7ef1aee..be5340dafdc89 100644
--- a/python/pyspark/pandas/tests/test_stats.py
+++ b/python/pyspark/pandas/tests/test_stats.py
@@ -33,15 +33,24 @@ class StatsTestsMixin:
     def _test_stat_functions(self, pdf_or_pser, psdf_or_psser):
-        functions = ["max", "min", "mean", "sum", "count"]
+        self.assert_eq(
+            psdf_or_psser.count(),
+            pdf_or_pser.count(),
+            almost=True,
+        )
+
+        functions = ["max", "min", "mean", "sum"]
         for funcname in functions:
-            self.assert_eq(getattr(psdf_or_psser, funcname)(), getattr(pdf_or_pser, funcname)())
+            self.assert_eq(
+                getattr(psdf_or_psser, funcname)(),
+                getattr(pdf_or_pser, funcname)(numeric_only=True),
+            )
 
         functions = ["std", "var", "product", "sem"]
         for funcname in functions:
             self.assert_eq(
                 getattr(psdf_or_psser, funcname)(),
-                getattr(pdf_or_pser, funcname)(),
+                getattr(pdf_or_pser, funcname)(numeric_only=True),
                 check_exact=False,
             )
 
@@ -49,7 +58,7 @@ def _test_stat_functions(self, pdf_or_pser, psdf_or_psser):
         for funcname in functions:
             self.assert_eq(
                 getattr(psdf_or_psser, funcname)(ddof=0),
-                getattr(pdf_or_pser, funcname)(ddof=0),
+                getattr(pdf_or_pser, funcname)(ddof=0, numeric_only=True),
                 check_exact=False,
             )
 
@@ -76,11 +85,6 @@ def test_stat_functions_multiindex_column(self):
         self._test_stat_functions(pdf.A, psdf.A)
         self._test_stat_functions(pdf, psdf)
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43499): Enable SeriesTests.test_stat_functions_with_no_numeric_columns "
-        "for pandas 2.0.0.",
-    )
     def test_stat_functions_with_no_numeric_columns(self):
         pdf = pd.DataFrame(
             {
@@ -407,10 +411,6 @@ def test_series_corr(self):
             almost=True,
         )
 
-    @unittest.skipIf(
-        LooseVersion(pd.__version__) >= LooseVersion("2.0.0"),
-        "TODO(SPARK-43497): Enable SeriesTests.test_cov_corr_meta for pandas 2.0.0.",
-    )
     def test_cov_corr_meta(self):
         # Disable arrow execution since corr() is using UDT internally which is not supported.
         with self.sql_conf({SPARK_CONF_ARROW_ENABLED: False}):
@@ -428,7 +428,7 @@ def test_cov_corr_meta(self):
             index=pd.Index([1, 2, 3], name="myindex"),
         )
         psdf = ps.from_pandas(pdf)
-        self.assert_eq(psdf.corr(), pdf.corr(), check_exact=False)
+        self.assert_eq(psdf.corr(), pdf.corr(numeric_only=True), check_exact=False)
 
     def test_stats_on_boolean_dataframe(self):
         pdf = pd.DataFrame({"A": [True, False, True], "B": [False, False, True]})
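`DataFrame.corr` gets the same treatment as the reductions: the pandas-on-Spark side keeps computing over numeric columns only, so the pandas reference needs `numeric_only=True` once the frame mixes dtypes. A pandas-side sketch of the updated assertion's right-hand side (frame literal invented):

    import pandas as pd

    pdf = pd.DataFrame({"a": [1, 2, 3], "b": [4.0, 5.0, 6.0], "c": ["x", "y", "z"]})

    pdf.corr(numeric_only=True)  # correlates "a" and "b"; "c" is excluded explicitly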