Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Python type name instead of Spark's in error messages. #1985

Merged
merged 5 commits into from
Dec 26, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -10198,7 +10198,11 @@ def quantile(spark_column, spark_type):
if isinstance(spark_type, (BooleanType, NumericType)):
return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

if isinstance(q, list):
# First calculate the percentiles from all columns and map it to each `quantiles`
Expand Down
55 changes: 45 additions & 10 deletions databricks/koalas/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
from databricks.koalas.indexing import AtIndexer, iAtIndexer, iLocIndexer, LocIndexer
from databricks.koalas.internal import InternalFrame
from databricks.koalas.spark import functions as SF
from databricks.koalas.typedef import Scalar
from databricks.koalas.typedef import Scalar, spark_type_to_pandas_dtype
from databricks.koalas.utils import (
is_name_like_tuple,
is_name_like_value,
Expand Down Expand Up @@ -1133,7 +1133,11 @@ def mean(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.mean(spark_column)

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1208,7 +1212,11 @@ def sum(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.coalesce(F.sum(spark_column), F.lit(0))

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1294,7 +1302,11 @@ def prod(spark_column, spark_type):
if isinstance(spark_type, IntegralType):
scol = F.round(scol).cast(LongType())
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

return F.coalesce(scol, F.lit(1))

Expand Down Expand Up @@ -1345,7 +1357,11 @@ def skew(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.skewness(spark_column)

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1394,7 +1410,11 @@ def kurtosis(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
return F.kurtosis(spark_column)

return self._reduce_for_stat_function(
Expand Down Expand Up @@ -1633,7 +1653,11 @@ def std(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
if ddof == 0:
return F.stddev_pop(spark_column)
else:
Expand Down Expand Up @@ -1703,7 +1727,11 @@ def var(spark_column, spark_type):
if isinstance(spark_type, BooleanType):
spark_column = spark_column.cast(LongType())
elif not isinstance(spark_type, NumericType):
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)
if ddof == 0:
return F.var_pop(spark_column)
else:
Expand Down Expand Up @@ -1807,7 +1835,11 @@ def median(spark_column, spark_type):
if isinstance(spark_type, (BooleanType, NumericType)):
return SF.percentile_approx(spark_column.cast(DoubleType()), 0.5, accuracy)
else:
raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
raise TypeError(
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

return self._reduce_for_stat_function(
median, name="median", numeric_only=numeric_only, axis=axis
Expand Down Expand Up @@ -1885,7 +1917,10 @@ def abs(kser):
return kser.spark.transform(F.abs)
else:
raise TypeError(
"bad operand type for abs(): {}".format(kser.spark.data_type.simpleString())
"bad operand type for abs(): {} ({})".format(
spark_type_to_pandas_dtype(kser.spark.data_type),
kser.spark.data_type.simpleString(),
)
)

return self._apply_series_op(abs)
Expand Down
22 changes: 18 additions & 4 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,13 @@
from databricks.koalas.spark import functions as SF
from databricks.koalas.spark.accessors import SparkSeriesMethods
from databricks.koalas.strings import StringMethods
from databricks.koalas.typedef import infer_return_type, SeriesType, ScalarType, Scalar
from databricks.koalas.typedef import (
infer_return_type,
spark_type_to_pandas_dtype,
SeriesType,
ScalarType,
Scalar,
)


# This regular expression pattern is compiled and defined here to avoid compiling the same
Expand Down Expand Up @@ -3302,7 +3308,9 @@ def quantile(spark_column, spark_type):
return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
else:
raise TypeError(
"Could not convert {} to numeric".format(spark_type.simpleString())
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(spark_type), spark_type.simpleString()
)
)

return self._reduce_for_stat_function(quantile, name="quantile")
Expand Down Expand Up @@ -5703,7 +5711,10 @@ def _cumsum(self, skipna, part_cols=()):
kser = kser.spark.transform(lambda scol: scol.cast(LongType()))
elif not isinstance(kser.spark.data_type, NumericType):
raise TypeError(
"Could not convert {} to numeric".format(kser.spark.data_type.simpleString())
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(kser.spark.data_type),
kser.spark.data_type.simpleString(),
)
)
return kser._cum(F.sum, skipna, part_cols)

Expand Down Expand Up @@ -5731,7 +5742,10 @@ def _cumprod(self, skipna, part_cols=()):
scol = F.round(scol).cast(LongType())
else:
raise TypeError(
"Could not convert {} to numeric".format(self.spark.data_type.simpleString())
"Could not convert {} ({}) to numeric".format(
spark_type_to_pandas_dtype(self.spark.data_type),
self.spark.data_type.simpleString(),
)
)

return self._with_new_scol(scol)
Expand Down
4 changes: 2 additions & 2 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -4283,9 +4283,9 @@ def test_quantile(self):
self.assert_eq(kdf.quantile(0.5), pd.Series(name=0.5))
self.assert_eq(kdf.quantile([0.25, 0.5, 0.75]), pd.DataFrame(index=[0.25, 0.5, 0.75]))

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.quantile(0.5, numeric_only=False)
with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.quantile([0.25, 0.5, 0.75], numeric_only=False)

def test_pct_change(self):
Expand Down
10 changes: 6 additions & 4 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1267,9 +1267,9 @@ def test_quantile(self):
with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ks.Series(["a", "b", "c"]).quantile()
with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ks.Series(["a", "b", "c"]).quantile([0.25, 0.5, 0.75])

def test_idxmax(self):
Expand Down Expand Up @@ -2228,9 +2228,11 @@ def test_product(self):
kser = ks.from_pandas(pser)
self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1))

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
ks.Series(["a", "b", "c"]).prod()
with self.assertRaisesRegex(TypeError, "Could not convert timestamp to numeric"):
with self.assertRaisesRegex(
TypeError, "Could not convert datetime64\\[ns\\] \\(timestamp\\) to numeric"
):
ks.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).prod()

def test_hasnans(self):
Expand Down
12 changes: 8 additions & 4 deletions databricks/koalas/tests/test_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,9 +140,13 @@ def test_abs(self):
self.assert_eq(kdf[["B", "C"]].abs(), pdf[["B", "C"]].abs())
self.assert_eq(kdf[["E"]].abs(), pdf[["E"]].abs())

with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
with self.assertRaisesRegex(
TypeError, "bad operand type for abs\\(\\): object \\(string\\)"
):
kdf.abs()
with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
with self.assertRaisesRegex(
TypeError, "bad operand type for abs\\(\\): object \\(string\\)"
):
kdf.D.abs()

def test_axis_on_dataframe(self):
Expand Down Expand Up @@ -331,8 +335,8 @@ def test_numeric_only_unsupported(self):
pdf[["i", "b"]].sum(numeric_only=False).astype(int),
)

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.sum(numeric_only=False)

with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
with self.assertRaisesRegex(TypeError, "Could not convert object \\(string\\) to numeric"):
kdf.s.sum()