Use Python type name instead of Spark's in error messages. #1985

Merged · 5 commits · Dec 26, 2020
Changes from 3 commits
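The user-visible effect: the TypeError raised by numeric reductions (sum, mean, prod, quantile, median, std, var, skew, kurtosis, cumsum, cumprod) and by abs() now names the Python type rather than Spark's simpleString() name. A minimal doctest-style sketch of the new behavior, mirroring the updated tests below (hypothetical session; output abbreviated):

    >>> import pandas as pd
    >>> import databricks.koalas as ks
    >>> ks.Series(["a", "b", "c"]).sum()
    Traceback (most recent call last):
      ...
    TypeError: Could not convert str to numeric
    >>> ks.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).prod()
    Traceback (most recent call last):
      ...
    TypeError: Could not convert datetime to numeric

Before this change, the same calls reported "Could not convert string to numeric" and "Could not convert timestamp to numeric".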
7 changes: 6 additions & 1 deletion databricks/koalas/frame.py
@@ -110,6 +110,7 @@
     as_spark_type,
     infer_return_type,
     spark_type_to_pandas_dtype,
+    spark_type_to_python_type,
     DataFrameType,
     SeriesType,
 )
@@ -10200,7 +10201,11 @@ def quantile(spark_column, spark_type):
             if isinstance(spark_type, (BooleanType, NumericType)):
                 return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
             else:
-                raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
+                raise TypeError(
+                    "Could not convert {} to numeric".format(
+                        spark_type_to_python_type(spark_type).__name__
+                    )
+                )
 
         if isinstance(q, list):
             # First calculate the percentiles from all columns and map it to each `quantiles`
54 changes: 44 additions & 10 deletions databricks/koalas/generic.py
@@ -44,7 +44,7 @@
 from databricks.koalas.indexing import AtIndexer, iAtIndexer, iLocIndexer, LocIndexer
 from databricks.koalas.internal import InternalFrame
 from databricks.koalas.spark import functions as SF
-from databricks.koalas.typedef import Scalar
+from databricks.koalas.typedef import Scalar, spark_type_to_python_type
 from databricks.koalas.utils import (
     is_name_like_tuple,
     is_name_like_value,
@@ -1133,7 +1137,11 @@ def mean(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
                 spark_column = spark_column.cast(LongType())
             elif not isinstance(spark_type, NumericType):
-                raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
+                raise TypeError(
+                    "Could not convert {} to numeric".format(
+                        spark_type_to_python_type(spark_type).__name__
+                    )
+                )
             return F.mean(spark_column)
 
         return self._reduce_for_stat_function(
@@ -1208,7 +1212,11 @@ def sum(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
                 spark_column = spark_column.cast(LongType())
             elif not isinstance(spark_type, NumericType):
-                raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
+                raise TypeError(
+                    "Could not convert {} to numeric".format(
+                        spark_type_to_python_type(spark_type).__name__
+                    )
+                )
             return F.coalesce(F.sum(spark_column), F.lit(0))
 
         return self._reduce_for_stat_function(
@@ -1294,7 +1302,11 @@ def prod(spark_column, spark_type):
             if isinstance(spark_type, IntegralType):
                 scol = F.round(scol).cast(LongType())
             else:
-                raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
+                raise TypeError(
+                    "Could not convert {} to numeric".format(
+                        spark_type_to_python_type(spark_type).__name__
+                    )
+                )
 
             return F.coalesce(scol, F.lit(1))
 
@@ -1345,7 +1357,11 @@ def skew(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
                 spark_column = spark_column.cast(LongType())
             elif not isinstance(spark_type, NumericType):
-                raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
+                raise TypeError(
+                    "Could not convert {} to numeric".format(
+                        spark_type_to_python_type(spark_type).__name__
+                    )
+                )
             return F.skewness(spark_column)
 
         return self._reduce_for_stat_function(
@@ -1394,7 +1410,11 @@ def kurtosis(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
                 spark_column = spark_column.cast(LongType())
             elif not isinstance(spark_type, NumericType):
-                raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
+                raise TypeError(
+                    "Could not convert {} to numeric".format(
+                        spark_type_to_python_type(spark_type).__name__
+                    )
+                )
             return F.kurtosis(spark_column)
 
         return self._reduce_for_stat_function(
@@ -1621,7 +1641,11 @@ def std(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
                 spark_column = spark_column.cast(LongType())
             elif not isinstance(spark_type, NumericType):
-                raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
+                raise TypeError(
+                    "Could not convert {} to numeric".format(
+                        spark_type_to_python_type(spark_type).__name__
+                    )
+                )
             return F.stddev(spark_column)
 
         return self._reduce_for_stat_function(std, name="std", axis=axis, numeric_only=numeric_only)
@@ -1674,7 +1698,11 @@ def var(spark_column, spark_type):
             if isinstance(spark_type, BooleanType):
                 spark_column = spark_column.cast(LongType())
             elif not isinstance(spark_type, NumericType):
-                raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
+                raise TypeError(
+                    "Could not convert {} to numeric".format(
+                        spark_type_to_python_type(spark_type).__name__
+                    )
+                )
             return F.variance(spark_column)
 
         return self._reduce_for_stat_function(var, name="var", axis=axis, numeric_only=numeric_only)
@@ -1773,7 +1801,11 @@ def median(spark_column, spark_type):
             if isinstance(spark_type, (BooleanType, NumericType)):
                 return SF.percentile_approx(spark_column.cast(DoubleType()), 0.5, accuracy)
             else:
-                raise TypeError("Could not convert {} to numeric".format(spark_type.simpleString()))
+                raise TypeError(
+                    "Could not convert {} to numeric".format(
+                        spark_type_to_python_type(spark_type).__name__
+                    )
+                )
 
         return self._reduce_for_stat_function(
             median, name="median", numeric_only=numeric_only, axis=axis
@@ -1851,7 +1883,9 @@ def abs(kser):
                 return kser.spark.transform(F.abs)
             else:
                 raise TypeError(
-                    "bad operand type for abs(): {}".format(kser.spark.data_type.simpleString())
+                    "bad operand type for abs(): {}".format(
+                        spark_type_to_python_type(kser.spark.data_type).__name__
+                    )
                 )
 
         return self._apply_series_op(abs)
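Every reduction in generic.py above shares the same Python-type message format; a quick sketch of the abs() case, mirroring test_stats.py below (the single-string-column DataFrame here is a hypothetical construction):

    >>> import databricks.koalas as ks
    >>> kdf = ks.DataFrame({"D": ["a", "b", "c"]})
    >>> kdf.abs()
    Traceback (most recent call last):
      ...
    TypeError: bad operand type for abs(): str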
20 changes: 16 additions & 4 deletions databricks/koalas/series.py
@@ -82,7 +82,13 @@
 from databricks.koalas.spark import functions as SF
 from databricks.koalas.spark.accessors import SparkSeriesMethods
 from databricks.koalas.strings import StringMethods
-from databricks.koalas.typedef import infer_return_type, SeriesType, ScalarType, Scalar
+from databricks.koalas.typedef import (
+    infer_return_type,
+    spark_type_to_python_type,
+    SeriesType,
+    ScalarType,
+    Scalar,
+)
 
 
 # This regular expression pattern is compiled and defined here to avoid compiling the same
@@ -3302,7 +3308,9 @@ def quantile(spark_column, spark_type):
                 return SF.percentile_approx(spark_column.cast(DoubleType()), q, accuracy)
             else:
                 raise TypeError(
-                    "Could not convert {} to numeric".format(spark_type.simpleString())
+                    "Could not convert {} to numeric".format(
+                        spark_type_to_python_type(spark_type).__name__
+                    )
                 )
 
         return self._reduce_for_stat_function(quantile, name="quantile")
@@ -5703,7 +5711,9 @@ def _cumsum(self, skipna, part_cols=()):
             kser = kser.spark.transform(lambda scol: scol.cast(LongType()))
         elif not isinstance(kser.spark.data_type, NumericType):
             raise TypeError(
-                "Could not convert {} to numeric".format(kser.spark.data_type.simpleString())
+                "Could not convert {} to numeric".format(
+                    spark_type_to_python_type(kser.spark.data_type).__name__
+                )
             )
         return kser._cum(F.sum, skipna, part_cols)
 
@@ -5731,7 +5741,9 @@ def _cumprod(self, skipna, part_cols=()):
             scol = F.round(scol).cast(LongType())
         else:
             raise TypeError(
-                "Could not convert {} to numeric".format(self.spark.data_type.simpleString())
+                "Could not convert {} to numeric".format(
+                    spark_type_to_python_type(self.spark.data_type).__name__
+                )
             )
 
         return self._with_new_scol(scol)
4 changes: 2 additions & 2 deletions databricks/koalas/tests/test_dataframe.py
@@ -4283,9 +4283,9 @@ def test_quantile(self):
         self.assert_eq(kdf.quantile(0.5), pd.Series(name=0.5))
         self.assert_eq(kdf.quantile([0.25, 0.5, 0.75]), pd.DataFrame(index=[0.25, 0.5, 0.75]))
 
-        with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
+        with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
             kdf.quantile(0.5, numeric_only=False)
-        with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
+        with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
             kdf.quantile([0.25, 0.5, 0.75], numeric_only=False)
 
     def test_pct_change(self):
8 changes: 4 additions & 4 deletions databricks/koalas/tests/test_series.py
@@ -1267,9 +1267,9 @@ def test_quantile(self):
         with self.assertRaisesRegex(ValueError, "q must be a float or an array of floats;"):
             ks.Series([24.0, 21.0, 25.0, 33.0, 26.0]).quantile(q=["a"])
 
-        with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
+        with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
             ks.Series(["a", "b", "c"]).quantile()
-        with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
+        with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
             ks.Series(["a", "b", "c"]).quantile([0.25, 0.5, 0.75])
 
     def test_idxmax(self):
@@ -2228,9 +2228,9 @@ def test_product(self):
         kser = ks.from_pandas(pser)
         self.assert_eq(pser.prod(min_count=1), kser.prod(min_count=1))
 
-        with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
+        with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
             ks.Series(["a", "b", "c"]).prod()
-        with self.assertRaisesRegex(TypeError, "Could not convert timestamp to numeric"):
+        with self.assertRaisesRegex(TypeError, "Could not convert datetime to numeric"):
             ks.Series([pd.Timestamp("2016-01-01") for _ in range(3)]).prod()
 
     def test_hasnans(self):
8 changes: 4 additions & 4 deletions databricks/koalas/tests/test_stats.py
@@ -132,9 +132,9 @@ def test_abs(self):
         self.assert_eq(kdf[["B", "C"]].abs(), pdf[["B", "C"]].abs())
         self.assert_eq(kdf[["E"]].abs(), pdf[["E"]].abs())
 
-        with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
+        with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): str"):
             kdf.abs()
-        with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): string"):
+        with self.assertRaisesRegex(TypeError, "bad operand type for abs\\(\\): str"):
             kdf.D.abs()
 
     def test_axis_on_dataframe(self):
@@ -307,8 +307,8 @@ def test_numeric_only_unsupported(self):
             pdf[["i", "b"]].sum(numeric_only=False).astype(int),
         )
 
-        with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
+        with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
             kdf.sum(numeric_only=False)
 
-        with self.assertRaisesRegex(TypeError, "Could not convert string to numeric"):
+        with self.assertRaisesRegex(TypeError, "Could not convert str to numeric"):
             kdf.s.sum()
11 changes: 10 additions & 1 deletion databricks/koalas/tests/test_typedef.py
@@ -40,7 +40,7 @@
     TimestampType,
 )
 
-from databricks.koalas.typedef import infer_return_type, as_spark_type
+from databricks.koalas.typedef import infer_return_type, as_spark_type, spark_type_to_python_type
 from databricks import koalas as ks
 
 
@@ -246,7 +246,15 @@ def test_as_spark_type(self):
             # DecimalType
             decimal.Decimal: DecimalType(38, 18),
             # ArrayType
+            list: ArrayType(StringType()),
             np.ndarray: ArrayType(StringType()),
+        }
+
+        for numpy_or_python_type, spark_type in type_mapper.items():
+            self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
+            self.assertEqual(as_spark_type(spark_type_to_python_type(spark_type)), spark_type)
+
+        type_mapper = {
             List[bytes]: ArrayType(BinaryType()),
             List[np.character]: ArrayType(BinaryType()),
             List[np.bytes_]: ArrayType(BinaryType()),
@@ -274,3 +282,4 @@ def test_as_spark_type(self):
 
         for numpy_or_python_type, spark_type in type_mapper.items():
             self.assertEqual(as_spark_type(numpy_or_python_type), spark_type)
+            self.assertEqual(spark_type_to_python_type(spark_type), list)
43 changes: 41 additions & 2 deletions databricks/koalas/typedef/typehints.py
@@ -103,7 +103,7 @@ def as_spark_type(tpe) -> types.DataType:
     """
     # TODO: Add "boolean" and "string" types.
     # ArrayType
-    if tpe in (np.ndarray,):
+    if tpe in (list, np.ndarray,):
         return types.ArrayType(types.StringType())
     elif hasattr(tpe, "__origin__") and issubclass(tpe.__origin__, list):
         return types.ArrayType(as_spark_type(tpe.__args__[0]))
@@ -142,7 +142,46 @@ def as_spark_type(tpe) -> types.DataType:
     raise TypeError("Type %s was not understood." % tpe)
 
 
-def spark_type_to_pandas_dtype(spark_type):
+def spark_type_to_python_type(spark_type: types.DataType) -> type:
+    """Return the Python type corresponding to the given Spark DataType."""
+    # ArrayType
+    if isinstance(spark_type, types.ArrayType):
+        return list
+    # BinaryType
+    elif isinstance(spark_type, types.BinaryType):
+        return bytes
+    # BooleanType
+    elif isinstance(spark_type, types.BooleanType):
+        return bool
+    # DateType
+    elif isinstance(spark_type, types.DateType):
+        return datetime.date
+    # NumericType
+    elif isinstance(spark_type, types.ByteType):
+        return np.int8
+    elif isinstance(spark_type, types.DecimalType):
+        return decimal.Decimal
+    elif isinstance(spark_type, types.DoubleType):
+        return float
+    elif isinstance(spark_type, types.FloatType):
+        return np.float32
+    elif isinstance(spark_type, types.IntegerType):
+        return np.int32
+    elif isinstance(spark_type, types.LongType):
+        return int
+    elif isinstance(spark_type, types.ShortType):
+        return np.int16
+    # StringType
+    elif isinstance(spark_type, types.StringType):
+        return str
+    # TimestampType
+    elif isinstance(spark_type, types.TimestampType):
+        return datetime.datetime
+    else:
+        raise TypeError("Type %s was not understood." % spark_type.simpleString())
+
+
+def spark_type_to_pandas_dtype(spark_type: types.DataType) -> np.dtype:
     """Return the pandas dtype corresponding to the given Spark DataType."""
     if isinstance(spark_type, (types.DateType, types.StructType, types.UserDefinedType)):
         return np.dtype("object")
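The new helper is importable from databricks.koalas.typedef; a short doctest-style sketch of the mapping implemented above:

    >>> from pyspark.sql.types import ArrayType, StringType, TimestampType
    >>> from databricks.koalas.typedef import spark_type_to_python_type
    >>> spark_type_to_python_type(StringType())
    <class 'str'>
    >>> spark_type_to_python_type(ArrayType(StringType()))
    <class 'list'>
    >>> spark_type_to_python_type(TimestampType()).__name__
    'datetime'

The last line is why timestamp columns now report "Could not convert datetime to numeric": datetime.datetime.__name__ is 'datetime'.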
7 changes: 4 additions & 3 deletions docs/source/user_guide/types.rst
@@ -164,9 +164,9 @@ np.ndarray ArrayType(StringType())
 
 The table below shows which Python data types are matched to which PySpark data types internally in Koalas.
 
-================= ===================
+================= =======================
 Python            PySpark
-================= ===================
+================= =======================
 bytes             BinaryType
 int               LongType
 float             DoubleType
@@ -175,7 +175,8 @@ bool BooleanType
 datetime.datetime TimestampType
 datetime.date     DateType
 decimal.Decimal   DecimalType(38, 18)
-================= ===================
+list              ArrayType(StringType())
+================= =======================
 
 For decimal type, Koalas uses Spark's system default precision and scale.

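A minimal sketch of the new list mapping documented above; comparing types directly avoids depending on the Spark-version-specific repr of DataType:

    >>> import numpy as np
    >>> from pyspark.sql.types import ArrayType, StringType
    >>> from databricks.koalas.typedef import as_spark_type
    >>> as_spark_type(list) == ArrayType(StringType())
    True
    >>> as_spark_type(list) == as_spark_type(np.ndarray)
    True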