-
Notifications
You must be signed in to change notification settings - Fork 29.3k
[SPARK-24554][PYTHON][SQL] Add MapType support for PySpark with Arrow #30393
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
642934f
8a563ca
3ee35b8
dfec9a5
aed88b2
cbe6c23
2db803b
a92af2f
dec2797
78b2604
b257470
3f2ef98
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,14 +20,15 @@ | |
| pandas instances during the type conversion. | ||
| """ | ||
|
|
||
| from pyspark.sql.types import ByteType, ShortType, IntegerType, LongType, FloatType, \ | ||
| DoubleType, DecimalType, StringType, BinaryType, DateType, TimestampType, ArrayType, \ | ||
| StructType, StructField, BooleanType | ||
| from pyspark.sql.types import BooleanType, ByteType, ShortType, IntegerType, LongType, \ | ||
| FloatType, DoubleType, DecimalType, StringType, BinaryType, DateType, TimestampType, \ | ||
| ArrayType, MapType, StructType, StructField | ||
|
|
||
|
|
||
| def to_arrow_type(dt): | ||
| """ Convert Spark data type to pyarrow type | ||
| """ | ||
| from distutils.version import LooseVersion | ||
| import pyarrow as pa | ||
| if type(dt) == BooleanType: | ||
| arrow_type = pa.bool_() | ||
|
|
@@ -58,6 +59,13 @@ def to_arrow_type(dt): | |
| if type(dt.elementType) in [StructType, TimestampType]: | ||
| raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) | ||
| arrow_type = pa.list_(to_arrow_type(dt.elementType)) | ||
| elif type(dt) == MapType: | ||
| if LooseVersion(pa.__version__) < LooseVersion("2.0.0"): | ||
| raise TypeError("MapType is only supported with pyarrow 2.0.0 and above") | ||
| if type(dt.keyType) in [StructType, TimestampType] or \ | ||
| type(dt.valueType) in [StructType, TimestampType]: | ||
| raise TypeError("Unsupported type in conversion to Arrow: " + str(dt)) | ||
| arrow_type = pa.map_(to_arrow_type(dt.keyType), to_arrow_type(dt.valueType)) | ||
| elif type(dt) == StructType: | ||
| if any(type(field.dataType) == StructType for field in dt): | ||
| raise TypeError("Nested StructType not supported in conversion to Arrow") | ||
|
|
@@ -81,6 +89,8 @@ def to_arrow_schema(schema): | |
| def from_arrow_type(at): | ||
| """ Convert pyarrow type to Spark data type. | ||
| """ | ||
| from distutils.version import LooseVersion | ||
| import pyarrow as pa | ||
| import pyarrow.types as types | ||
| if types.is_boolean(at): | ||
| spark_type = BooleanType() | ||
|
|
@@ -110,6 +120,12 @@ def from_arrow_type(at): | |
| if types.is_timestamp(at.value_type): | ||
| raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) | ||
| spark_type = ArrayType(from_arrow_type(at.value_type)) | ||
| elif types.is_map(at): | ||
| if LooseVersion(pa.__version__) < LooseVersion("2.0.0"): | ||
| raise TypeError("MapType is only supported with pyarrow 2.0.0 and above") | ||
| if types.is_timestamp(at.key_type) or types.is_timestamp(at.item_type): | ||
| raise TypeError("Unsupported type in conversion from Arrow: " + str(at)) | ||
| spark_type = MapType(from_arrow_type(at.key_type), from_arrow_type(at.item_type)) | ||
| elif types.is_struct(at): | ||
| if any(types.is_struct(field.type) for field in at): | ||
| raise TypeError("Nested StructType not supported in conversion from Arrow: " + str(at)) | ||
|
|
@@ -306,3 +322,23 @@ def _check_series_convert_timestamps_tz_local(s, timezone): | |
| `pandas.Series` where if it is a timestamp, has been converted to tz-naive | ||
| """ | ||
| return _check_series_convert_timestamps_localize(s, timezone, None) | ||
|
|
||
|
|
||
| def _convert_map_items_to_dict(s): | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note: these conversion functions are because pyarrow expects map items as a list of (key, value) pairs, and has this format when converting to Pandas also. The reason is that the arrow spec could allow for duplicate key values in a row, and doesn't say how these should be handled exactly. So by having these conversions, we match the non-arrow behavior for maps, with a dictionary as input/output. |
||
| """ | ||
| Convert a series with items as list of (key, value), as made from an Arrow column of map type, | ||
| to dict for compatibility with non-arrow MapType columns. | ||
| :param s: pandas.Series of lists of (key, value) pairs | ||
| :return: pandas.Series of dictionaries | ||
| """ | ||
| return s.apply(lambda m: None if m is None else {k: v for k, v in m}) | ||
|
|
||
|
|
||
| def _convert_dict_to_map_items(s): | ||
| """ | ||
| Convert a series of dictionaries to list of (key, value) pairs to match expected data | ||
| for Arrow column of map type. | ||
| :param s: pandas.Series of dictionaries | ||
| :return: pandas.Series of lists of (key, value) pairs | ||
| """ | ||
| return s.apply(lambda d: list(d.items()) if d is not None else None) | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I should probably mention MapType only for pyarrow 2.0.0..
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
done