
Commit 13190a4

ueshin authored and HyukjinKwon committed
[SPARK-22874][PYSPARK][SQL] Modify checking pandas version to use LooseVersion.
## What changes were proposed in this pull request?

Currently we check the pandas version by catching whether `ImportError` is raised for specific imports, but we can instead compare `LooseVersion`s of the version strings, the same way we already check the pyarrow version.

## How was this patch tested?

Existing tests.

Author: Takuya UESHIN <[email protected]>

Closes #20054 from ueshin/issues/SPARK-22874.
1 parent 8df1da3 commit 13190a4
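As a rough sketch of the idea (illustrative, though it mirrors the helper added to python/pyspark/sql/utils.py below): rather than inferring "pandas is too old" from an `ImportError` on a newer-API import, the version string is compared directly:

```python
from distutils.version import LooseVersion
import pandas

# Compare version strings component-wise (so '0.19.2' < '0.21.0' as expected),
# mirroring how the pyarrow minimum-version check already works.
if LooseVersion(pandas.__version__) < LooseVersion('0.19.2'):
    raise ImportError("Pandas >= 0.19.2 must be installed on calling Python process")
```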


6 files changed (+38, −36 lines)


python/pyspark/sql/dataframe.py

Lines changed: 2 additions & 2 deletions

@@ -1906,9 +1906,9 @@ def toPandas(self):
         if self.sql_ctx.getConf("spark.sql.execution.arrow.enabled", "false").lower() == "true":
             try:
                 from pyspark.sql.types import _check_dataframe_localize_timestamps
-                from pyspark.sql.utils import _require_minimum_pyarrow_version
+                from pyspark.sql.utils import require_minimum_pyarrow_version
                 import pyarrow
-                _require_minimum_pyarrow_version()
+                require_minimum_pyarrow_version()
                 tables = self._collectAsArrow()
                 if tables:
                     table = pyarrow.concat_tables(tables)

python/pyspark/sql/session.py

Lines changed: 7 additions & 8 deletions

@@ -493,15 +493,14 @@ def _create_from_pandas_with_arrow(self, pdf, schema, timezone):
         data types will be used to coerce the data in Pandas to Arrow conversion.
         """
         from pyspark.serializers import ArrowSerializer, _create_batch
-        from pyspark.sql.types import from_arrow_schema, to_arrow_type, \
-            _old_pandas_exception_message, TimestampType
-        from pyspark.sql.utils import _require_minimum_pyarrow_version
-        try:
-            from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
-        except ImportError as e:
-            raise ImportError(_old_pandas_exception_message(e))
+        from pyspark.sql.types import from_arrow_schema, to_arrow_type, TimestampType
+        from pyspark.sql.utils import require_minimum_pandas_version, \
+            require_minimum_pyarrow_version
+
+        require_minimum_pandas_version()
+        require_minimum_pyarrow_version()
 
-        _require_minimum_pyarrow_version()
+        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
 
         # Determine arrow types to coerce data when creating batches
         if isinstance(schema, StructType):

python/pyspark/sql/tests.py

Lines changed: 4 additions & 3 deletions

@@ -53,7 +53,8 @@
 try:
     import pandas
     try:
-        import pandas.api
+        from pyspark.sql.utils import require_minimum_pandas_version
+        require_minimum_pandas_version()
         _have_pandas = True
     except:
         _have_old_pandas = True

@@ -2600,7 +2601,7 @@ def test_to_pandas(self):
     @unittest.skipIf(not _have_old_pandas, "Old Pandas not installed")
     def test_to_pandas_old(self):
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(ImportError, 'Pandas \(.*\) must be installed'):
+            with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'):
                 self._to_pandas()
 
     @unittest.skipIf(not _have_pandas, "Pandas not installed")

@@ -2643,7 +2644,7 @@ def test_create_dataframe_from_old_pandas(self):
         pdf = pd.DataFrame({"ts": [datetime(2017, 10, 31, 1, 1, 1)],
                             "d": [pd.Timestamp.now().date()]})
         with QuietTest(self.sc):
-            with self.assertRaisesRegexp(ImportError, 'Pandas \(.*\) must be installed'):
+            with self.assertRaisesRegexp(ImportError, 'Pandas >= .* must be installed'):
                 self.spark.createDataFrame(pdf)
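As a quick sanity check (illustrative, not part of the patch), the updated test regex matches the message now raised by `require_minimum_pandas_version`, while the old parenthesized pattern does not:

```python
import re

# Message raised by the new check in python/pyspark/sql/utils.py
msg = "Pandas >= 0.19.2 must be installed on calling Python process"
assert re.search(r'Pandas >= .* must be installed', msg)       # new pattern: matches
assert not re.search(r'Pandas \(.*\) must be installed', msg)  # old pattern: no match
```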

python/pyspark/sql/types.py

Lines changed: 13 additions & 20 deletions

@@ -1678,13 +1678,6 @@ def from_arrow_schema(arrow_schema):
                        for field in arrow_schema])
 
 
-def _old_pandas_exception_message(e):
-    """ Create an error message for importing old Pandas.
-    """
-    msg = "note: Pandas (>=0.19.2) must be installed and available on calling Python process"
-    return "%s\n%s" % (_exception_message(e), msg)
-
-
 def _check_dataframe_localize_timestamps(pdf, timezone):
     """
     Convert timezone aware timestamps to timezone-naive in the specified timezone or local timezone

@@ -1693,10 +1686,10 @@ def _check_dataframe_localize_timestamps(pdf, timezone):
     :param timezone: the timezone to convert. if None then use local timezone
     :return pandas.DataFrame where any timezone aware columns have been converted to tz-naive
     """
-    try:
-        from pandas.api.types import is_datetime64tz_dtype
-    except ImportError as e:
-        raise ImportError(_old_pandas_exception_message(e))
+    from pyspark.sql.utils import require_minimum_pandas_version
+    require_minimum_pandas_version()
+
+    from pandas.api.types import is_datetime64tz_dtype
     tz = timezone or 'tzlocal()'
     for column, series in pdf.iteritems():
         # TODO: handle nested timestamps, such as ArrayType(TimestampType())?

@@ -1714,10 +1707,10 @@ def _check_series_convert_timestamps_internal(s, timezone):
     :param timezone: the timezone to convert. if None then use local timezone
     :return pandas.Series where if it is a timestamp, has been UTC normalized without a time zone
     """
-    try:
-        from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
-    except ImportError as e:
-        raise ImportError(_old_pandas_exception_message(e))
+    from pyspark.sql.utils import require_minimum_pandas_version
+    require_minimum_pandas_version()
+
+    from pandas.api.types import is_datetime64_dtype, is_datetime64tz_dtype
     # TODO: handle nested timestamps, such as ArrayType(TimestampType())?
     if is_datetime64_dtype(s.dtype):
         tz = timezone or 'tzlocal()'

@@ -1737,11 +1730,11 @@ def _check_series_convert_timestamps_localize(s, from_timezone, to_timezone):
     :param to_timezone: the timezone to convert to. if None then use local timezone
     :return pandas.Series where if it is a timestamp, has been converted to tz-naive
     """
-    try:
-        import pandas as pd
-        from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
-    except ImportError as e:
-        raise ImportError(_old_pandas_exception_message(e))
+    from pyspark.sql.utils import require_minimum_pandas_version
+    require_minimum_pandas_version()
+
+    import pandas as pd
+    from pandas.api.types import is_datetime64tz_dtype, is_datetime64_dtype
     from_tz = from_timezone or 'tzlocal()'
     to_tz = to_timezone or 'tzlocal()'
     # TODO: handle nested timestamps, such as ArrayType(TimestampType())?

python/pyspark/sql/udf.py

Lines changed: 2 additions & 2 deletions

@@ -37,9 +37,9 @@ def _create_udf(f, returnType, evalType):
     if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF or \
             evalType == PythonEvalType.SQL_PANDAS_GROUP_MAP_UDF:
         import inspect
-        from pyspark.sql.utils import _require_minimum_pyarrow_version
+        from pyspark.sql.utils import require_minimum_pyarrow_version
 
-        _require_minimum_pyarrow_version()
+        require_minimum_pyarrow_version()
         argspec = inspect.getargspec(f)
 
         if evalType == PythonEvalType.SQL_PANDAS_SCALAR_UDF and len(argspec.args) == 0 and \

python/pyspark/sql/utils.py

Lines changed: 10 additions & 1 deletion

@@ -112,7 +112,16 @@ def toJArray(gateway, jtype, arr):
     return jarr
 
 
-def _require_minimum_pyarrow_version():
+def require_minimum_pandas_version():
+    """ Raise ImportError if minimum version of Pandas is not installed
+    """
+    from distutils.version import LooseVersion
+    import pandas
+    if LooseVersion(pandas.__version__) < LooseVersion('0.19.2'):
+        raise ImportError("Pandas >= 0.19.2 must be installed on calling Python process")
+
+
+def require_minimum_pyarrow_version():
     """ Raise ImportError if minimum version of pyarrow is not installed
     """
     from distutils.version import LooseVersion
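A brief usage sketch (illustrative; assumes pyspark is importable on the calling Python process): callers invoke the helper and either let the `ImportError` propagate or handle it:

```python
from pyspark.sql.utils import require_minimum_pandas_version

try:
    require_minimum_pandas_version()
except ImportError as e:
    # e.g. "Pandas >= 0.19.2 must be installed on calling Python process"
    print("Arrow/pandas conversion unavailable: %s" % e)
```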
