3131 import Queue
3232else :
3333 import queue as Queue
34+ from distutils .version import LooseVersion
3435
3536
3637# Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module
3940
4041from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings)
4142from sparktestsupport .shellutils import which , subprocess_check_output # noqa
42- from sparktestsupport .modules import all_modules # noqa
43+ from sparktestsupport .modules import all_modules , pyspark_sql # noqa
4344
4445
4546python_modules = dict ((m .name , m ) for m in all_modules if m .python_test_goals if m .name != 'root' )
@@ -151,6 +152,55 @@ def parse_opts():
151152 return opts
152153
153154
def _check_dependencies(python_exec, modules_to_test):
    """Log which optional dependencies are available for the given Python executable.

    If 'pyspark-sql' is among the modules to test, probe the interpreter at
    *python_exec* for PyArrow and Pandas and explicitly log whether the related
    test features will run or be skipped. See SPARK-23300.

    :param python_exec: path to the Python executable used to run the tests.
    :param modules_to_test: collection of sparktestsupport module objects.
    """
    if pyspark_sql in modules_to_test:
        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
        _check_optional_dependency(python_exec, "PyArrow", "pyarrow", '0.8.0')
        _check_optional_dependency(python_exec, "Pandas", "pandas", '0.19.2')


def _check_optional_dependency(python_exec, display_name, package, minimum_version):
    """Probe *python_exec* for *package* and log whether related tests will run.

    Logs at INFO level when the installed version satisfies *minimum_version*,
    and at WARNING level when the package is missing or too old. Never raises:
    a missing or broken package is an expected, handled condition.

    :param python_exec: path to the Python executable to probe.
    :param display_name: human-readable package name used in log messages.
    :param package: importable module name of the package.
    :param minimum_version: minimum acceptable version string.
    """
    try:
        # Open devnull via 'with' so the handle is always closed; the previous
        # code leaked it, including when the subprocess call raised.
        with open(os.devnull, 'w') as devnull:
            version = subprocess_check_output(
                [python_exec, "-c",
                 "import %s; print(%s.__version__)" % (package, package)],
                universal_newlines=True,
                stderr=devnull).strip()
    except Exception:
        # 'except Exception', not bare 'except:': lets KeyboardInterrupt and
        # SystemExit propagate while still treating an absent package as a
        # non-fatal, explicitly-logged skip.
        LOGGER.warning(
            "Will skip %s related features against Python executable "
            "'%s' in '%s' module. %s >= %s is required; however, %s "
            "was not found." % (
                display_name, python_exec, pyspark_sql.name, display_name,
                minimum_version, display_name))
        return

    if LooseVersion(version) >= LooseVersion(minimum_version):
        LOGGER.info("Will test %s related features against Python executable "
                    "'%s' in '%s' module." % (
                        display_name, python_exec, pyspark_sql.name))
    else:
        LOGGER.warning(
            "Will skip %s related features against Python executable "
            "'%s' in '%s' module. %s >= %s is required; however, %s "
            "%s was found." % (
                display_name, python_exec, pyspark_sql.name, display_name,
                minimum_version, display_name, version))
203+
154204def main ():
155205 opts = parse_opts ()
156206 if (opts .verbose ):
@@ -175,6 +225,10 @@ def main():
175225
176226 task_queue = Queue .PriorityQueue ()
177227 for python_exec in python_execs :
228+ # Check if the python executable has proper dependencies installed to run tests
229+ # for given modules properly.
230+ _check_dependencies (python_exec , modules_to_test )
231+
178232 python_implementation = subprocess_check_output (
179233 [python_exec , "-c" , "import platform; print(platform.python_implementation())" ],
180234 universal_newlines = True ).strip ()
0 commit comments