3131 import Queue
3232else :
3333 import queue as Queue
34+ from distutils .version import LooseVersion
3435
3536
3637# Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module
3940
4041from sparktestsupport import SPARK_HOME # noqa (suppress pep8 warnings)
4142from sparktestsupport .shellutils import which , subprocess_check_output # noqa
42- from sparktestsupport .modules import all_modules # noqa
43+ from sparktestsupport .modules import all_modules , pyspark_sql # noqa
4344
4445
4546python_modules = dict ((m .name , m ) for m in all_modules if m .python_test_goals if m .name != 'root' )
@@ -151,6 +152,55 @@ def parse_opts():
151152 return opts
152153
153154
def _check_dependencies(python_exec, modules_to_test):
    """Log which optional dependencies are available for the given Python executable.

    If 'pyspark-sql' is among the modules to test, probe the interpreter at
    *python_exec* for PyArrow and Pandas and explicitly log whether the related
    test features will run or be skipped. See SPARK-23300.

    :param python_exec: path to the Python executable used to run the tests.
    :param modules_to_test: collection of sparktestsupport module objects.
    """
    if pyspark_sql in modules_to_test:
        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
        _check_optional_dependency(python_exec, "PyArrow", "pyarrow", '0.8.0')
        _check_optional_dependency(python_exec, "Pandas", "pandas", '0.19.2')


def _check_optional_dependency(python_exec, display_name, package, minimum_version):
    """Probe *python_exec* for *package* and log whether related tests will run.

    Logs at INFO level when the installed version satisfies *minimum_version*,
    and at WARNING level when the package is missing or too old. Never raises:
    a missing or broken package is an expected, handled condition.

    :param python_exec: path to the Python executable to probe.
    :param display_name: human-readable package name used in log messages.
    :param package: importable module name of the package.
    :param minimum_version: minimum acceptable version string.
    """
    try:
        # Open devnull via 'with' so the handle is always closed; the previous
        # code leaked it, including when the subprocess call raised.
        with open(os.devnull, 'w') as devnull:
            version = subprocess_check_output(
                [python_exec, "-c",
                 "import %s; print(%s.__version__)" % (package, package)],
                universal_newlines=True,
                stderr=devnull).strip()
    except Exception:
        # 'except Exception', not bare 'except:': lets KeyboardInterrupt and
        # SystemExit propagate while still treating an absent package as a
        # non-fatal, explicitly-logged skip.
        LOGGER.warning(
            "Will skip %s related features against Python executable "
            "'%s' in '%s' module. %s >= %s is required; however, %s "
            "was not found." % (
                display_name, python_exec, pyspark_sql.name, display_name,
                minimum_version, display_name))
        return

    if LooseVersion(version) >= LooseVersion(minimum_version):
        LOGGER.info("Will test %s related features against Python executable "
                    "'%s' in '%s' module." % (
                        display_name, python_exec, pyspark_sql.name))
    else:
        LOGGER.warning(
            "Will skip %s related features against Python executable "
            "'%s' in '%s' module. %s >= %s is required; however, %s "
            "%s was found." % (
                display_name, python_exec, pyspark_sql.name, display_name,
                minimum_version, display_name, version))
203+
154204def main ():
155205 opts = parse_opts ()
156206 if (opts .verbose ):
@@ -175,6 +225,10 @@ def main():
175225
176226 task_queue = Queue .PriorityQueue ()
177227 for python_exec in python_execs :
228+ # Check if the python executable has proper dependencies installed to run tests
229+ # for given modules properly.
230+ _check_dependencies (python_exec , modules_to_test )
231+
178232 python_implementation = subprocess_check_output (
179233 [python_exec , "-c" , "import platform; print(platform.python_implementation())" ],
180234 universal_newlines = True ).strip ()
0 commit comments