Commit 8141c3e

[SPARK-23300][TESTS] Prints out if Pandas and PyArrow are installed or not in PySpark SQL tests
## What changes were proposed in this pull request?

This PR logs whether PyArrow and Pandas are installed, so we can tell up front whether the related tests will run or be skipped.

## How was this patch tested?

Manually tested. I don't have PyArrow installed in PyPy.

```bash
$ ./run-tests --python-executables=python3
```

```
...
Will test against the following Python executables: ['python3']
Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming']
Will test PyArrow related features against Python executable 'python3' in 'pyspark-sql' module.
Will test Pandas related features against Python executable 'python3' in 'pyspark-sql' module.
Starting test(python3): pyspark.mllib.tests
Starting test(python3): pyspark.sql.tests
Starting test(python3): pyspark.streaming.tests
Starting test(python3): pyspark.tests
```

```bash
$ ./run-tests --modules=pyspark-streaming
```

```
...
Will test against the following Python executables: ['python2.7', 'pypy']
Will test the following Python modules: ['pyspark-streaming']
Starting test(pypy): pyspark.streaming.tests
Starting test(pypy): pyspark.streaming.util
Starting test(python2.7): pyspark.streaming.tests
Starting test(python2.7): pyspark.streaming.util
```

```bash
$ ./run-tests
```

```
...
Will test against the following Python executables: ['python2.7', 'pypy']
Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming']
Will test PyArrow related features against Python executable 'python2.7' in 'pyspark-sql' module.
Will test Pandas related features against Python executable 'python2.7' in 'pyspark-sql' module.
Will skip PyArrow related features against Python executable 'pypy' in 'pyspark-sql' module. PyArrow >= 0.8.0 is required; however, PyArrow was not found.
Will test Pandas related features against Python executable 'pypy' in 'pyspark-sql' module.
Starting test(pypy): pyspark.streaming.tests
Starting test(pypy): pyspark.sql.tests
Starting test(pypy): pyspark.tests
Starting test(python2.7): pyspark.mllib.tests
```

```bash
$ ./run-tests --modules=pyspark-sql --python-executables=pypy
```

```
...
Will test against the following Python executables: ['pypy']
Will test the following Python modules: ['pyspark-sql']
Will skip PyArrow related features against Python executable 'pypy' in 'pyspark-sql' module. PyArrow >= 0.8.0 is required; however, PyArrow was not found.
Will test Pandas related features against Python executable 'pypy' in 'pyspark-sql' module.
Starting test(pypy): pyspark.sql.tests
Starting test(pypy): pyspark.sql.catalog
Starting test(pypy): pyspark.sql.column
Starting test(pypy): pyspark.sql.conf
```

After some modification to produce other cases:

```bash
$ ./run-tests
```

```
...
Will test against the following Python executables: ['python2.7', 'pypy']
Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming']
Will skip PyArrow related features against Python executable 'python2.7' in 'pyspark-sql' module. PyArrow >= 20.0.0 is required; however, PyArrow 0.8.0 was found.
Will skip Pandas related features against Python executable 'python2.7' in 'pyspark-sql' module. Pandas >= 20.0.0 is required; however, Pandas 0.20.2 was found.
Will skip PyArrow related features against Python executable 'pypy' in 'pyspark-sql' module. PyArrow >= 20.0.0 is required; however, PyArrow was not found.
Will skip Pandas related features against Python executable 'pypy' in 'pyspark-sql' module. Pandas >= 20.0.0 is required; however, Pandas 0.22.0 was found.
Starting test(pypy): pyspark.sql.tests
Starting test(pypy): pyspark.streaming.tests
Starting test(pypy): pyspark.tests
Starting test(python2.7): pyspark.mllib.tests
```

```bash
./run-tests-with-coverage
```

```
...
Will test against the following Python executables: ['python2.7', 'pypy']
Will test the following Python modules: ['pyspark-core', 'pyspark-ml', 'pyspark-mllib', 'pyspark-sql', 'pyspark-streaming']
Will test PyArrow related features against Python executable 'python2.7' in 'pyspark-sql' module.
Will test Pandas related features against Python executable 'python2.7' in 'pyspark-sql' module.
Coverage is not installed in Python executable 'pypy' but 'COVERAGE_PROCESS_START' environment variable is set, exiting.
```

Author: hyukjinkwon <[email protected]>

Closes #20473 from HyukjinKwon/SPARK-23300.
1 parent a24c031 commit 8141c3e
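
For readers skimming the commit, the core mechanism is worth spelling out: the test runner cannot simply `import pyarrow` itself, because each Python executable under test ('python2.7', 'pypy', ...) has its own site-packages. The check therefore shells out to the target interpreter and reads the version back. Below is a minimal standalone sketch of that pattern (my illustration, not code from this commit; `probe_package_version` is a hypothetical helper):

```python
# Minimal sketch (not from the commit): ask another Python executable for a
# package's version by running a one-liner in a subprocess. Returns None if
# that interpreter cannot import the package or doesn't exist.
import subprocess


def probe_package_version(python_exec, package):
    try:
        out = subprocess.check_output(
            [python_exec, "-c", "import %s; print(%s.__version__)" % (package, package)],
            stderr=subprocess.DEVNULL,
            universal_newlines=True)
        return out.strip()
    except (subprocess.CalledProcessError, OSError):
        # Import failed, or the executable itself was not found.
        return None


# e.g. probe_package_version("pypy", "pyarrow") returns None on an interpreter
# without PyArrow, which corresponds to the "PyArrow was not found" log above.
```

The commit's `_check_dependencies` does essentially this via `subprocess_check_output` from `sparktestsupport.shellutils`, then compares the result against a minimum version before logging whether the features will be tested or skipped.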


python/run-tests.py

Lines changed: 68 additions & 5 deletions
```diff
@@ -31,15 +31,16 @@
     import Queue
 else:
     import queue as Queue
+from distutils.version import LooseVersion


 # Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module
 sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../dev/"))


 from sparktestsupport import SPARK_HOME  # noqa (suppress pep8 warnings)
-from sparktestsupport.shellutils import which, subprocess_check_output, run_cmd  # noqa
-from sparktestsupport.modules import all_modules  # noqa
+from sparktestsupport.shellutils import which, subprocess_check_output  # noqa
+from sparktestsupport.modules import all_modules, pyspark_sql  # noqa


 python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root')
@@ -151,6 +152,67 @@ def parse_opts():
     return opts


+def _check_dependencies(python_exec, modules_to_test):
+    if "COVERAGE_PROCESS_START" in os.environ:
+        # Make sure if coverage is installed.
+        try:
+            subprocess_check_output(
+                [python_exec, "-c", "import coverage"],
+                stderr=open(os.devnull, 'w'))
+        except:
+            print_red("Coverage is not installed in Python executable '%s' "
+                      "but 'COVERAGE_PROCESS_START' environment variable is set, "
+                      "exiting." % python_exec)
+            sys.exit(-1)
+
+    # If we should test 'pyspark-sql', it checks if PyArrow and Pandas are installed and
+    # explicitly prints out. See SPARK-23300.
+    if pyspark_sql in modules_to_test:
+        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
+        minimum_pyarrow_version = '0.8.0'
+        minimum_pandas_version = '0.19.2'
+
+        try:
+            pyarrow_version = subprocess_check_output(
+                [python_exec, "-c", "import pyarrow; print(pyarrow.__version__)"],
+                universal_newlines=True,
+                stderr=open(os.devnull, 'w')).strip()
+            if LooseVersion(pyarrow_version) >= LooseVersion(minimum_pyarrow_version):
+                LOGGER.info("Will test PyArrow related features against Python executable "
+                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
+            else:
+                LOGGER.warning(
+                    "Will skip PyArrow related features against Python executable "
+                    "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
+                    "%s was found." % (
+                        python_exec, pyspark_sql.name, minimum_pyarrow_version, pyarrow_version))
+        except:
+            LOGGER.warning(
+                "Will skip PyArrow related features against Python executable "
+                "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
+                "was not found." % (python_exec, pyspark_sql.name, minimum_pyarrow_version))
+
+        try:
+            pandas_version = subprocess_check_output(
+                [python_exec, "-c", "import pandas; print(pandas.__version__)"],
+                universal_newlines=True,
+                stderr=open(os.devnull, 'w')).strip()
+            if LooseVersion(pandas_version) >= LooseVersion(minimum_pandas_version):
+                LOGGER.info("Will test Pandas related features against Python executable "
+                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
+            else:
+                LOGGER.warning(
+                    "Will skip Pandas related features against Python executable "
+                    "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
+                    "%s was found." % (
+                        python_exec, pyspark_sql.name, minimum_pandas_version, pandas_version))
+        except:
+            LOGGER.warning(
+                "Will skip Pandas related features against Python executable "
+                "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
+                "was not found." % (python_exec, pyspark_sql.name, minimum_pandas_version))
+
+
 def main():
     opts = parse_opts()
     if (opts.verbose):
@@ -175,9 +237,10 @@ def main():

     task_queue = Queue.PriorityQueue()
     for python_exec in python_execs:
-        if "COVERAGE_PROCESS_START" in os.environ:
-            # Make sure if coverage is installed.
-            run_cmd([python_exec, "-c", "import coverage"])
+        # Check if the python executable has proper dependencies installed to run tests
+        # for given modules properly.
+        _check_dependencies(python_exec, modules_to_test)
+
         python_implementation = subprocess_check_output(
             [python_exec, "-c", "import platform; print(platform.python_implementation())"],
             universal_newlines=True).strip()
```

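A side note on the comparison the new code relies on: `distutils.version.LooseVersion` parses version strings into components and compares them numerically, which plain string comparison gets wrong once a component reaches two digits. A quick illustration (my example, not part of the commit; `distutils` has since been deprecated in favor of `packaging.version`, but it was the standard-library choice at the time):

```python
from distutils.version import LooseVersion

# Component-wise comparison: 0.10.0 is newer than 0.8.0, even though
# lexicographic string comparison says otherwise ('1' < '8').
assert LooseVersion("0.10.0") >= LooseVersion("0.8.0")
assert not ("0.10.0" >= "0.8.0")  # plain string comparison gets this wrong

# Mirrors the minimum-version gate in _check_dependencies:
minimum_pyarrow_version = "0.8.0"
for found in ["0.7.1", "0.8.0", "0.10.0"]:
    ok = LooseVersion(found) >= LooseVersion(minimum_pyarrow_version)
    print("PyArrow %s -> %s" % (found, "test" if ok else "skip"))
```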