From fe2943e00d78aff2fea41871b2d8091d4393bbef Mon Sep 17 00:00:00 2001
From: hyukjinkwon <gurwls223@gmail.com>
Date: Thu, 1 Feb 2018 21:55:19 +0900
Subject: [PATCH 1/2] Prints out if Pandas and PyArrow are installed or not in
 PySpark SQL tests

---
 python/run-tests.py | 72 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 67 insertions(+), 5 deletions(-)

diff --git a/python/run-tests.py b/python/run-tests.py
index f03284c33428..5f174ec9c043 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -31,6 +31,7 @@
     import Queue
 else:
     import queue as Queue
+from distutils.version import LooseVersion
 
 
 # Append `SPARK_HOME/dev` to the Python path so that we can import the sparktestsupport module
@@ -38,8 +39,8 @@
 
 
 from sparktestsupport import SPARK_HOME  # noqa (suppress pep8 warnings)
-from sparktestsupport.shellutils import which, subprocess_check_output, run_cmd  # noqa
-from sparktestsupport.modules import all_modules  # noqa
+from sparktestsupport.shellutils import which, subprocess_check_output  # noqa
+from sparktestsupport.modules import all_modules, pyspark_sql  # noqa
 
 
 python_modules = dict((m.name, m) for m in all_modules if m.python_test_goals if m.name != 'root')
@@ -151,6 +152,66 @@ def parse_opts():
     return opts
 
 
+def _check_dependencies(python_exec, modules_to_test):
+    if "COVERAGE_PROCESS_START" in os.environ:
+        # Make sure that coverage is installed.
+        try:
+            subprocess_check_output(
+                [python_exec, "-c", "import coverage"],
+                stderr=open(os.devnull, 'w'))
+        except:
+            print_red("Coverage is not installed in Python executable '%s' "
+                      "but 'COVERAGE_PROCESS_START' environment variable is set, "
+                      "exiting." % python_exec)
+            sys.exit(-1)
+
+    # If we should test 'pyspark-sql', it checks if PyArrow and Pandas are installed and
+    # explicitly prints out. See SPARK-23300.
+    if pyspark_sql in modules_to_test:
+        minimum_pyarrow_version = '0.8.0'
+        minimum_pandas_version = '0.19.2'
+
+        try:
+            pyarrow_version = subprocess_check_output(
+                [python_exec, "-c", "import pyarrow; print(pyarrow.__version__)"],
+                universal_newlines=True,
+                stderr=open(os.devnull, 'w')).strip()
+            if LooseVersion(pyarrow_version) >= LooseVersion(minimum_pyarrow_version):
+                LOGGER.info("Will test PyArrow related features against Python executable "
+                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
+            else:
+                LOGGER.warning(
+                    "Will skip PyArrow related features against Python executable "
+                    "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
+                    "%s was found." % (
+                        python_exec, pyspark_sql.name, minimum_pyarrow_version, pyarrow_version))
+        except:
+            LOGGER.warning(
+                "Will skip PyArrow related features against Python executable "
+                "'%s' in '%s' module. PyArrow >= %s is required; however, PyArrow "
+                "was not found." % (python_exec, pyspark_sql.name, minimum_pyarrow_version))
+
+        try:
+            pandas_version = subprocess_check_output(
+                [python_exec, "-c", "import pandas; print(pandas.__version__)"],
+                universal_newlines=True,
+                stderr=open(os.devnull, 'w')).strip()
+            if LooseVersion(pandas_version) >= LooseVersion(minimum_pandas_version):
+                LOGGER.info("Will test Pandas related features against Python executable "
+                            "'%s' in '%s' module." % (python_exec, pyspark_sql.name))
+            else:
+                LOGGER.warning(
+                    "Will skip Pandas related features against Python executable "
+                    "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
+                    "%s was found." % (
+                        python_exec, pyspark_sql.name, minimum_pandas_version, pandas_version))
+        except:
+            LOGGER.warning(
+                "Will skip Pandas related features against Python executable "
+                "'%s' in '%s' module. Pandas >= %s is required; however, Pandas "
+                "was not found." % (python_exec, pyspark_sql.name, minimum_pandas_version))
+
+
 def main():
     opts = parse_opts()
     if (opts.verbose):
@@ -175,9 +236,10 @@ def main():
 
     task_queue = Queue.PriorityQueue()
     for python_exec in python_execs:
-        if "COVERAGE_PROCESS_START" in os.environ:
-            # Make sure if coverage is installed.
-            run_cmd([python_exec, "-c", "import coverage"])
+        # Check if the python executable has proper dependencies installed to run tests
+        # for given modules properly.
+        _check_dependencies(python_exec, modules_to_test)
+
         python_implementation = subprocess_check_output(
             [python_exec, "-c", "import platform; print(platform.python_implementation())"],
             universal_newlines=True).strip()

From 78f5879a75c085ec558f26ba83d7334542623418 Mon Sep 17 00:00:00 2001
From: hyukjinkwon <gurwls223@gmail.com>
Date: Mon, 5 Feb 2018 11:03:47 +0900
Subject: [PATCH 2/2] Replace it with a prettier comment

---
 python/run-tests.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/run-tests.py b/python/run-tests.py
index 5f174ec9c043..6b41b5ee2281 100755
--- a/python/run-tests.py
+++ b/python/run-tests.py
@@ -168,6 +168,7 @@ def _check_dependencies(python_exec, modules_to_test):
     # If we should test 'pyspark-sql', it checks if PyArrow and Pandas are installed and
     # explicitly prints out. See SPARK-23300.
     if pyspark_sql in modules_to_test:
+        # TODO(HyukjinKwon): Relocate and deduplicate these version specifications.
         minimum_pyarrow_version = '0.8.0'
         minimum_pandas_version = '0.19.2'