From 8a965a51be6190f0db864ca7b1ba37269b3a55bc Mon Sep 17 00:00:00 2001
From: Bruce Robbins
Date: Fri, 23 Mar 2018 10:33:47 -0700
Subject: [PATCH 1/7] initial commit: check for test udf and hive assembly

---
 python/pyspark/sql/tests.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 967cc83166f3..0f1a56d56771 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -21,6 +21,7 @@
 individual modules.
 """
 import os
+import glob
 import sys
 import subprocess
 import pydoc
@@ -84,6 +85,31 @@
 from pyspark.sql.utils import AnalysisException, ParseException, IllegalArgumentException
 
 
+def found_file(pattern):
+    SPARK_HOME = os.environ["SPARK_HOME"]
+    files = glob.glob(os.path.join(SPARK_HOME, pattern))
+    return len(files) > 0
+
+
+def search_hive_assembly_jars():
+    pattern = "assembly/target/scala-*/jars/spark-hive_*-*.jar"
+    if not found_file(pattern):
+        raise Exception(
+            ("Failed to find Hive assembly jar. ") +
+            "You need to build Spark with "
+            "'build/sbt -Phive package' or "
+            "'build/mvn -DskipTests -Phive package' before running this test.")
+
+
+def search_test_udf_classes():
+    pattern = "sql/core/target/scala-*/test-classes/" + \
+        "test/org/apache/spark/sql/JavaStringLength.class"
+    if not found_file(pattern):
+        raise Exception(
+            ("Failed to find test udf classes. ") +
+            "You need to build Spark with 'build/sbt sql/test:compile'")
+
+
 class UTCOffsetTimezone(datetime.tzinfo):
     """
     Specifies timezone in UTC offset
@@ -5205,6 +5231,8 @@ def test_invalid_args(self):
 
 if __name__ == "__main__":
     from pyspark.sql.tests import *
+    search_hive_assembly_jars()
+    search_test_udf_classes()
     if xmlrunner:
         unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports'))
     else:

From 01857e1375625d87a1548e69104a36d06745b974 Mon Sep 17 00:00:00 2001
From: Bruce Robbins
Date: Thu, 29 Mar 2018 16:21:31 -0700
Subject: [PATCH 2/7] Move component checks from pyspark.sql.tests to the
 modules that care

---
 python/pyspark/sql/readwriter.py | 13 ++++++++++++-
 python/pyspark/sql/tests.py      | 28 ----------------------------
 python/pyspark/sql/udf.py        | 11 +++++++++++
 3 files changed, 23 insertions(+), 29 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index e5288636c596..0b65049014cf 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -949,13 +949,24 @@ def jdbc(self, url, table, mode=None, properties=None):
 def _test():
     import doctest
     import os
+    import os.path
+    import glob
     import tempfile
     import py4j
     from pyspark.context import SparkContext
     from pyspark.sql import SparkSession, Row
     import pyspark.sql.readwriter
 
-    os.chdir(os.environ["SPARK_HOME"])
+    SPARK_HOME = os.environ["SPARK_HOME"]
+    filename_pattern = "assembly/target/scala-*/jars/spark-hive_*-*.jar"
+    if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)):
+        raise Exception(
+            ("Failed to find Hive assembly jar. ") +
+            "You need to build Spark with "
+            "'build/sbt -Phive package' or "
+            "'build/mvn -DskipTests -Phive package' before running this test.")
+
+    os.chdir(SPARK_HOME)
 
     globs = pyspark.sql.readwriter.__dict__.copy()
     sc = SparkContext('local[4]', 'PythonTest')
diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py
index 0f1a56d56771..967cc83166f3 100644
--- a/python/pyspark/sql/tests.py
+++ b/python/pyspark/sql/tests.py
@@ -21,7 +21,6 @@
 individual modules.
""" import os -import glob import sys import subprocess import pydoc @@ -85,31 +84,6 @@ from pyspark.sql.utils import AnalysisException, ParseException, IllegalArgumentException -def found_file(pattern): - SPARK_HOME = os.environ["SPARK_HOME"] - files = glob.glob(os.path.join(SPARK_HOME, pattern)) - return len(files) > 0 - - -def search_hive_assembly_jars(): - pattern = "assembly/target/scala-*/jars/spark-hive_*-*.jar" - if not found_file(pattern): - raise Exception( - ("Failed to find Hive assembly jar. ") + - "You need to build Spark with " - "'build/sbt -Phive package' or " - "'build/mvn -DskipTests -Phive package' before running this test.") - - -def search_test_udf_classes(): - pattern = "sql/core/target/scala-*/test-classes/" + \ - "test/org/apache/spark/sql/JavaStringLength.class" - if not found_file(pattern): - raise Exception( - ("Failed to find test udf classes. ") + - "You need to build Spark with 'build/sbt sql/test:compile'") - - class UTCOffsetTimezone(datetime.tzinfo): """ Specifies timezone in UTC offset @@ -5231,8 +5205,6 @@ def test_invalid_args(self): if __name__ == "__main__": from pyspark.sql.tests import * - search_hive_assembly_jars() - search_test_udf_classes() if xmlrunner: unittest.main(testRunner=xmlrunner.XMLTestRunner(output='target/test-reports')) else: diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 24dd06c26089..4028aaa2724a 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -385,8 +385,19 @@ def registerJavaUDAF(self, name, javaClassName): def _test(): import doctest + import os + import os.path + import glob from pyspark.sql import SparkSession import pyspark.sql.udf + + SPARK_HOME = os.environ["SPARK_HOME"] + filename_pattern = "sql/core/target/scala-*/test-classes/" + \ + "test/org/apache/spark/sql/JavaStringLength.class" + if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)): + raise Exception( + ("Failed to find test udf classes. ") + + "You need to build Spark with 'build/sbt sql/test:compile'") globs = pyspark.sql.udf.__dict__.copy() spark = SparkSession.builder\ .master("local[4]")\ From 4eeca667c06a16338889a296333356ab0faef626 Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Thu, 29 Mar 2018 16:45:29 -0700 Subject: [PATCH 3/7] Cleanup --- python/pyspark/sql/readwriter.py | 7 +++---- python/pyspark/sql/udf.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 0b65049014cf..6f4545f63d5b 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -961,10 +961,9 @@ def _test(): filename_pattern = "assembly/target/scala-*/jars/spark-hive_*-*.jar" if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)): raise Exception( - ("Failed to find Hive assembly jar. ") + - "You need to build Spark with " - "'build/sbt -Phive package' or " - "'build/mvn -DskipTests -Phive package' before running this test.") + "Failed to find Hive assembly jar. You need to build Spark with " + "'build/sbt -Phive package' or 'build/mvn -DskipTests -Phive package' " + "before running this test.") os.chdir(SPARK_HOME) diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 4028aaa2724a..332d75525629 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -396,7 +396,7 @@ def _test(): "test/org/apache/spark/sql/JavaStringLength.class" if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)): raise Exception( - ("Failed to find test udf classes. 
") + + "Failed to find test udf classes. " "You need to build Spark with 'build/sbt sql/test:compile'") globs = pyspark.sql.udf.__dict__.copy() spark = SparkSession.builder\ From 0f830e2144da50c8b2a5239a61fa08fae40384e0 Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Thu, 29 Mar 2018 21:09:23 -0700 Subject: [PATCH 4/7] Skip HiveSparkSubmitTests if Hive not available --- python/pyspark/sql/tests.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/python/pyspark/sql/tests.py b/python/pyspark/sql/tests.py index 967cc83166f3..003bcd482743 100644 --- a/python/pyspark/sql/tests.py +++ b/python/pyspark/sql/tests.py @@ -2977,6 +2977,20 @@ def test_create_dateframe_from_pandas_with_dst(self): class HiveSparkSubmitTests(SparkSubmitTests): + @classmethod + def setUpClass(cls): + # get a SparkContext to check for availability of Hive + sc = SparkContext('local[4]', cls.__name__) + try: + sc._jvm.org.apache.hadoop.hive.conf.HiveConf() + except py4j.protocol.Py4JError: + raise unittest.SkipTest("Hive is not available") + except TypeError: + raise unittest.SkipTest("Hive is not available") + finally: + # we don't need SparkContext for the test + sc.stop() + def test_hivecontext(self): # This test checks that HiveContext is using Hive metastore (SPARK-16224). # It sets a metastore url and checks if there is a derby dir created by From 85c3a21e0adccb480ce0e26f52738ac41deed585 Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Tue, 3 Apr 2018 07:16:03 -0700 Subject: [PATCH 5/7] If Hive not enabled, skip doctests that will fail --- python/pyspark/sql/readwriter.py | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 6f4545f63d5b..1e2f6d8a5c98 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -945,7 +945,6 @@ def jdbc(self, url, table, mode=None, properties=None): jprop.setProperty(k, properties[k]) self.mode(mode)._jwrite.jdbc(url, table, jprop) - def _test(): import doctest import os @@ -957,15 +956,15 @@ def _test(): from pyspark.sql import SparkSession, Row import pyspark.sql.readwriter - SPARK_HOME = os.environ["SPARK_HOME"] - filename_pattern = "assembly/target/scala-*/jars/spark-hive_*-*.jar" - if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)): - raise Exception( - "Failed to find Hive assembly jar. You need to build Spark with " - "'build/sbt -Phive package' or 'build/mvn -DskipTests -Phive package' " - "before running this test.") + # SPARK_HOME = os.environ["SPARK_HOME"] + # filename_pattern = "assembly/target/scala-*/jars/spark-hive_*-*.jar" + # if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)): + # raise Exception( + # "Failed to find Hive assembly jar. 
You need to build Spark with " + # "'build/sbt -Phive package' or 'build/mvn -DskipTests -Phive package' " + # "before running this test.") - os.chdir(SPARK_HOME) + os.chdir(os.environ["SPARK_HOME"]) globs = pyspark.sql.readwriter.__dict__.copy() sc = SparkContext('local[4]', 'PythonTest') @@ -974,6 +973,19 @@ def _test(): except py4j.protocol.Py4JError: spark = SparkSession(sc) + hive_enabled = True + try: + sc._jvm.org.apache.hadoop.hive.conf.HiveConf() + except py4j.protocol.Py4JError: + hive_enabled = False + except TypeError: + hive_enabled = False + + # if hive is not enabled, then skip doctests that will fail + if not hive_enabled: + m = pyspark.sql.readwriter + m.__dict__["DataFrameReader"].__dict__["table"].__doc__ = "" + globs['tempfile'] = tempfile globs['os'] = os globs['sc'] = sc From 28f0548bf207fa2fc7a402a34229ea6e9df4ee77 Mon Sep 17 00:00:00 2001 From: Bruce Robbins Date: Tue, 3 Apr 2018 10:15:51 -0700 Subject: [PATCH 6/7] If test udf files not compiled, skip doctests that will fail --- python/pyspark/sql/readwriter.py | 13 ++++--------- python/pyspark/sql/udf.py | 10 +++++++--- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py index 1e2f6d8a5c98..3da3acad20ba 100644 --- a/python/pyspark/sql/readwriter.py +++ b/python/pyspark/sql/readwriter.py @@ -945,6 +945,7 @@ def jdbc(self, url, table, mode=None, properties=None): jprop.setProperty(k, properties[k]) self.mode(mode)._jwrite.jdbc(url, table, jprop) + def _test(): import doctest import os @@ -956,14 +957,6 @@ def _test(): from pyspark.sql import SparkSession, Row import pyspark.sql.readwriter - # SPARK_HOME = os.environ["SPARK_HOME"] - # filename_pattern = "assembly/target/scala-*/jars/spark-hive_*-*.jar" - # if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)): - # raise Exception( - # "Failed to find Hive assembly jar. You need to build Spark with " - # "'build/sbt -Phive package' or 'build/mvn -DskipTests -Phive package' " - # "before running this test.") - os.chdir(os.environ["SPARK_HOME"]) globs = pyspark.sql.readwriter.__dict__.copy() @@ -981,8 +974,10 @@ def _test(): except TypeError: hive_enabled = False - # if hive is not enabled, then skip doctests that will fail if not hive_enabled: + # if hive is not enabled, then skip doctests that need hive + # TODO: Need to communicate with outside world that this test + # has been skipped. m = pyspark.sql.readwriter m.__dict__["DataFrameReader"].__dict__["table"].__doc__ = "" diff --git a/python/pyspark/sql/udf.py b/python/pyspark/sql/udf.py index 332d75525629..39eff97c5cd3 100644 --- a/python/pyspark/sql/udf.py +++ b/python/pyspark/sql/udf.py @@ -395,9 +395,13 @@ def _test(): filename_pattern = "sql/core/target/scala-*/test-classes/" + \ "test/org/apache/spark/sql/JavaStringLength.class" if not glob.glob(os.path.join(SPARK_HOME, filename_pattern)): - raise Exception( - "Failed to find test udf classes. " - "You need to build Spark with 'build/sbt sql/test:compile'") + # if test udf files are not compiled, then skip the below doctests + # TODO: Need to communicate with outside world that these tests + # have been skipped. 
+        m = pyspark.sql.udf
+        m.__dict__["UDFRegistration"].__dict__["registerJavaFunction"].__doc__ = ""
+        m.__dict__["UDFRegistration"].__dict__["registerJavaUDAF"].__doc__ = ""
+
     globs = pyspark.sql.udf.__dict__.copy()
     spark = SparkSession.builder\

From db14acbb3a90c9da184fc9c909640e07100c38fa Mon Sep 17 00:00:00 2001
From: Bruce Robbins
Date: Tue, 3 Apr 2018 11:38:19 -0700
Subject: [PATCH 7/7] Clean up

---
 python/pyspark/sql/readwriter.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index 3da3acad20ba..18f7aebae7fe 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -949,8 +949,6 @@
 def _test():
     import doctest
     import os
-    import os.path
-    import glob
     import tempfile
     import py4j
     from pyspark.context import SparkContext
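
The series converges on one pattern in two places: probe the JVM for org.apache.hadoop.hive.conf.HiveConf, and blank out __doc__ so that doctest.testmod() never collects examples that need the missing component. Below is a minimal standalone sketch of that pattern, not part of the patches above; it assumes SPARK_HOME points at a Spark build tree and that pyspark and py4j are importable, and the helper names (hive_available, skip_doctests_of) are illustrative, not Spark APIs.

# Sketch only -- not part of the patch series. Assumes SPARK_HOME is set and
# pyspark/py4j are importable; helper names are illustrative, not Spark APIs.
import glob
import os

from py4j.protocol import Py4JError
from pyspark.context import SparkContext


def hive_available(sc):
    # Same probe the patches use: constructing HiveConf through the py4j
    # gateway raises Py4JError (or TypeError) when Spark was built
    # without -Phive, so the exception doubles as a feature test.
    try:
        sc._jvm.org.apache.hadoop.hive.conf.HiveConf()
    except (Py4JError, TypeError):
        return False
    return True


def skip_doctests_of(func):
    # doctest.testmod() collects examples from docstrings, so blanking
    # __doc__ makes it silently skip this function's examples.
    func.__doc__ = ""


if __name__ == "__main__":
    # The same glob the patches use to detect a -Phive build on disk.
    hive_jar_pattern = "assembly/target/scala-*/jars/spark-hive_*-*.jar"
    jars = glob.glob(os.path.join(os.environ["SPARK_HOME"], hive_jar_pattern))
    print("hive jars on disk:", bool(jars))

    sc = SparkContext('local[4]', 'HiveProbe')
    try:
        print("hive classes in JVM:", hive_available(sc))
    finally:
        sc.stop()

Checking the JVM rather than only the filesystem is what lets the later patches degrade gracefully (skip tests and doctests) instead of failing the whole run, which is the behavior the earlier raise-based checks were replaced to achieve.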