From c5dcb4b3218679378170e6d8e6ee9801e7ebf8b4 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 28 Oct 2022 02:59:59 -0700 Subject: [PATCH 1/4] [SPARK-40951][PYSPARK][TESTS] pyspark-connect tests should be skipped if pandas doesn't exist --- .../sql/tests/connect/test_connect_basic.py | 14 +++++++++----- .../connect/test_connect_column_expressions.py | 14 +++++++++----- .../sql/tests/connect/test_connect_plan_only.py | 11 +++++++---- .../sql/tests/connect/test_connect_select_ops.py | 10 ++++++---- python/pyspark/testing/connectutils.py | 15 ++++++++++----- 5 files changed, 41 insertions(+), 23 deletions(-) diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py index 459b05cc37aa..5c7cd9450589 100644 --- a/python/pyspark/sql/tests/connect/test_connect_basic.py +++ b/python/pyspark/sql/tests/connect/test_connect_basic.py @@ -18,14 +18,17 @@ import unittest import shutil import tempfile +from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message -import pandas +if have_pandas: + import pandas from pyspark.sql import SparkSession, Row from pyspark.sql.types import StructType, StructField, LongType, StringType -from pyspark.sql.connect.client import RemoteSparkSession -from pyspark.sql.connect.function_builder import udf -from pyspark.sql.connect.functions import lit +if have_pandas: + from pyspark.sql.connect.client import RemoteSparkSession + from pyspark.sql.connect.function_builder import udf + from pyspark.sql.connect.functions import lit from pyspark.sql.dataframe import DataFrame from pyspark.testing.connectutils import should_test_connect, connect_requirement_message from pyspark.testing.utils import ReusedPySparkTestCase @@ -36,7 +39,8 @@ class SparkConnectSQLTestCase(ReusedPySparkTestCase): """Parent test fixture class for all Spark Connect related test cases.""" - connect: RemoteSparkSession + if have_pandas: + connect: RemoteSparkSession tbl_name: str df_text: "DataFrame" diff --git a/python/pyspark/sql/tests/connect/test_connect_column_expressions.py b/python/pyspark/sql/tests/connect/test_connect_column_expressions.py index 6036b63d76f2..3de391650f8b 100644 --- a/python/pyspark/sql/tests/connect/test_connect_column_expressions.py +++ b/python/pyspark/sql/tests/connect/test_connect_column_expressions.py @@ -15,14 +15,18 @@ # limitations under the License. 
# +import unittest from pyspark.testing.connectutils import PlanOnlyTestFixture -from pyspark.sql.connect.proto import Expression as ProtoExpression -import pyspark.sql.connect as c -import pyspark.sql.connect.plan as p -import pyspark.sql.connect.column as col -import pyspark.sql.connect.functions as fun +from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message +if have_pandas: + from pyspark.sql.connect.proto import Expression as ProtoExpression + import pyspark.sql.connect as c + import pyspark.sql.connect.plan as p + import pyspark.sql.connect.column as col + import pyspark.sql.connect.functions as fun +@unittest.skipIf(not have_pandas, pandas_requirement_message) class SparkConnectColumnExpressionSuite(PlanOnlyTestFixture): def test_simple_column_expressions(self): df = c.DataFrame.withPlan(p.Read("table")) diff --git a/python/pyspark/sql/tests/connect/test_connect_plan_only.py b/python/pyspark/sql/tests/connect/test_connect_plan_only.py index 450f5c70faba..6a5ddd1737da 100644 --- a/python/pyspark/sql/tests/connect/test_connect_plan_only.py +++ b/python/pyspark/sql/tests/connect/test_connect_plan_only.py @@ -17,12 +17,15 @@ import unittest from pyspark.testing.connectutils import PlanOnlyTestFixture -import pyspark.sql.connect.proto as proto -from pyspark.sql.connect.readwriter import DataFrameReader -from pyspark.sql.connect.function_builder import UserDefinedFunction, udf -from pyspark.sql.types import StringType +from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message +if have_pandas: + import pyspark.sql.connect.proto as proto + from pyspark.sql.connect.readwriter import DataFrameReader + from pyspark.sql.connect.function_builder import UserDefinedFunction, udf + from pyspark.sql.types import StringType +@unittest.skipIf(not have_pandas, pandas_requirement_message) class SparkConnectTestsPlanOnly(PlanOnlyTestFixture): """These test cases exercise the interface to the proto plan generation but do not call Spark.""" diff --git a/python/pyspark/sql/tests/connect/test_connect_select_ops.py b/python/pyspark/sql/tests/connect/test_connect_select_ops.py index e89b4b34ea01..26278bf594e7 100644 --- a/python/pyspark/sql/tests/connect/test_connect_select_ops.py +++ b/python/pyspark/sql/tests/connect/test_connect_select_ops.py @@ -15,10 +15,12 @@ # limitations under the License. 
# from pyspark.testing.connectutils import PlanOnlyTestFixture -from pyspark.sql.connect import DataFrame -from pyspark.sql.connect.functions import col -from pyspark.sql.connect.plan import Read -import pyspark.sql.connect.proto as proto +from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message +if have_pandas: + from pyspark.sql.connect import DataFrame + from pyspark.sql.connect.functions import col + from pyspark.sql.connect.plan import Read + import pyspark.sql.connect.proto as proto class SparkConnectToProtoSuite(PlanOnlyTestFixture): diff --git a/python/pyspark/testing/connectutils.py b/python/pyspark/testing/connectutils.py index 700b7bb72e18..626a49516ec6 100644 --- a/python/pyspark/testing/connectutils.py +++ b/python/pyspark/testing/connectutils.py @@ -18,13 +18,18 @@ from typing import Any, Dict import functools import unittest +from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message -from pyspark.sql.connect import DataFrame -from pyspark.sql.connect.plan import Read -from pyspark.testing.utils import search_jar +if have_pandas: + from pyspark.sql.connect import DataFrame + from pyspark.sql.connect.plan import Read + from pyspark.testing.utils import search_jar + + connect_jar = search_jar("connector/connect", "spark-connect-assembly-", "spark-connect") +else: + connect_jar = None -connect_jar = search_jar("connector/connect", "spark-connect-assembly-", "spark-connect") if connect_jar is None: connect_requirement_message = ( "Skipping all Spark Connect Python tests as the optional Spark Connect project was " @@ -38,7 +43,7 @@ os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join([jars_args, plugin_args, existing_args]) connect_requirement_message = None # type: ignore -should_test_connect = connect_requirement_message is None +should_test_connect = connect_requirement_message is None and have_pandas class MockRemoteSession: From 4059b163248eb99c4ce30eea8402990c8a08bb32 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 28 Oct 2022 03:25:06 -0700 Subject: [PATCH 2/4] reformat --- python/pyspark/sql/tests/connect/test_connect_basic.py | 1 + .../pyspark/sql/tests/connect/test_connect_column_expressions.py | 1 + python/pyspark/sql/tests/connect/test_connect_plan_only.py | 1 + python/pyspark/sql/tests/connect/test_connect_select_ops.py | 1 + 4 files changed, 4 insertions(+) diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py index 5c7cd9450589..51fa430bd960 100644 --- a/python/pyspark/sql/tests/connect/test_connect_basic.py +++ b/python/pyspark/sql/tests/connect/test_connect_basic.py @@ -25,6 +25,7 @@ from pyspark.sql import SparkSession, Row from pyspark.sql.types import StructType, StructField, LongType, StringType + if have_pandas: from pyspark.sql.connect.client import RemoteSparkSession from pyspark.sql.connect.function_builder import udf diff --git a/python/pyspark/sql/tests/connect/test_connect_column_expressions.py b/python/pyspark/sql/tests/connect/test_connect_column_expressions.py index 3de391650f8b..36c526ecd4cf 100644 --- a/python/pyspark/sql/tests/connect/test_connect_column_expressions.py +++ b/python/pyspark/sql/tests/connect/test_connect_column_expressions.py @@ -18,6 +18,7 @@ import unittest from pyspark.testing.connectutils import PlanOnlyTestFixture from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message + if have_pandas: from pyspark.sql.connect.proto import Expression as ProtoExpression import pyspark.sql.connect as c diff --git 
a/python/pyspark/sql/tests/connect/test_connect_plan_only.py b/python/pyspark/sql/tests/connect/test_connect_plan_only.py index 6a5ddd1737da..ee2943754fb1 100644 --- a/python/pyspark/sql/tests/connect/test_connect_plan_only.py +++ b/python/pyspark/sql/tests/connect/test_connect_plan_only.py @@ -18,6 +18,7 @@ from pyspark.testing.connectutils import PlanOnlyTestFixture from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message + if have_pandas: import pyspark.sql.connect.proto as proto from pyspark.sql.connect.readwriter import DataFrameReader diff --git a/python/pyspark/sql/tests/connect/test_connect_select_ops.py b/python/pyspark/sql/tests/connect/test_connect_select_ops.py index 26278bf594e7..5d51c46fb2d0 100644 --- a/python/pyspark/sql/tests/connect/test_connect_select_ops.py +++ b/python/pyspark/sql/tests/connect/test_connect_select_ops.py @@ -16,6 +16,7 @@ # from pyspark.testing.connectutils import PlanOnlyTestFixture from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message + if have_pandas: from pyspark.sql.connect import DataFrame from pyspark.sql.connect.functions import col From 59548300ce9eceb3f4d9de4b94c8110dd6c10adb Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 28 Oct 2022 03:33:46 -0700 Subject: [PATCH 3/4] fix style --- python/pyspark/sql/tests/connect/test_connect_basic.py | 2 +- python/pyspark/sql/tests/connect/test_connect_select_ops.py | 3 +++ python/pyspark/testing/connectutils.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/python/pyspark/sql/tests/connect/test_connect_basic.py b/python/pyspark/sql/tests/connect/test_connect_basic.py index 51fa430bd960..17d50f0f50e9 100644 --- a/python/pyspark/sql/tests/connect/test_connect_basic.py +++ b/python/pyspark/sql/tests/connect/test_connect_basic.py @@ -18,7 +18,7 @@ import unittest import shutil import tempfile -from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message +from pyspark.testing.sqlutils import have_pandas if have_pandas: import pandas diff --git a/python/pyspark/sql/tests/connect/test_connect_select_ops.py b/python/pyspark/sql/tests/connect/test_connect_select_ops.py index 5d51c46fb2d0..7524d7e551ac 100644 --- a/python/pyspark/sql/tests/connect/test_connect_select_ops.py +++ b/python/pyspark/sql/tests/connect/test_connect_select_ops.py @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +import unittest + from pyspark.testing.connectutils import PlanOnlyTestFixture from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message @@ -24,6 +26,7 @@ import pyspark.sql.connect.proto as proto +@unittest.skipIf(not have_pandas, pandas_requirement_message) class SparkConnectToProtoSuite(PlanOnlyTestFixture): def test_select_with_columns_and_strings(self): df = DataFrame.withPlan(Read("table")) diff --git a/python/pyspark/testing/connectutils.py b/python/pyspark/testing/connectutils.py index 626a49516ec6..d9bced3af114 100644 --- a/python/pyspark/testing/connectutils.py +++ b/python/pyspark/testing/connectutils.py @@ -18,7 +18,7 @@ from typing import Any, Dict import functools import unittest -from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message +from pyspark.testing.sqlutils import have_pandas if have_pandas: from pyspark.sql.connect import DataFrame From ecb40b1b6b6cd9ba3cb8b3b45649a7a93727f713 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Fri, 28 Oct 2022 09:30:45 -0700 Subject: [PATCH 4/4] fix mypy style --- .../sql/tests/connect/test_connect_column_expressions.py | 3 ++- python/pyspark/sql/tests/connect/test_connect_plan_only.py | 3 ++- python/pyspark/sql/tests/connect/test_connect_select_ops.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/pyspark/sql/tests/connect/test_connect_column_expressions.py b/python/pyspark/sql/tests/connect/test_connect_column_expressions.py index 36c526ecd4cf..790a987e8809 100644 --- a/python/pyspark/sql/tests/connect/test_connect_column_expressions.py +++ b/python/pyspark/sql/tests/connect/test_connect_column_expressions.py @@ -15,6 +15,7 @@ # limitations under the License. # +from typing import cast import unittest from pyspark.testing.connectutils import PlanOnlyTestFixture from pyspark.testing.sqlutils import have_pandas, pandas_requirement_message @@ -27,7 +28,7 @@ import pyspark.sql.connect.functions as fun -@unittest.skipIf(not have_pandas, pandas_requirement_message) +@unittest.skipIf(not have_pandas, cast(str, pandas_requirement_message)) class SparkConnectColumnExpressionSuite(PlanOnlyTestFixture): def test_simple_column_expressions(self): df = c.DataFrame.withPlan(p.Read("table")) diff --git a/python/pyspark/sql/tests/connect/test_connect_plan_only.py b/python/pyspark/sql/tests/connect/test_connect_plan_only.py index ee2943754fb1..14b939e019ba 100644 --- a/python/pyspark/sql/tests/connect/test_connect_plan_only.py +++ b/python/pyspark/sql/tests/connect/test_connect_plan_only.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # +from typing import cast import unittest from pyspark.testing.connectutils import PlanOnlyTestFixture @@ -26,7 +27,7 @@ from pyspark.sql.types import StringType -@unittest.skipIf(not have_pandas, pandas_requirement_message) +@unittest.skipIf(not have_pandas, cast(str, pandas_requirement_message)) class SparkConnectTestsPlanOnly(PlanOnlyTestFixture): """These test cases exercise the interface to the proto plan generation but do not call Spark.""" diff --git a/python/pyspark/sql/tests/connect/test_connect_select_ops.py b/python/pyspark/sql/tests/connect/test_connect_select_ops.py index 7524d7e551ac..a29c70541462 100644 --- a/python/pyspark/sql/tests/connect/test_connect_select_ops.py +++ b/python/pyspark/sql/tests/connect/test_connect_select_ops.py @@ -14,6 +14,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# +from typing import cast import unittest from pyspark.testing.connectutils import PlanOnlyTestFixture @@ -26,7 +27,7 @@ import pyspark.sql.connect.proto as proto -@unittest.skipIf(not have_pandas, pandas_requirement_message) +@unittest.skipIf(not have_pandas, cast(str, pandas_requirement_message)) class SparkConnectToProtoSuite(PlanOnlyTestFixture): def test_select_with_columns_and_strings(self): df = DataFrame.withPlan(Read("table"))
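
For context, below is a minimal, self-contained sketch of the guard-and-skip pattern these four patches apply across the Connect test modules. It is an illustration only, not code from the patch: in PySpark the `have_pandas` flag and `pandas_requirement_message` come from `pyspark.testing.sqlutils`, whereas this sketch recomputes them locally so it can run on its own, and the `PandasDependentSuite` class and its test are hypothetical stand-ins for the real Connect suites.

    # Minimal sketch of the skip pattern used by this patch series (assumed
    # standalone setup; PySpark itself gets these values from
    # pyspark.testing.sqlutils rather than computing them inline).
    import unittest
    from typing import Optional, cast

    try:
        import pandas  # noqa: F401
        have_pandas = True
        pandas_requirement_message: Optional[str] = None
    except ImportError as e:
        have_pandas = False
        pandas_requirement_message = str(e)

    # Guard optional imports so importing the test module never fails when
    # pandas is missing; the tests are skipped instead of erroring out.
    if have_pandas:
        import pandas as pd


    # cast(str, ...) mirrors the mypy fix in PATCH 4/4: the message is None
    # when pandas is available, but skipIf's reason is unused in that case.
    @unittest.skipIf(not have_pandas, cast(str, pandas_requirement_message))
    class PandasDependentSuite(unittest.TestCase):
        def test_uses_pandas(self) -> None:
            self.assertEqual(len(pd.DataFrame({"a": [1, 2]})), 2)


    if __name__ == "__main__":
        unittest.main()

The design choice here matches the patches: keep the optional imports behind an `if have_pandas:` guard at module scope so unittest can still import the module and report a clean skip, and pass the requirement message through `cast(str, ...)` so the type checker accepts the `Optional[str]` value.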