
Commit 879caf4

HyukjinKwon authored and Willi Raschkowski committed
[SPARK-28881][PYTHON][TESTS] Add a test to make sure toPandas with Arrow optimization throws an exception per maxResultSize
### What changes were proposed in this pull request?

This PR proposes to add a test case for:

```bash
./bin/pyspark --conf spark.driver.maxResultSize=1m
```

```python
spark.conf.set("spark.sql.execution.arrow.enabled", True)
spark.range(10000000).toPandas()
```

```
Empty DataFrame
Columns: [id]
Index: []
```

which can result in partial results (see apache#25593 (comment)). This regression was found between Spark 2.3 and Spark 2.4, and accidentally fixed.

### Why are the changes needed?

To prevent the same regression in the future.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Test was added.

Closes apache#25594 from HyukjinKwon/SPARK-28881.

Authored-by: HyukjinKwon <[email protected]>
Signed-off-by: HyukjinKwon <[email protected]>
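For readers without Spark at hand, the driver-side guard this test exercises can be sketched in plain Python: a collector that raises once the accumulated serialized results exceed a configured limit, producing the same "is bigger than" message the new test matches on. This is a minimal illustration only — `parse_size` and `collect_with_limit` are hypothetical helpers, not Spark APIs.

```python
import unittest


def parse_size(value):
    """Parse sizes like '10k' or '1m' into bytes (hypothetical helper,
    loosely mirroring how spark.driver.maxResultSize strings are read)."""
    units = {"k": 1024, "m": 1024 ** 2, "g": 1024 ** 3}
    suffix = value[-1].lower()
    if suffix in units:
        return int(value[:-1]) * units[suffix]
    return int(value)


def collect_with_limit(chunks, max_result_size):
    """Accumulate serialized result chunks, failing fast once the running
    total exceeds the limit -- the behavior the new test asserts on."""
    limit = parse_size(max_result_size)
    total, out = 0, []
    for chunk in chunks:
        total += len(chunk)
        if total > limit:
            raise RuntimeError(
                "Total size of serialized results (%d bytes) "
                "is bigger than maxResultSize (%d bytes)" % (total, limit))
        out.append(chunk)
    return b"".join(out)


class MaxResultSketchTests(unittest.TestCase):
    def test_exception_by_max_results(self):
        # 100 partitions of 200 bytes each blow past a 10k limit.
        chunks = [b"x" * 200] * 100
        # The PR uses the older assertRaisesRegexp alias; assertRaisesRegex
        # is the modern spelling of the same check.
        with self.assertRaisesRegex(Exception, "is bigger than"):
            collect_with_limit(chunks, "10k")
```

The failure mode being guarded against is exactly a limit that silently truncates instead of raising, which is how the partial-result regression (empty DataFrame) manifested.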
1 parent 2c4cfb1 commit 879caf4

File tree

1 file changed: +30 −1 lines


python/pyspark/sql/tests/test_arrow.py

Lines changed: 30 additions & 1 deletion
```diff
@@ -22,7 +22,7 @@
 import unittest
 import warnings
 
-from pyspark.sql import Row
+from pyspark.sql import Row, SparkSession
 from pyspark.sql.functions import udf
 from pyspark.sql.types import *
 from pyspark.testing.sqlutils import ReusedSQLTestCase, have_pandas, have_pyarrow, \
@@ -430,6 +430,35 @@ def run_test(num_records, num_parts, max_records, use_delay=False):
         run_test(*case)
 
 
+@unittest.skipIf(
+    not have_pandas or not have_pyarrow,
+    pandas_requirement_message or pyarrow_requirement_message)
+class MaxResultArrowTests(unittest.TestCase):
+    # These tests are separate as 'spark.driver.maxResultSize' configuration
+    # is a static configuration to Spark context.
+
+    @classmethod
+    def setUpClass(cls):
+        cls.spark = SparkSession.builder \
+            .master("local[4]") \
+            .appName(cls.__name__) \
+            .config("spark.driver.maxResultSize", "10k") \
+            .getOrCreate()
+
+        # Explicitly enable Arrow and disable fallback.
+        cls.spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
+        cls.spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled", "false")
+
+    @classmethod
+    def tearDownClass(cls):
+        if hasattr(cls, "spark"):
+            cls.spark.stop()
+
+    def test_exception_by_max_results(self):
+        with self.assertRaisesRegexp(Exception, "is bigger than"):
+            self.spark.range(0, 10000, 1, 100).toPandas()
+
+
 class EncryptionArrowTests(ArrowTests):
 
     @classmethod
```

0 commit comments