Skip to content

Commit 4618def

Browse files
itholic authored and HyukjinKwon committed
Add configuration value: compute.max_rows (#721)
Related to #718: I added the compute.max_rows configuration value.
1 parent f8973e8 commit 4618def

File tree

3 files changed

+40
-29
lines changed

3 files changed

+40
-29
lines changed

databricks/koalas/config.py

+6
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@
3535
# just a truncated repr.
3636
"display.max_rows": 1000, # TODO: None should support unlimited.
3737

38+
# 'compute.max_rows sets the limit of the current DataFrame. Set `None` to unlimit
39+
# the input length. When the limit is set, it is executed by the shortcut by collecting
40+
# the data into driver side, and then using pandas API. If the limit is unset,
41+
# the operation is executed by PySpark. Default is 1000.
42+
"compute.max_rows": 1000, # TODO: None should support unlimited.
43+
3844
# This determines whether or not to operate between two different dataframs.
3945
# For example, 'combine_frames' function internally performs a join operation which can be
4046
# expensive in general.

databricks/koalas/frame.py

+16-20
Original file line numberDiff line numberDiff line change
@@ -1354,7 +1354,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, index=Tr
13541354

13551355
# TODO: enable doctests once we drop Spark 2.3.x (due to type coercion logic
13561356
# when creating arrays)
1357-
def transpose(self, limit: Optional[int] = 1000):
1357+
def transpose(self):
13581358
"""
13591359
Transpose index and columns.
13601360
@@ -1365,23 +1365,17 @@ def transpose(self, limit: Optional[int] = 1000):
13651365
.. note:: This method is based on an expensive operation due to the nature
13661366
of big data. Internally it needs to generate each row for each value, and
13671367
then group twice - it is a huge operation. To prevent misusage, this method
1368-
has the default limit of input length, 1000 and raises a ValueError.
1368+
has the 'compute.max_rows' default limit of input length, and raises a ValueError.
13691369
1370+
>>> from databricks.koalas.config import get_option, set_option
1371+
>>> set_option('compute.max_rows', 1000)
13701372
>>> ks.DataFrame({'a': range(1001)}).transpose() # doctest: +NORMALIZE_WHITESPACE
13711373
Traceback (most recent call last):
13721374
...
13731375
ValueError: Current DataFrame has more then the given limit 1000 rows.
1374-
Please use df.transpose(limit=<maximum number of rows>) to retrieve more than
1375-
1000 rows. Note that, before changing the given 'limit', this operation is
1376-
considerably expensive.
1377-
1378-
Parameters
1379-
----------
1380-
limit : int, optional
1381-
This parameter sets the limit of the current DataFrame. Set `None` to unlimit
1382-
the input length. When the limit is set, it is executed by the shortcut by collecting
1383-
the data into driver side, and then using pandas API. If the limit is unset,
1384-
the operation is executed by PySpark. Default is 1000.
1376+
Please set 'compute.max_rows' by using 'databricks.koalas.config.set_option'
1377+
to retrieve to retrieve more than 1000 rows. Note that, before changing the
1378+
'compute.max_rows', this operation is considerably expensive.
13851379
13861380
Returns
13871381
-------
@@ -1461,14 +1455,16 @@ def transpose(self, limit: Optional[int] = 1000):
14611455
1 float64
14621456
dtype: object
14631457
"""
1464-
if limit is not None:
1465-
pdf = self.head(limit + 1)._to_internal_pandas()
1466-
if len(pdf) > limit:
1458+
max_compute_count = get_option("compute.max_rows")
1459+
if max_compute_count is not None:
1460+
pdf = self.head(max_compute_count + 1)._to_internal_pandas()
1461+
if len(pdf) > max_compute_count:
14671462
raise ValueError(
1468-
"Current DataFrame has more then the given limit %s rows. Please use "
1469-
"df.transpose(limit=<maximum number of rows>) to retrieve more than %s rows. "
1470-
"Note that, before changing the given 'limit', this operation is considerably "
1471-
"expensive." % (limit, limit))
1463+
"Current DataFrame has more then the given limit {0} rows. "
1464+
"Please set 'compute.max_rows' by using 'databricks.koalas.config.set_option' "
1465+
"to retrieve to retrieve more than {0} rows. Note that, before changing the "
1466+
"'compute.max_rows', this operation is considerably expensive."
1467+
.format(max_compute_count))
14721468
return DataFrame(pdf.transpose())
14731469

14741470
# Explode the data to be pairs.

databricks/koalas/tests/test_dataframe.py

+18-9
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from pyspark.sql.utils import AnalysisException
2222

2323
from databricks import koalas as ks
24+
from databricks.koalas.config import set_option, reset_option
2425
from databricks.koalas.testing.utils import ReusedSQLTestCase, SQLTestUtils
2526
from databricks.koalas.exceptions import PandasNotImplementedError
2627
from databricks.koalas.missing.frame import _MissingPandasLikeDataFrame
@@ -1200,13 +1201,17 @@ def test_transpose(self):
12001201
columns=['score', 'kids', 'age'])
12011202
kdf2 = ks.from_pandas(pdf2)
12021203

1203-
self.assertEqual(
1204-
repr(pdf1.transpose().sort_index()),
1205-
repr(kdf1.transpose(limit=None).sort_index()))
1204+
set_option("compute.max_rows", None)
1205+
try:
1206+
self.assertEqual(
1207+
repr(pdf1.transpose().sort_index()),
1208+
repr(kdf1.transpose().sort_index()))
12061209

1207-
self.assert_eq(
1208-
repr(pdf2.transpose().sort_index()),
1209-
repr(kdf2.transpose(limit=None).sort_index()))
1210+
self.assert_eq(
1211+
repr(pdf2.transpose().sort_index()),
1212+
repr(kdf2.transpose().sort_index()))
1213+
except:
1214+
reset_option("compute.max_rows")
12101215

12111216
self.assertEqual(
12121217
repr(pdf1.transpose().sort_index()),
@@ -1222,9 +1227,13 @@ def test_transpose(self):
12221227
('rg2', 'z')]))
12231228
kdf3 = ks.from_pandas(pdf3)
12241229

1225-
self.assertEqual(
1226-
repr(pdf3.transpose().sort_index()),
1227-
repr(kdf3.transpose(limit=None).sort_index()))
1230+
set_option("compute.max_rows", None)
1231+
try:
1232+
self.assertEqual(
1233+
repr(pdf3.transpose().sort_index()),
1234+
repr(kdf3.transpose().sort_index()))
1235+
finally:
1236+
reset_option("compute.max_rows")
12281237

12291238
self.assertEqual(
12301239
repr(pdf3.transpose().sort_index()),

0 commit comments

Comments (0)