Skip to content

Commit 4618def

Browse files
itholic authored and HyukjinKwon committed
Add configuration value: compute.max_rows (#721)
Related to #718: I added the compute.max_rows configuration value.
1 parent f8973e8 commit 4618def

File tree

3 files changed

+40
-29
lines changed

3 files changed

+40
-29
lines changed

databricks/koalas/config.py

+6
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,12 @@
3535
# just a truncated repr.
3636
"display.max_rows": 1000, # TODO: None should support unlimited.
3737

38+
# 'compute.max_rows sets the limit of the current DataFrame. Set `None` to unlimit
39+
# the input length. When the limit is set, it is executed by the shortcut by collecting
40+
# the data into driver side, and then using pandas API. If the limit is unset,
41+
# the operation is executed by PySpark. Default is 1000.
42+
"compute.max_rows": 1000, # TODO: None should support unlimited.
43+
3844
# This determines whether or not to operate between two different dataframs.
3945
# For example, 'combine_frames' function internally performs a join operation which can be
4046
# expensive in general.

databricks/koalas/frame.py

+16-20
Original file line numberDiff line numberDiff line change
@@ -1354,7 +1354,7 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, index=Tr
13541354

13551355
# TODO: enable doctests once we drop Spark 2.3.x (due to type coercion logic
13561356
# when creating arrays)
1357-
def transpose(self, limit: Optional[int] = 1000):
1357+
def transpose(self):
13581358
"""
13591359
Transpose index and columns.
13601360
@@ -1365,23 +1365,17 @@ def transpose(self, limit: Optional[int] = 1000):
13651365
.. note:: This method is based on an expensive operation due to the nature
13661366
of big data. Internally it needs to generate each row for each value, and
13671367
then group twice - it is a huge operation. To prevent misusage, this method
1368-
has the default limit of input length, 1000 and raises a ValueError.
1368+
has the 'compute.max_rows' default limit of input length, and raises a ValueError.
13691369
1370+
>>> from databricks.koalas.config import get_option, set_option
1371+
>>> set_option('compute.max_rows', 1000)
13701372
>>> ks.DataFrame({'a': range(1001)}).transpose() # doctest: +NORMALIZE_WHITESPACE
13711373
Traceback (most recent call last):
13721374
...
13731375
ValueError: Current DataFrame has more then the given limit 1000 rows.
1374-
Please use df.transpose(limit=<maximum number of rows>) to retrieve more than
1375-
1000 rows. Note that, before changing the given 'limit', this operation is
1376-
considerably expensive.
1377-
1378-
Parameters
1379-
----------
1380-
limit : int, optional
1381-
This parameter sets the limit of the current DataFrame. Set `None` to unlimit
1382-
the input length. When the limit is set, it is executed by the shortcut by collecting
1383-
the data into driver side, and then using pandas API. If the limit is unset,
1384-
the operation is executed by PySpark. Default is 1000.
1376+
Please set 'compute.max_rows' by using 'databricks.koalas.config.set_option'
1377+
to retrieve to retrieve more than 1000 rows. Note that, before changing the
1378+
'compute.max_rows', this operation is considerably expensive.
13851379
13861380
Returns
13871381
-------
@@ -1461,14 +1455,16 @@ def transpose(self, limit: Optional[int] = 1000):
14611455
1 float64
14621456
dtype: object
14631457
"""
1464-
if limit is not None:
1465-
pdf = self.head(limit + 1)._to_internal_pandas()
1466-
if len(pdf) > limit:
1458+
max_compute_count = get_option("compute.max_rows")
1459+
if max_compute_count is not None:
1460+
pdf = self.head(max_compute_count + 1)._to_internal_pandas()
1461+
if len(pdf) > max_compute_count:
14671462
raise ValueError(
1468-
"Current DataFrame has more then the given limit %s rows. Please use "
1469-
"df.transpose(limit=<maximum number of rows>) to retrieve more than %s rows. "
1470-
"Note that, before changing the given 'limit', this operation is considerably "
1471-
"expensive." % (limit, limit))
1463+
"Current DataFrame has more then the given limit {0} rows. "
1464+
"Please set 'compute.max_rows' by using 'databricks.koalas.config.set_option' "
1465+
"to retrieve to retrieve more than {0} rows. Note that, before changing the "
1466+
"'compute.max_rows', this operation is considerably expensive."
1467+
.format(max_compute_count))
14721468
return DataFrame(pdf.transpose())
14731469

14741470
# Explode the data to be pairs.

databricks/koalas/tests/test_dataframe.py

+18-9
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
from pyspark.sql.utils import AnalysisException
2222

2323
from databricks import koalas as ks
24+
from databricks.koalas.config import set_option, reset_option
2425
from databricks.koalas.testing.utils import ReusedSQLTestCase, SQLTestUtils
2526
from databricks.koalas.exceptions import PandasNotImplementedError
2627
from databricks.koalas.missing.frame import _MissingPandasLikeDataFrame
@@ -1200,13 +1201,17 @@ def test_transpose(self):
12001201
columns=['score', 'kids', 'age'])
12011202
kdf2 = ks.from_pandas(pdf2)
12021203

1203-
self.assertEqual(
1204-
repr(pdf1.transpose().sort_index()),
1205-
repr(kdf1.transpose(limit=None).sort_index()))
1204+
set_option("compute.max_rows", None)
1205+
try:
1206+
self.assertEqual(
1207+
repr(pdf1.transpose().sort_index()),
1208+
repr(kdf1.transpose().sort_index()))
12061209

1207-
self.assert_eq(
1208-
repr(pdf2.transpose().sort_index()),
1209-
repr(kdf2.transpose(limit=None).sort_index()))
1210+
self.assert_eq(
1211+
repr(pdf2.transpose().sort_index()),
1212+
repr(kdf2.transpose().sort_index()))
1213+
except:
1214+
reset_option("compute.max_rows")
12101215

12111216
self.assertEqual(
12121217
repr(pdf1.transpose().sort_index()),
@@ -1222,9 +1227,13 @@ def test_transpose(self):
12221227
('rg2', 'z')]))
12231228
kdf3 = ks.from_pandas(pdf3)
12241229

1225-
self.assertEqual(
1226-
repr(pdf3.transpose().sort_index()),
1227-
repr(kdf3.transpose(limit=None).sort_index()))
1230+
set_option("compute.max_rows", None)
1231+
try:
1232+
self.assertEqual(
1233+
repr(pdf3.transpose().sort_index()),
1234+
repr(kdf3.transpose().sort_index()))
1235+
finally:
1236+
reset_option("compute.max_rows")
12281237

12291238
self.assertEqual(
12301239
repr(pdf3.transpose().sort_index()),

0 commit comments

Comments (0)