Skip to content

Commit

Permalink
Fix comments & tests
Browse files Browse the repository at this point in the history
  • Loading branch information
itholic committed Aug 30, 2019
1 parent d4068ca commit d35f53d
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 31 deletions.
22 changes: 2 additions & 20 deletions databricks/koalas/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,26 +35,8 @@
# just a truncated repr.
"display.max_rows": 1000, # TODO: None should support unlimited.

# This sets the default index type
# There are three types of default index that can be configured by `compute.default_index_type`
# environment variable.
# - sequence: It implements a sequence that increases one by one, by Window function without
# specifying partition. Therefore, it ends up with whole partition in single node.
# This index type should be avoided when the data is large. This is default.
# - distributed: It implements a monotonically increasing sequence simply by using
# Spark's `monotonically_increasing_id` function. If the index does not have to be
# a sequence that increases one by one, this index should be used.
# Performance-wise, this index almost does not have any penalty comparing to
# other index types. Note that we cannot use this type of index for combining
# two dataframes because it is not guaranteed to have the same indexes in two
# dataframes.
# - distributed-sequence: It implements a sequence that increases one by one, by group-by and
# group-map approach. It still generates the sequential index globally.
# If the default index must be the sequence in a large dataset, this
# index has to be used.
# Note that if more data are added to the data source after creating this index,
# then it does not guarantee the sequential index.
"compute.default_index_type": "sequence"
# This sets the default index type: one of "sequence", "distributed", or "distributed-sequence".
"compute.default_index_type": "sequence",
} # type: Dict[str, Any]


Expand Down
18 changes: 7 additions & 11 deletions databricks/koalas/tests/test_default_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os

import pandas as pd

from databricks import koalas as ks
from databricks.koalas.config import set_option, reset_option
from databricks.koalas.testing.utils import ReusedSQLTestCase, TestUtils


Expand All @@ -26,13 +25,12 @@ class OneByOneDefaultIndexTest(ReusedSQLTestCase, TestUtils):
@classmethod
def setUpClass(cls):
super(OneByOneDefaultIndexTest, cls).setUpClass()
cls.default_index = os.environ.get('DEFAULT_INDEX', 'sequence')
os.environ['DEFAULT_INDEX'] = 'sequence'
set_option('compute.default_index_type', 'sequence')

@classmethod
def tearDownClass(cls):
super(OneByOneDefaultIndexTest, cls).tearDownClass()
os.environ['DEFAULT_INDEX'] = cls.default_index
reset_option('compute.default_index_type')

def test_default_index(self):
sdf = self.spark.range(1000)
Expand All @@ -44,13 +42,12 @@ class DistributedOneByOneDefaultIndexTest(ReusedSQLTestCase, TestUtils):
@classmethod
def setUpClass(cls):
super(DistributedOneByOneDefaultIndexTest, cls).setUpClass()
cls.default_index = os.environ.get('DEFAULT_INDEX', 'sequence')
os.environ['DEFAULT_INDEX'] = 'distributed-sequence'
set_option('compute.default_index_type', 'distributed-sequence')

@classmethod
def tearDownClass(cls):
super(DistributedOneByOneDefaultIndexTest, cls).tearDownClass()
os.environ['DEFAULT_INDEX'] = cls.default_index
reset_option('compute.default_index_type')

def test_default_index(self):
sdf = self.spark.range(1000)
Expand All @@ -62,13 +59,12 @@ class DistributedDefaultIndexTest(ReusedSQLTestCase, TestUtils):
@classmethod
def setUpClass(cls):
super(DistributedDefaultIndexTest, cls).setUpClass()
cls.default_index = os.environ.get('DEFAULT_INDEX', 'sequence')
os.environ['DEFAULT_INDEX'] = 'distributed'
set_option('compute.default_index_type', 'distributed')

@classmethod
def tearDownClass(cls):
super(DistributedDefaultIndexTest, cls).tearDownClass()
os.environ['DEFAULT_INDEX'] = cls.default_index
reset_option('compute.default_index_type')

def test_default_index(self):
sdf = self.spark.range(1000)
Expand Down

0 comments on commit d35f53d

Please sign in to comment.