From 7e39b472c533add5c5e021bdf696ddb5f4ec1670 Mon Sep 17 00:00:00 2001 From: itholic Date: Mon, 30 Dec 2019 12:02:07 +0900 Subject: [PATCH 01/14] fix #1158 --- databricks/koalas/indexing.py | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index c25120b985..26a6b96747 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -290,13 +290,12 @@ class LocIndexer(_LocIndexerLike): Slice with labels for row and single label for column. As mentioned above, note that both the start and stop of the slice are included. - Also note that the row for 'sidewinder' is included since 'sidewinder' - is between 'cobra' and 'viper'. + Also note that the row for 'sidewinder' is not included since 'sidewinder' + is not between 'cobra' and 'viper'. >>> df.loc['cobra':'viper', 'max_speed'] - cobra 1 - viper 4 - sidewinder 7 + cobra 1 + viper 4 Name: max_speed, dtype: int64 Conditional that returns a boolean Series @@ -400,16 +399,23 @@ def _select_rows(self, rows_sel): # If slice is None - select everything, so nothing to do return None, None elif len(self._internal.index_columns) == 1: - start = rows_sel.start - stop = rows_sel.stop - + sdf = self._kdf_or_kser._internal.sdf index_column = self._kdf_or_kser.index.to_series() - index_data_type = index_column.spark_type + + # get natural order from '__natural_order__' from start to stop + # based on index_columns to keep natural order. 
+ start = sdf.select(NATURAL_ORDER_COLUMN_NAME) \ + .where(index_column._scol == rows_sel.start) \ + .first()[0] + stop = sdf.select(NATURAL_ORDER_COLUMN_NAME) \ + .where(index_column._scol == rows_sel.stop) \ + .first()[0] + cond = [] if start is not None: - cond.append(index_column._scol >= F.lit(start).cast(index_data_type)) + cond.append(sdf[NATURAL_ORDER_COLUMN_NAME] >= F.lit(start)) if stop is not None: - cond.append(index_column._scol <= F.lit(stop).cast(index_data_type)) + cond.append(sdf[NATURAL_ORDER_COLUMN_NAME] <= F.lit(stop)) if len(cond) > 0: return reduce(lambda x, y: x & y, cond), None From 83c1b8194fa54a342bfe0ad7e9d0d00588d7e2f4 Mon Sep 17 00:00:00 2001 From: itholic Date: Mon, 30 Dec 2019 13:21:01 +0900 Subject: [PATCH 02/14] separate logic to catch KeyError --- databricks/koalas/indexing.py | 36 ++++++++++++++++-------- databricks/koalas/tests/test_indexing.py | 8 ++++++ 2 files changed, 33 insertions(+), 11 deletions(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index 26a6b96747..b031e4992a 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -22,7 +22,7 @@ from pandas.api.types import is_list_like from pyspark import sql as spark from pyspark.sql import functions as F -from pyspark.sql.types import BooleanType +from pyspark.sql.types import BooleanType, StringType from pyspark.sql.utils import AnalysisException from databricks.koalas.internal import _InternalFrame, HIDDEN_COLUMNS, NATURAL_ORDER_COLUMN_NAME @@ -401,21 +401,35 @@ def _select_rows(self, rows_sel): elif len(self._internal.index_columns) == 1: sdf = self._kdf_or_kser._internal.sdf index_column = self._kdf_or_kser.index.to_series() + index_data_type = index_column.spark_type + start = rows_sel.start + stop = rows_sel.stop # get natural order from '__natural_order__' from start to stop - # based on index_columns to keep natural order. 
- start = sdf.select(NATURAL_ORDER_COLUMN_NAME) \ - .where(index_column._scol == rows_sel.start) \ - .first()[0] - stop = sdf.select(NATURAL_ORDER_COLUMN_NAME) \ - .where(index_column._scol == rows_sel.stop) \ - .first()[0] - + # to keep natural order when type of rows are StringType + if isinstance(index_data_type, StringType): + start = sdf.select(NATURAL_ORDER_COLUMN_NAME) \ + .where(index_column._scol == start) \ + .first() + stop = sdf.select(NATURAL_ORDER_COLUMN_NAME) \ + .where(index_column._scol == stop) \ + .first() + order_column = sdf[NATURAL_ORDER_COLUMN_NAME] + if start is not None: + start = start[0] + else: + raise KeyError(rows_sel.start) + if stop is not None: + stop = stop[0] + else: + raise KeyError(rows_sel.stop) + else: + order_column = index_column._scol cond = [] if start is not None: - cond.append(sdf[NATURAL_ORDER_COLUMN_NAME] >= F.lit(start)) + cond.append(order_column >= F.lit(start)) if stop is not None: - cond.append(sdf[NATURAL_ORDER_COLUMN_NAME] <= F.lit(stop)) + cond.append(order_column <= F.lit(stop)) if len(cond) > 0: return reduce(lambda x, y: x & y, cond), None diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py index c2ed505699..f1458de50a 100644 --- a/databricks/koalas/tests/test_indexing.py +++ b/databricks/koalas/tests/test_indexing.py @@ -223,6 +223,14 @@ def test_loc(self): self.assert_eq(kdf.loc[1000:], pdf.loc[1000:]) self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000]) + # KeyError test for string type index + kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], + index=['cobra', 'viper', 'sidewinder'], + columns=['max_speed', 'shield']) + + self.assertRaises(KeyError, lambda: kdf.loc['cobra':'koalas']) + self.assertRaises(KeyError, lambda: kdf.loc['koalas':'viper']) + def test_loc_non_informative_index(self): pdf = pd.DataFrame({'x': [1, 2, 3, 4]}, index=[10, 20, 30, 40]) kdf = ks.from_pandas(pdf) From f53d735f29e78bc4a48ca0666151ac011aff7ce3 Mon Sep 17 00:00:00 2001 From: 
itholic Date: Wed, 1 Jan 2020 16:50:42 +0900 Subject: [PATCH 03/14] fix to cast type properly --- databricks/koalas/indexing.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index b031e4992a..a8cfa3ec05 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -22,7 +22,7 @@ from pandas.api.types import is_list_like from pyspark import sql as spark from pyspark.sql import functions as F -from pyspark.sql.types import BooleanType, StringType +from pyspark.sql.types import BooleanType, StringType, LongType from pyspark.sql.utils import AnalysisException from databricks.koalas.internal import _InternalFrame, HIDDEN_COLUMNS, NATURAL_ORDER_COLUMN_NAME @@ -417,19 +417,24 @@ def _select_rows(self, rows_sel): order_column = sdf[NATURAL_ORDER_COLUMN_NAME] if start is not None: start = start[0] + elif rows_sel.start is None: + pass else: raise KeyError(rows_sel.start) if stop is not None: stop = stop[0] + elif rows_sel.stop is None: + pass else: raise KeyError(rows_sel.stop) + index_data_type = LongType() else: order_column = index_column._scol cond = [] if start is not None: - cond.append(order_column >= F.lit(start)) + cond.append(order_column >= F.lit(start).cast(index_data_type)) if stop is not None: - cond.append(order_column <= F.lit(stop)) + cond.append(order_column <= F.lit(stop).cast(index_data_type)) if len(cond) > 0: return reduce(lambda x, y: x & y, cond), None From 97bd89cb9aca895bda501fb34f00c84651874096 Mon Sep 17 00:00:00 2001 From: itholic Date: Thu, 2 Jan 2020 15:31:38 +0900 Subject: [PATCH 04/14] first() -> collect() --- databricks/koalas/indexing.py | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index a8cfa3ec05..22acf16223 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -408,24 +408,21 @@ def 
_select_rows(self, rows_sel): # get natural order from '__natural_order__' from start to stop # to keep natural order when type of rows are StringType if isinstance(index_data_type, StringType): - start = sdf.select(NATURAL_ORDER_COLUMN_NAME) \ - .where(index_column._scol == start) \ - .first() - stop = sdf.select(NATURAL_ORDER_COLUMN_NAME) \ - .where(index_column._scol == stop) \ - .first() + start_and_stop = ( + sdf.select(index_column._scol, NATURAL_ORDER_COLUMN_NAME) + .where((index_column._scol == start) | (index_column._scol == stop)) + .collect()) + + start = [row[1] for row in start_and_stop if row[0] == start] + start = start[0] if len(start) > 0 else None + + stop = [row[1] for row in start_and_stop if row[0] == stop] + stop = stop[0] if len(stop) > 0 else None + order_column = sdf[NATURAL_ORDER_COLUMN_NAME] - if start is not None: - start = start[0] - elif rows_sel.start is None: - pass - else: + if start is None and rows_sel.start is not None: raise KeyError(rows_sel.start) - if stop is not None: - stop = stop[0] - elif rows_sel.stop is None: - pass - else: + if stop is None and rows_sel.stop is not None: raise KeyError(rows_sel.stop) index_data_type = LongType() else: From 2a1f00fdd4f0d60244a3d4da09b742ce78415352 Mon Sep 17 00:00:00 2001 From: itholic Date: Mon, 6 Jan 2020 21:51:38 +0900 Subject: [PATCH 05/14] add test case & fix --- databricks/koalas/indexing.py | 30 ++++++++++++++---------- databricks/koalas/tests/test_indexing.py | 7 ++++++ 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index 22acf16223..3af939c789 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -404,29 +404,33 @@ def _select_rows(self, rows_sel): index_data_type = index_column.spark_type start = rows_sel.start stop = rows_sel.stop + order_column = sdf[NATURAL_ORDER_COLUMN_NAME] # get natural order from '__natural_order__' from start to stop - # to keep natural order when 
type of rows are StringType - if isinstance(index_data_type, StringType): - start_and_stop = ( - sdf.select(index_column._scol, NATURAL_ORDER_COLUMN_NAME) - .where((index_column._scol == start) | (index_column._scol == stop)) - .collect()) + # to keep natural order. + start_and_stop = ( + sdf.select(index_column._scol, NATURAL_ORDER_COLUMN_NAME) + .where((index_column._scol == start) | (index_column._scol == stop)) + .collect()) - start = [row[1] for row in start_and_stop if row[0] == start] - start = start[0] if len(start) > 0 else None + start = [row[1] for row in start_and_stop if row[0] == start] + start = start[0] if len(start) > 0 else None - stop = [row[1] for row in start_and_stop if row[0] == stop] - stop = stop[0] if len(stop) > 0 else None + stop = [row[1] for row in start_and_stop if row[0] == stop] + stop = stop[0] if len(stop) > 0 else None - order_column = sdf[NATURAL_ORDER_COLUMN_NAME] + if isinstance(index_data_type, StringType): + index_data_type = LongType() if start is None and rows_sel.start is not None: raise KeyError(rows_sel.start) if stop is None and rows_sel.stop is not None: raise KeyError(rows_sel.stop) - index_data_type = LongType() else: - order_column = index_column._scol + if start is None and stop is None: + start = rows_sel.start + stop = rows_sel.stop + order_column = index_column._scol + cond = [] if start is not None: cond.append(order_column >= F.lit(start).cast(index_data_type)) diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py index f1458de50a..72aa3916ea 100644 --- a/databricks/koalas/tests/test_indexing.py +++ b/databricks/koalas/tests/test_indexing.py @@ -222,6 +222,13 @@ def test_loc(self): self.assert_eq(kdf.loc[1000:], pdf.loc[1000:]) self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000]) + self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000]) + + # test when index and column have different type + kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[3, 2, 1]) + pdf = 
kdf.to_pandas() + + self.assert_eq(kdf.loc[3:2], pdf.loc[3:2]) # KeyError test for string type index kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], From d2c0ca3e4d69ffe5735d6f42a13eeaa51a3098f2 Mon Sep 17 00:00:00 2001 From: itholic Date: Tue, 7 Jan 2020 09:19:56 +0900 Subject: [PATCH 06/14] remove mistaken line --- databricks/koalas/tests/test_indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py index 72aa3916ea..ff4b6677d6 100644 --- a/databricks/koalas/tests/test_indexing.py +++ b/databricks/koalas/tests/test_indexing.py @@ -222,7 +222,6 @@ def test_loc(self): self.assert_eq(kdf.loc[1000:], pdf.loc[1000:]) self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000]) - self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000]) # test when index and column have different type kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[3, 2, 1]) From 921487750b1833a5ecc2d8ea114163967e1df817 Mon Sep 17 00:00:00 2001 From: itholic Date: Tue, 7 Jan 2020 10:49:13 +0900 Subject: [PATCH 07/14] remove comment in docstring --- databricks/koalas/indexing.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index 3af939c789..389ebc3315 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -290,9 +290,6 @@ class LocIndexer(_LocIndexerLike): Slice with labels for row and single label for column. As mentioned above, note that both the start and stop of the slice are included. - Also note that the row for 'sidewinder' is not included since 'sidewinder' - is not between 'cobra' and 'viper'. 
- >>> df.loc['cobra':'viper', 'max_speed'] cobra 1 viper 4 From 617899c9fded2f4b3fbbd18f1c450c8740fcb931 Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 8 Jan 2020 09:02:38 +0900 Subject: [PATCH 08/14] removed 236 --- databricks/koalas/indexing.py | 1 - 1 file changed, 1 deletion(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index 389ebc3315..ea324b7943 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -233,7 +233,6 @@ class LocIndexer(_LocIndexerLike): .. note:: Note that contrary to usual python slices, **both** the start and the stop are included, and the step of the slice is not allowed. - In addition, with a slice, Koalas works as a filter between the range. .. note:: With a list or array of labels for row selection, Koalas behaves as a filter without reordering by the labels. From 1092cf4b8b60baa17a03bf2a4932f1294d75e96e Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 8 Jan 2020 12:33:58 +0900 Subject: [PATCH 09/14] fix loc and related functions & add tests --- databricks/koalas/indexing.py | 37 ++++++++++++++++-------- databricks/koalas/series.py | 5 +--- databricks/koalas/tests/test_indexing.py | 12 ++++++-- 3 files changed, 35 insertions(+), 19 deletions(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index ea324b7943..53fc3f17c6 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -396,11 +396,13 @@ def _select_rows(self, rows_sel): return None, None elif len(self._internal.index_columns) == 1: sdf = self._kdf_or_kser._internal.sdf - index_column = self._kdf_or_kser.index.to_series() + index = self._kdf_or_kser.index + index_column = index.to_series() index_data_type = index_column.spark_type start = rows_sel.start stop = rows_sel.stop - order_column = sdf[NATURAL_ORDER_COLUMN_NAME] + start_order_column = sdf[NATURAL_ORDER_COLUMN_NAME] + stop_order_column = sdf[NATURAL_ORDER_COLUMN_NAME] # get natural order from 
'__natural_order__' from start to stop # to keep natural order. @@ -415,23 +417,34 @@ def _select_rows(self, rows_sel): stop = [row[1] for row in start_and_stop if row[0] == stop] stop = stop[0] if len(stop) > 0 else None - if isinstance(index_data_type, StringType): - index_data_type = LongType() - if start is None and rows_sel.start is not None: + # if index order is not monotonic increasing or decreasing + # and specified values don't exist in index, raise KeyError + if start is None and rows_sel.start is not None: + if not (index.is_monotonic_decreasing or index.is_monotonic_increasing): raise KeyError(rows_sel.start) - if stop is None and rows_sel.stop is not None: - raise KeyError(rows_sel.stop) - else: - if start is None and stop is None: + else: start = rows_sel.start + start_order_column = index_column._scol + if stop is None and rows_sel.stop is not None: + if not (index.is_monotonic_decreasing or index.is_monotonic_increasing): + raise KeyError(rows_sel.stop) + else: stop = rows_sel.stop - order_column = index_column._scol + stop_order_column = index_column._scol + + # we don't use StringType since we're using `__natural_order__` for comparing + if index_data_type is StringType: + index_data_type = LongType() + + # if start and stop are same, just get all start(or stop) values + if start == stop: + return index_column._scol == F.lit(rows_sel.start).cast(index_data_type), None cond = [] if start is not None: - cond.append(order_column >= F.lit(start).cast(index_data_type)) + cond.append(start_order_column >= F.lit(start).cast(index_data_type)) if stop is not None: - cond.append(order_column <= F.lit(stop).cast(index_data_type)) + cond.append(stop_order_column <= F.lit(stop).cast(index_data_type)) if len(cond) > 0: return reduce(lambda x, y: x & y, cond), None diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py index 1453b72db5..52b3836237 100644 --- a/databricks/koalas/series.py +++ b/databricks/koalas/series.py @@ -3441,10 +3441,7 @@ 
def truncate(self, before=None, after=None, copy=True): if before > after: raise ValueError("Truncate: %s must be after %s" % (after, before)) - if indexes_increasing: - result = _col(self.to_frame()[before:after]) - else: - result = _col(self.to_frame()[after:before]) + result = _col(self.to_frame()[before:after]) return result.copy() if copy else result diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py index ff4b6677d6..dda4c476e6 100644 --- a/databricks/koalas/tests/test_indexing.py +++ b/databricks/koalas/tests/test_indexing.py @@ -229,14 +229,20 @@ def test_loc(self): self.assert_eq(kdf.loc[3:2], pdf.loc[3:2]) - # KeyError test for string type index + # KeyError when index is not monotonic increasing or decreasing + # and specified values don't exist in index kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], - index=['cobra', 'viper', 'sidewinder'], - columns=['max_speed', 'shield']) + index=['cobra', 'viper', 'sidewinder']) self.assertRaises(KeyError, lambda: kdf.loc['cobra':'koalas']) self.assertRaises(KeyError, lambda: kdf.loc['koalas':'viper']) + kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], + index=[10, 30, 20]) + + self.assertRaises(KeyError, lambda: kdf.loc[0:30]) + self.assertRaises(KeyError, lambda: kdf.loc[10:100]) + def test_loc_non_informative_index(self): pdf = pd.DataFrame({'x': [1, 2, 3, 4]}, index=[10, 20, 30, 40]) kdf = ks.from_pandas(pdf) From 7948e4cc78f4d52ca77fe0151c036a966e8c05ea Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 8 Jan 2020 13:04:15 +0900 Subject: [PATCH 10/14] remove unnecessary tests --- databricks/koalas/tests/test_indexing.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py index dda4c476e6..f1cee8806f 100644 --- a/databricks/koalas/tests/test_indexing.py +++ b/databricks/koalas/tests/test_indexing.py @@ -223,12 +223,6 @@ def test_loc(self): self.assert_eq(kdf.loc[1000:], pdf.loc[1000:]) 
self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000]) - # test when index and column have different type - kdf = ks.DataFrame({'a': [1, 2, 3]}, index=[3, 2, 1]) - pdf = kdf.to_pandas() - - self.assert_eq(kdf.loc[3:2], pdf.loc[3:2]) - # KeyError when index is not monotonic increasing or decreasing # and specified values don't exist in index kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], From e4f2c1a5cb5b50fb1e73a011c65273b225f98d82 Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 8 Jan 2020 14:02:45 +0900 Subject: [PATCH 11/14] fix --- databricks/koalas/indexing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index 53fc3f17c6..2769a55b62 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -426,14 +426,14 @@ def _select_rows(self, rows_sel): start = rows_sel.start start_order_column = index_column._scol if stop is None and rows_sel.stop is not None: - if not (index.is_monotonic_decreasing or index.is_monotonic_increasing): + if not (index.is_monotonic_increasing or index.is_monotonic_decreasing): raise KeyError(rows_sel.stop) else: stop = rows_sel.stop stop_order_column = index_column._scol # we don't use StringType since we're using `__natural_order__` for comparing - if index_data_type is StringType: + if isinstance(index_data_type, StringType): index_data_type = LongType() # if start and stop are same, just get all start(or stop) values From e7311f5447ce23a14633c80fe7cc4d7cc1aa8641 Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 8 Jan 2020 14:11:30 +0900 Subject: [PATCH 12/14] fix --- databricks/koalas/indexing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index 2769a55b62..fd50b4caee 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -420,7 +420,7 @@ def _select_rows(self, rows_sel): # if index order is not monotonic increasing 
or decreasing # and specified values don't exist in index, raise KeyError if start is None and rows_sel.start is not None: - if not (index.is_monotonic_decreasing or index.is_monotonic_increasing): + if not (index.is_monotonic_increasing or index.is_monotonic_decreasing): raise KeyError(rows_sel.start) else: start = rows_sel.start From 62db0e2f3b9bfafab9bbf641301d61debe725fb2 Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 8 Jan 2020 15:30:37 +0900 Subject: [PATCH 13/14] fix data type handling --- databricks/koalas/indexing.py | 10 +++++----- databricks/koalas/tests/test_indexing.py | 8 ++++++++ 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index fd50b4caee..c822c6b790 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -415,7 +415,7 @@ def _select_rows(self, rows_sel): start = start[0] if len(start) > 0 else None stop = [row[1] for row in start_and_stop if row[0] == stop] - stop = stop[0] if len(stop) > 0 else None + stop = stop[-1] if len(stop) > 0 else None # if index order is not monotonic increasing or decreasing # and specified values don't exist in index, raise KeyError @@ -432,14 +432,14 @@ def _select_rows(self, rows_sel): stop = rows_sel.stop stop_order_column = index_column._scol - # we don't use StringType since we're using `__natural_order__` for comparing - if isinstance(index_data_type, StringType): - index_data_type = LongType() - # if start and stop are same, just get all start(or stop) values if start == stop: return index_column._scol == F.lit(rows_sel.start).cast(index_data_type), None + # we don't use StringType since we're using `__natural_order__` for comparing + if isinstance(index_data_type, StringType): + index_data_type = LongType() + cond = [] if start is not None: cond.append(start_order_column >= F.lit(start).cast(index_data_type)) diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py 
index f1cee8806f..59c1fc8fa2 100644 --- a/databricks/koalas/tests/test_indexing.py +++ b/databricks/koalas/tests/test_indexing.py @@ -223,6 +223,14 @@ def test_loc(self): self.assert_eq(kdf.loc[1000:], pdf.loc[1000:]) self.assert_eq(kdf.loc[-2000:-1000], pdf.loc[-2000:-1000]) + # duplicated index test + pdf = pd.DataFrame( + [1, 2, 3, 4, 5, 6, 7, 8, 9], + index=[0, 1, 1, 2, 2, 2, 3, 4, 5]) + kdf = ks.from_pandas(pdf) + + self.assert_eq(repr(kdf.loc[:2]), repr(pdf.loc[:2])) + # KeyError when index is not monotonic increasing or decreasing # and specified values don't exist in index kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]], From 87183522356388ec6c7d763c79074e95daacbdd7 Mon Sep 17 00:00:00 2001 From: itholic Date: Wed, 8 Jan 2020 15:55:58 +0900 Subject: [PATCH 14/14] fix & add test cases related with it --- databricks/koalas/indexing.py | 14 ++++++++------ databricks/koalas/tests/test_indexing.py | 6 ++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/databricks/koalas/indexing.py b/databricks/koalas/indexing.py index c822c6b790..5b079bd9ab 100644 --- a/databricks/koalas/indexing.py +++ b/databricks/koalas/indexing.py @@ -417,6 +417,10 @@ def _select_rows(self, rows_sel): stop = [row[1] for row in start_and_stop if row[0] == stop] stop = stop[-1] if len(stop) > 0 else None + # Assume we use the natural order by default. 
+ start_order_column_type = LongType() + stop_order_column_type = LongType() + # if index order is not monotonic increasing or decreasing # and specified values don't exist in index, raise KeyError if start is None and rows_sel.start is not None: @@ -425,26 +429,24 @@ def _select_rows(self, rows_sel): else: start = rows_sel.start start_order_column = index_column._scol + start_order_column_type = index_data_type if stop is None and rows_sel.stop is not None: if not (index.is_monotonic_increasing or index.is_monotonic_decreasing): raise KeyError(rows_sel.stop) else: stop = rows_sel.stop stop_order_column = index_column._scol + stop_order_column_type = index_data_type # if start and stop are same, just get all start(or stop) values if start == stop: return index_column._scol == F.lit(rows_sel.start).cast(index_data_type), None - # we don't use StringType since we're using `__natural_order__` for comparing - if isinstance(index_data_type, StringType): - index_data_type = LongType() - cond = [] if start is not None: - cond.append(start_order_column >= F.lit(start).cast(index_data_type)) + cond.append(start_order_column >= F.lit(start).cast(start_order_column_type)) if stop is not None: - cond.append(stop_order_column <= F.lit(stop).cast(index_data_type)) + cond.append(stop_order_column <= F.lit(stop).cast(stop_order_column_type)) if len(cond) > 0: return reduce(lambda x, y: x & y, cond), None diff --git a/databricks/koalas/tests/test_indexing.py b/databricks/koalas/tests/test_indexing.py index 59c1fc8fa2..85f5cd2196 100644 --- a/databricks/koalas/tests/test_indexing.py +++ b/databricks/koalas/tests/test_indexing.py @@ -231,6 +231,12 @@ def test_loc(self): self.assert_eq(repr(kdf.loc[:2]), repr(pdf.loc[:2])) + # test when type of key is string and given value is not included in key + pdf = pd.DataFrame([1, 2, 3], index=['a', 'b', 'd']).loc['a':'z'] + kdf = ks.from_pandas(pdf) + + self.assert_eq(repr(kdf.loc['a':'z']), repr(pdf.loc['a':'z'])) + # KeyError when index is 
not monotonic increasing or decreasing # and specified values don't exist in index kdf = ks.DataFrame([[1, 2], [4, 5], [7, 8]],