Cleanup and add some more tests

HyukjinKwon · HyukjinKwon · commit 05f9fcd7f285 · 2019-08-12T10:26:48.000+09:00
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -400,8 +400,7 @@ def apply_op(kdf, this_columns, that_columns):
                 for this_column, that_column in zip(this_columns, that_columns):
                     yield getattr(kdf[this_column], op)(kdf[that_column])
 
-            return align_diff_frames(
-                apply_op, self, other, fillna=True, how="full", include_all_that_columns=False)
+            return align_diff_frames(apply_op, self, other, fillna=True, how="full")
         elif isinstance(other, DataFrame) and self is not other:
             # Same DataFrames
             for column in self._internal.data_columns:
@@ -6354,9 +6353,9 @@ def __getitem__(self, key):
     def __setitem__(self, key, value):
         from databricks.koalas.series import Series
 
-        if ((isinstance(value, Series) and value._kdf is not self) or
-                (isinstance(value, DataFrame) and value is not self)):
-            # Different (anchor) DataFrames
+        if (isinstance(value, Series) and value._kdf is not self) or \
+                (isinstance(value, DataFrame) and value is not self):
+            # Different Series or DataFrames
             if isinstance(value, Series):
                 value = value.to_frame()
 
@@ -6369,20 +6368,17 @@ def assign_columns(kdf, this_columns, that_columns):
                 # that_columns.
                 for k, this_column, that_column in zip_longest(key, this_columns, that_columns):
                     yield kdf[that_column].rename(k)
-                    if this_column is not None:
-                        # if both're same columns first one is higher priority.
+                    if this_column != k and this_column is not None:
                         yield kdf[this_column]
 
-            kdf = align_diff_frames(
-                assign_columns, self, value, fillna=False,
-                how="left", include_all_that_columns=True)
+            kdf = align_diff_frames(assign_columns, self, value, fillna=False, how="left")
         elif isinstance(key, (tuple, list)):
             assert isinstance(value, DataFrame)
             # Same DataFrames.
             field_names = value.columns
             kdf = self.assign(**{k: value[c] for k, c in zip(key, field_names)})
         else:
-            # Same anchor DataFrames.
+            # Same Series.
             kdf = self.assign(**{key: value})
 
         self._internal = kdf._internal
diff --git a/databricks/koalas/tests/test_ops_on_diff_frames.py b/databricks/koalas/tests/test_ops_on_diff_frames.py
@@ -62,6 +62,23 @@ def pdf4(self):
             'f': [2, 2, 2, 2, 2, 2, 2, 2, 2],
         }, index=list(range(9)))
 
+    @property
+    def pdf5(self):
+        return pd.DataFrame({
+            'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
+            'b': [4, 5, 6, 3, 2, 1, 0, 0, 0],
+            'c': [4, 5, 6, 3, 2, 1, 0, 0, 0],
+        }, index=[0, 1, 3, 5, 6, 8, 9, 10, 11]).set_index(['a', 'b'])
+
+    @property
+    def pdf6(self):
+        return pd.DataFrame({
+            'a': [9, 8, 7, 6, 5, 4, 3, 2, 1],
+            'b': [0, 0, 0, 4, 5, 6, 1, 2, 3],
+            'c': [9, 8, 7, 6, 5, 4, 3, 2, 1],
+            'e': [4, 5, 6, 3, 2, 1, 0, 0, 0],
+        }, index=list(range(9))).set_index(['a', 'b'])
+
     @property
     def kdf1(self):
         return ks.from_pandas(self.pdf1)
@@ -78,6 +95,23 @@ def kdf3(self):
     def kdf4(self):
         return ks.from_pandas(self.pdf4)
 
+    @property
+    def kdf5(self):
+        return ks.from_pandas(self.pdf5)
+
+    @property
+    def kdf6(self):
+        return ks.from_pandas(self.pdf6)
+
+    def test_no_index(self):
+        with self.assertRaisesRegex(AssertionError, "cannot join with no overlapping index name"):
+            ks.range(10) + ks.range(10)
+
+    def test_no_matched_index(self):
+        with self.assertRaisesRegex(AssertionError, "cannot join with no overlapping index name"):
+            ks.DataFrame({'a': [1, 2, 3]}).set_index('a') + \
+                ks.DataFrame({'b': [1, 2, 3]}).set_index('b')
+
     def test_arithmetic(self):
         # Series
         self.assertEqual(
@@ -196,6 +230,66 @@ def test_assignment_frame_chain(self):
 
         self.assert_eq(kdf.sort_index(), pdf.sort_index())
 
+    def test_multi_index_arithmetic(self):
+        # Series
+        self.assertEqual(
+            repr((self.kdf5.c - self.kdf6.e).sort_index()),
+            repr((self.pdf5.c - self.pdf6.e).rename("c").sort_index()))
+
+        self.assertEqual(
+            repr((self.kdf5["c"] / self.kdf6["e"]).sort_index()),
+            repr((self.pdf5["c"] / self.pdf6["e"]).rename("c").sort_index()))
+
+        # DataFrame
+        self.assert_eq(
+            repr((self.kdf5 + self.kdf6).sort_index()),
+            repr((self.pdf5 + self.pdf6).sort_index()))
+
+    def test_multi_index_assignment_series(self):
+        kdf = ks.from_pandas(self.pdf5)
+        pdf = self.pdf5.copy()
+        kdf['x'] = self.kdf6.e
+        pdf['x'] = self.pdf6.e
+
+        self.assert_eq(kdf.sort_index(), pdf.sort_index())
+
+        kdf = ks.from_pandas(self.pdf5)
+        pdf = self.pdf5.copy()
+        kdf['x'] = self.kdf6.e
+        pdf['x'] = self.pdf6.e
+
+        self.assert_eq(kdf.sort_index(), pdf.sort_index())
+
+        kdf = ks.from_pandas(self.pdf5)
+        pdf = self.pdf5.copy()
+        kdf['c'] = self.kdf6.e
+
+        pdf['c'] = self.pdf6.e
+
+        self.assert_eq(kdf.sort_index(), pdf.sort_index())
+
+    def test_multi_index_assignment_frame(self):
+        kdf = ks.from_pandas(self.pdf5)
+        pdf = self.pdf5.copy()
+        kdf[['c']] = self.kdf5
+        pdf[['c']] = self.pdf5
+
+        self.assert_eq(kdf.sort_index(), pdf.sort_index())
+
+        kdf = ks.from_pandas(self.pdf5)
+        pdf = self.pdf5.copy()
+        kdf[['x']] = self.kdf5
+        pdf[['x']] = self.pdf5
+
+        self.assert_eq(kdf.sort_index(), pdf.sort_index())
+
+        kdf = ks.from_pandas(self.pdf6)
+        pdf = self.pdf6.copy()
+        kdf[['x', 'y']] = self.kdf6
+        pdf[['x', 'y']] = self.pdf6
+
+        self.assert_eq(kdf.sort_index(), pdf.sort_index())
+
 
 class OpsOnDiffFramesDisabledTest(ReusedSQLTestCase, SQLTestUtils):
 
diff --git a/databricks/koalas/utils.py b/databricks/koalas/utils.py
@@ -35,8 +35,8 @@ def combine_frames(this, *args, how="full"):
     This method combines `this` DataFrame with a different `that` DataFrame or
     Series from a different DataFrame.
 
-    It returns a dataframe that has prefix, `this_` and `that_` to distinct
-    the columns names.
+    It returns a DataFrame whose column names carry the prefix `this_` or `that_` to
+    distinguish the columns coming from each of the two DataFrames.
 
     It internally performs a join operation which can be expensive in general.
     So, if `OPS_ON_DIFF_FRAMES` environment variable is not set,
@@ -70,11 +70,12 @@ def combine_frames(this, *args, how="full"):
         join_scols = []
         merged_index_scols = []
 
+        # When both sides have an index with the same name, that index is used as a join key.
         for this_column, this_name in this_index_map:
             for that_col, that_name in that_index_map:
                 if this_name == that_name:
-                    # We should map the actual Spark columns even if
-                    # the index names are the name.
+                    # We should merge the Spark columns into one
+                    # to mimic pandas' behavior.
                     this_scol = this._internal.scol_for(this_column)
                     that_scol = that._internal.scol_for(that_col)
                     join_scol = this_scol == that_scol
@@ -85,7 +86,7 @@ def combine_frames(this, *args, how="full"):
                         ).otherwise(that_scol).alias(this_column))
                     break
             else:
-                raise ValueError("Index names must be matched.")
+                raise ValueError("Index names must be exactly matched currently.")
 
         assert len(join_scols) > 0, "cannot join with no overlapping index names"
 
@@ -106,13 +107,13 @@ def combine_frames(this, *args, how="full"):
                          "it comes from a different dataframe")
 
 
-def align_diff_frames(func, this, that, fillna=True, how="full", include_all_that_columns=False):
+def align_diff_frames(resolve_func, this, that, fillna=True, how="full"):
     """
     This method aligns two different DataFrames with a given `func`. Columns are resolved and
     handled within the given `func`.
     To use this, `OPS_ON_DIFF_FRAMES` environment variable should be enabled, for now.
 
-    :param func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and
+    :param resolve_func: Takes aligned (joined) DataFrame, the column of the current DataFrame, and
         the column of another DataFrame. It returns an iterable that produces Series.
 
         >>> import os
@@ -152,15 +153,19 @@ def align_diff_frames(func, this, that, fillna=True, how="full", include_all_tha
     :param that: another DataFrame to align
     :param fillna: If True, it fills missing values in non-common columns in both `this` and `that`.
         Otherwise, it returns as are.
-    :param how: join way.
-    :param include_all_that_columns: If True, all non-common columns from `that` are added into
-        `that_columns` into `func`, and they are excluded in non-common columns group
-        (controlled by `fillna`). Otherwise, `this_columns` and `that_columns` will always in the
-        same common column group.
+    :param how: how to join, "full" or "left". It also affects what `resolve_func` resolves.
+        - full: `resolve_func` resolves only the columns common to `this` and `that`.
+            For instance, if `this` has columns A, B, C and `that` has B, C, D, both
+            `this_columns` and `that_columns` passed to `resolve_func` are B, C.
+        - left: `resolve_func` also resolves the columns that exist only in `that`.
+            For instance, if `this` has columns A, B, C and `that` has B, C, D, `this_columns`
+            is B, C but `that_columns` is B, C, D.
     :return: Alined DataFrame
     """
     from databricks.koalas import DataFrame
 
+    assert how == "full" or how == "left"
+
     this_data_columns = this._internal.data_columns
     that_data_columns = that._internal.data_columns
     common_columns = set(this_data_columns).intersection(that_data_columns)
@@ -185,11 +190,12 @@ def align_diff_frames(func, this, that, fillna=True, how="full", include_all_tha
                 that_columns_to_apply.append(combined_column)
                 break
         else:
-            if include_all_that_columns and \
+            if how == "left" and \
                     combined_column in ["__that_%s" % c for c in that_data_columns]:
-                # In this case, we will drop that columns in columns to keep but passes it later
-                # to `func`. Note that adding this into a separate list is intentional so that
-                # `this_columns` and `that_columns` can be paired.
+                # In this case, the column is left out of `columns_to_keep` and is passed
+                # to `resolve_func` later, which is responsible for resolving it.
+                # Note that collecting it in a separate list (`additional_that_columns`)
+                # is intentional so that `this_columns` and `that_columns` can be paired.
                 additional_that_columns.append(combined_column)
             elif fillna:
                 columns_to_keep.append(F.lit(None).cast(FloatType()).alias(combined_column))
@@ -200,7 +206,7 @@ def align_diff_frames(func, this, that, fillna=True, how="full", include_all_tha
 
     # Should extract columns to apply and do it in a batch in case
     # it adds new columns for example.
-    kser_set = list(func(combined, this_columns_to_apply, that_columns_to_apply))
+    kser_set = list(resolve_func(combined, this_columns_to_apply, that_columns_to_apply))
     columns_applied = [c._scol for c in kser_set]
 
     sdf = combined._sdf.select(