databricks · ueshin · Sep 21, 2020 · Sep 14, 2020 · Sep 16, 2020 · Sep 16, 2020
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -6930,16 +6930,24 @@ def join(
             raise ValueError(
                 "columns overlap but no suffix specified: " "{rename}".format(rename=common)
             )
+
+        need_set_index = False
         if on:
+            if not is_list_like(on):
+                on = [on]  # type: ignore
+            if len(on) != right.index.nlevels:
+                raise ValueError(
+                    'len(left_on) must equal the number of levels in the index of "right"'
+                )
+
+            need_set_index = len(set(on) & set(self.index.names)) == 0
+        if need_set_index:
             self = self.set_index(on)
-            join_kdf = self.merge(
-                right, left_index=True, right_index=True, how=how, suffixes=(lsuffix, rsuffix)
-            ).reset_index()
-        else:
-            join_kdf = self.merge(
-                right, left_index=True, right_index=True, how=how, suffixes=(lsuffix, rsuffix)
-            )
-        return join_kdf
+
+        join_kdf = self.merge(
+            right, left_index=True, right_index=True, how=how, suffixes=(lsuffix, rsuffix)
+        )
+        return join_kdf.reset_index() if need_set_index else join_kdf
 
     def append(
         self,

diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -1576,13 +1576,9 @@ def test_join(self):
         pdf2 = pd.DataFrame(
             {"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]}, columns=["key", "B"]
         )
-        kdf1 = ks.DataFrame(
-            {"key": ["K0", "K1", "K2", "K3"], "A": ["A0", "A1", "A2", "A3"]}, columns=["key", "A"]
-        )
-        kdf2 = ks.DataFrame(
-            {"key": ["K0", "K1", "K2"], "B": ["B0", "B1", "B2"]}, columns=["key", "B"]
-        )
-        ks1 = ks.Series(["A1", "A5"], index=[1, 2], name="A")
+        kdf1 = ks.from_pandas(pdf1)
+        kdf2 = ks.from_pandas(pdf2)
+
         join_pdf = pdf1.join(pdf2, lsuffix="_left", rsuffix="_right")
         join_pdf.sort_values(by=list(join_pdf.columns), inplace=True)
 
@@ -1593,6 +1589,7 @@ def test_join(self):
 
         # join with duplicated columns in Series
         with self.assertRaisesRegex(ValueError, "columns overlap but no suffix specified"):
+            ks1 = ks.Series(["A1", "A5"], index=[1, 2], name="A")
             kdf1.join(ks1, how="outer")
         # join with duplicated columns in DataFrame
         with self.assertRaisesRegex(ValueError, "columns overlap but no suffix specified"):
@@ -1606,6 +1603,17 @@ def test_join(self):
         join_kdf.sort_values(by=list(join_kdf.columns), inplace=True)
         self.assert_eq(join_pdf.reset_index(drop=True), join_kdf.reset_index(drop=True))
 
+        join_pdf = pdf1.set_index("key").join(
+            pdf2.set_index("key"), on="key", lsuffix="_left", rsuffix="_right"
+        )
+        join_pdf.sort_values(by=list(join_pdf.columns), inplace=True)
+
+        join_kdf = kdf1.set_index("key").join(
+            kdf2.set_index("key"), on="key", lsuffix="_left", rsuffix="_right"
+        )
+        join_kdf.sort_values(by=list(join_kdf.columns), inplace=True)
+        self.assert_eq(join_pdf.reset_index(drop=True), join_kdf.reset_index(drop=True))
+
         # multi-index columns
         columns1 = pd.MultiIndex.from_tuples([("x", "key"), ("Y", "A")])
         columns2 = pd.MultiIndex.from_tuples([("x", "key"), ("Y", "B")])
@@ -1635,6 +1643,43 @@ def test_join(self):
 
         self.assert_eq(join_pdf.reset_index(drop=True), join_kdf.reset_index(drop=True))
 
+        join_pdf = pdf1.set_index(("x", "key")).join(
+            pdf2.set_index(("x", "key")), on=[("x", "key")], lsuffix="_left", rsuffix="_right"
+        )
+        join_pdf.sort_values(by=list(join_pdf.columns), inplace=True)
+
+        join_kdf = kdf1.set_index(("x", "key")).join(
+            kdf2.set_index(("x", "key")), on=[("x", "key")], lsuffix="_left", rsuffix="_right"
+        )
+        join_kdf.sort_values(by=list(join_kdf.columns), inplace=True)
+
+        self.assert_eq(join_pdf.reset_index(drop=True), join_kdf.reset_index(drop=True))
+
+        # multi-index
+        midx1 = pd.MultiIndex.from_tuples(
+            [("w", "a"), ("x", "b"), ("y", "c"), ("z", "d")], names=["index1", "index2"]
+        )
+        midx2 = pd.MultiIndex.from_tuples(
+            [("w", "a"), ("x", "b"), ("y", "c")], names=["index1", "index2"]
+        )
+        pdf1.index = midx1
+        pdf2.index = midx2
+        kdf1 = ks.from_pandas(pdf1)
+        kdf2 = ks.from_pandas(pdf2)
+
+        join_pdf = pdf1.join(pdf2, on=["index1", "index2"], rsuffix="_right")
+        join_pdf.sort_values(by=list(join_pdf.columns), inplace=True)
+
+        join_kdf = kdf1.join(kdf2, on=["index1", "index2"], rsuffix="_right")
+        join_kdf.sort_values(by=list(join_kdf.columns), inplace=True)
+
+        self.assert_eq(join_pdf, join_kdf)
+
+        with self.assertRaisesRegex(
+            ValueError, r'len\(left_on\) must equal the number of levels in the index of "right"'
+        ):
+            kdf1.join(kdf2, on=["index1"], rsuffix="_right")
+
     def test_replace(self):
         pdf = pd.DataFrame(
             {