apache · zhengruifeng · Jan 4, 2024
diff --git a/dev/sparktestsupport/modules.py b/dev/sparktestsupport/modules.py
@@ -865,6 +865,7 @@ def __hash__(self):
         "pyspark.pandas.tests.frame.test_reshaping",
         "pyspark.pandas.tests.frame.test_spark",
         "pyspark.pandas.tests.frame.test_take",
+        "pyspark.pandas.tests.frame.test_take_adv",
         "pyspark.pandas.tests.frame.test_time_series",
         "pyspark.pandas.tests.frame.test_truncate",
         "pyspark.pandas.tests.io.test_io",
@@ -1165,6 +1166,7 @@ def __hash__(self):
         "pyspark.pandas.tests.connect.frame.test_parity_reshaping",
         "pyspark.pandas.tests.connect.frame.test_parity_spark",
         "pyspark.pandas.tests.connect.frame.test_parity_take",
+        "pyspark.pandas.tests.connect.frame.test_parity_take_adv",
         "pyspark.pandas.tests.connect.frame.test_parity_time_series",
         "pyspark.pandas.tests.connect.frame.test_parity_truncate",
         "pyspark.pandas.tests.connect.groupby.test_parity_aggregate",

diff --git a/python/pyspark/pandas/tests/connect/frame/test_parity_take.py b/python/pyspark/pandas/tests/connect/frame/test_parity_take.py
@@ -16,16 +16,17 @@
 #
 import unittest
 
-from pyspark import pandas as ps
 from pyspark.pandas.tests.frame.test_take import FrameTakeMixin
 from pyspark.testing.connectutils import ReusedConnectTestCase
 from pyspark.testing.pandasutils import PandasOnSparkTestUtils
 
 
-class FrameParityTakeTests(FrameTakeMixin, PandasOnSparkTestUtils, ReusedConnectTestCase):
-    @property
-    def psdf(self):
-        return ps.from_pandas(self.pdf)
+class FrameTakeParityTests(  # runs the FrameTakeMixin tests on the Connect test base
+    FrameTakeMixin,  # imported from pyspark.pandas.tests.frame.test_take
+    PandasOnSparkTestUtils,  # imported from pyspark.testing.pandasutils
+    ReusedConnectTestCase,  # imported from pyspark.testing.connectutils
+):
+    pass  # no overrides: everything is inherited from the three bases
 
 
 if __name__ == "__main__":

diff --git a/python/pyspark/pandas/tests/connect/frame/test_parity_take_adv.py b/python/pyspark/pandas/tests/connect/frame/test_parity_take_adv.py
@@ -0,0 +1,41 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+from pyspark.pandas.tests.frame.test_take_adv import FrameTakeAdvMixin
+from pyspark.testing.connectutils import ReusedConnectTestCase
+from pyspark.testing.pandasutils import PandasOnSparkTestUtils
+
+
+class FrameTakeAdvParityTests(  # runs the FrameTakeAdvMixin tests on the Connect test base
+    FrameTakeAdvMixin,  # imported from pyspark.pandas.tests.frame.test_take_adv
+    PandasOnSparkTestUtils,  # imported from pyspark.testing.pandasutils
+    ReusedConnectTestCase,  # imported from pyspark.testing.connectutils
+):
+    pass  # no overrides: everything is inherited from the three bases
+
+
+if __name__ == "__main__":  # only when this file is executed directly
+    from pyspark.pandas.tests.connect.frame.test_parity_take_adv import *  # noqa: F401
+
+    try:  # use the XML-report runner when xmlrunner can be imported
+        import xmlrunner  # type: ignore[import]
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None  # unittest.main then uses its default runner
+    unittest.main(testRunner=testRunner, verbosity=2)
diff --git a/python/pyspark/pandas/tests/frame/test_take.py b/python/pyspark/pandas/tests/frame/test_take.py
@@ -19,7 +19,7 @@
 import pandas as pd
 
 from pyspark import pandas as ps
-from pyspark.testing.pandasutils import ComparisonTestBase
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
 from pyspark.testing.sqlutils import SQLTestUtils
 
 
@@ -72,61 +72,12 @@ def test_take(self):
             pdf.take([-1, -2], axis=1).sort_index(),
         )
 
-        # MultiIndex columns
-        columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X"), ("C", "C")])
-        psdf.columns = columns
-        pdf.columns = columns
 
-        # MultiIndex columns with axis=0 (default)
-        self.assert_eq(psdf.take([1, 2]).sort_index(), pdf.take([1, 2]).sort_index())
-        self.assert_eq(psdf.take([-1, -2]).sort_index(), pdf.take([-1, -2]).sort_index())
-        self.assert_eq(
-            psdf.take(range(100, 110)).sort_index(), pdf.take(range(100, 110)).sort_index()
-        )
-        self.assert_eq(
-            psdf.take(range(-110, -100)).sort_index(), pdf.take(range(-110, -100)).sort_index()
-        )
-        self.assert_eq(
-            psdf.take([10, 100, 1000, 10000]).sort_index(),
-            pdf.take([10, 100, 1000, 10000]).sort_index(),
-        )
-        self.assert_eq(
-            psdf.take([-10, -100, -1000, -10000]).sort_index(),
-            pdf.take([-10, -100, -1000, -10000]).sort_index(),
-        )
-
-        # axis=1
-        self.assert_eq(
-            psdf.take([1, 2], axis=1).sort_index(), pdf.take([1, 2], axis=1).sort_index()
-        )
-        self.assert_eq(
-            psdf.take([-1, -2], axis=1).sort_index(), pdf.take([-1, -2], axis=1).sort_index()
-        )
-        self.assert_eq(
-            psdf.take(range(1, 3), axis=1).sort_index(),
-            pdf.take(range(1, 3), axis=1).sort_index(),
-        )
-        self.assert_eq(
-            psdf.take(range(-1, -3), axis=1).sort_index(),
-            pdf.take(range(-1, -3), axis=1).sort_index(),
-        )
-        self.assert_eq(
-            psdf.take([2, 1], axis=1).sort_index(),
-            pdf.take([2, 1], axis=1).sort_index(),
-        )
-        self.assert_eq(
-            psdf.take([-1, -2], axis=1).sort_index(),
-            pdf.take([-1, -2], axis=1).sort_index(),
-        )
-
-        # Checking the type of indices.
-        self.assertRaises(TypeError, lambda: psdf.take(1))
-        self.assertRaises(TypeError, lambda: psdf.take("1"))
-        self.assertRaises(TypeError, lambda: psdf.take({1, 2}))
-        self.assertRaises(TypeError, lambda: psdf.take({1: None, 2: None}))
-
-
-class FrameTakeTests(FrameTakeMixin, ComparisonTestBase, SQLTestUtils):
+class FrameTakeTests(  # concrete test class for the FrameTakeMixin tests
+    FrameTakeMixin,
+    PandasOnSparkTestCase,  # imported from pyspark.testing.pandasutils
+    SQLTestUtils,  # imported from pyspark.testing.sqlutils
+):
     pass
 
 

diff --git a/python/pyspark/pandas/tests/frame/test_take_adv.py b/python/pyspark/pandas/tests/frame/test_take_adv.py
@@ -0,0 +1,104 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import unittest
+
+import pandas as pd
+
+from pyspark import pandas as ps
+from pyspark.testing.pandasutils import PandasOnSparkTestCase
+from pyspark.testing.sqlutils import SQLTestUtils
+
+
+class FrameTakeAdvMixin:  # mixin: the host class must supply assert_eq and assertRaises
+    def test_take_adv(self):  # take() on MultiIndex columns: pandas-on-Spark vs pandas
+        pdf = pd.DataFrame(  # three integer columns, 50000 rows each
+            {"A": range(0, 50000), "B": range(100000, 0, -2), "C": range(100000, 50000, -1)}
+        )
+        psdf = ps.from_pandas(pdf)
+
+        # Give both frames the same two-level (MultiIndex) column labels.
+        columns = pd.MultiIndex.from_tuples([("A", "Z"), ("B", "X"), ("C", "C")])
+        psdf.columns = columns
+        pdf.columns = columns
+
+        # Row selection (axis=0, the default); both sides are index-sorted before comparing.
+        self.assert_eq(psdf.take([1, 2]).sort_index(), pdf.take([1, 2]).sort_index())
+        self.assert_eq(psdf.take([-1, -2]).sort_index(), pdf.take([-1, -2]).sort_index())
+        self.assert_eq(
+            psdf.take(range(100, 110)).sort_index(), pdf.take(range(100, 110)).sort_index()
+        )
+        self.assert_eq(
+            psdf.take(range(-110, -100)).sort_index(), pdf.take(range(-110, -100)).sort_index()
+        )
+        self.assert_eq(
+            psdf.take([10, 100, 1000, 10000]).sort_index(),
+            pdf.take([10, 100, 1000, 10000]).sort_index(),
+        )
+        self.assert_eq(
+            psdf.take([-10, -100, -1000, -10000]).sort_index(),
+            pdf.take([-10, -100, -1000, -10000]).sort_index(),
+        )
+
+        # Column selection (axis=1), by positive and negative positions.
+        self.assert_eq(
+            psdf.take([1, 2], axis=1).sort_index(), pdf.take([1, 2], axis=1).sort_index()
+        )
+        self.assert_eq(
+            psdf.take([-1, -2], axis=1).sort_index(), pdf.take([-1, -2], axis=1).sort_index()
+        )
+        self.assert_eq(
+            psdf.take(range(1, 3), axis=1).sort_index(),
+            pdf.take(range(1, 3), axis=1).sort_index(),
+        )
+        self.assert_eq(  # NOTE(review): range(-1, -3) is empty; was range(-1, -3, -1) meant?
+            psdf.take(range(-1, -3), axis=1).sort_index(),
+            pdf.take(range(-1, -3), axis=1).sort_index(),
+        )
+        self.assert_eq(
+            psdf.take([2, 1], axis=1).sort_index(),
+            pdf.take([2, 1], axis=1).sort_index(),
+        )
+        self.assert_eq(  # NOTE(review): repeats the [-1, -2] axis=1 comparison made above
+            psdf.take([-1, -2], axis=1).sort_index(),
+            pdf.take([-1, -2], axis=1).sort_index(),
+        )
+
+        # An int, a str, a set and a dict passed as indices must each raise TypeError.
+        self.assertRaises(TypeError, lambda: psdf.take(1))
+        self.assertRaises(TypeError, lambda: psdf.take("1"))
+        self.assertRaises(TypeError, lambda: psdf.take({1, 2}))
+        self.assertRaises(TypeError, lambda: psdf.take({1: None, 2: None}))
+
+
+class FrameTakeAdvTests(  # concrete test class for the FrameTakeAdvMixin tests
+    FrameTakeAdvMixin,  # defined earlier in this module
+    PandasOnSparkTestCase,  # imported from pyspark.testing.pandasutils
+    SQLTestUtils,  # imported from pyspark.testing.sqlutils
+):
+    pass  # no overrides: everything is inherited from the three bases
+
+
+if __name__ == "__main__":  # only when this file is executed directly
+    from pyspark.pandas.tests.frame.test_take_adv import *  # noqa: F401
+
+    try:  # use the XML-report runner when xmlrunner can be imported
+        import xmlrunner
+
+        testRunner = xmlrunner.XMLTestRunner(output="target/test-reports", verbosity=2)
+    except ImportError:
+        testRunner = None  # unittest.main then uses its default runner
+    unittest.main(testRunner=testRunner, verbosity=2)