Commit 1737eb0

Complete NumPy universal functions for DataFrames
1 parent eb763ea commit 1737eb0

2 files changed: +121 −30 lines changed

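With this change, NumPy universal functions (ufuncs) called on a Koalas DataFrame dispatch through the new __array_ufunc__ hook and evaluate as Spark column expressions, returning a Koalas DataFrame just as they already did for Series. A minimal usage sketch, assuming a Koalas build that includes this commit (np.sqrt and np.add stand in for any ufunc in Koalas' supported mappings):

    import numpy as np
    import pandas as pd
    import databricks.koalas as ks

    kdf = ks.from_pandas(pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}))

    # Unary ufunc: applied column by column, returns a Koalas DataFrame.
    print(np.sqrt(kdf))

    # Binary ufunc with a scalar: the scalar is passed through to every column.
    print(np.add(kdf, 1))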

databricks/koalas/frame.py (+41 −1)
@@ -26,7 +26,7 @@
 from functools import partial, reduce
 import sys
 from itertools import zip_longest
-from typing import Any, Optional, List, Tuple, Union, Generic, TypeVar, Iterable, Dict
+from typing import Any, Optional, List, Tuple, Union, Generic, TypeVar, Iterable, Dict, Callable

 import numpy as np
 import pandas as pd
@@ -8499,6 +8499,46 @@ def __dir__(self):
     def __iter__(self):
         return iter(self.columns)

+    # NDArray Compat
+    def __array_ufunc__(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any):
+        # TODO: is it possible to deduplicate it with '_map_series_op'?
+        if (all(isinstance(inp, DataFrame) for inp in inputs)
+                and any(inp is not inputs[0] for inp in inputs)):
+            # binary only
+            assert len(inputs) == 2
+            this = inputs[0]
+            that = inputs[1]
+            if this._internal.column_index_level != that._internal.column_index_level:
+                raise ValueError('cannot join with no overlapping index names')
+
+            # Different DataFrames
+            def apply_op(kdf, this_column_index, that_column_index):
+                for this_idx, that_idx in zip(this_column_index, that_column_index):
+                    yield (ufunc(kdf[this_idx], kdf[that_idx], **kwargs), this_idx)
+
+            return align_diff_frames(apply_op, this, that, fillna=True, how="full")
+        else:
+            # DataFrame and Series
+            applied = []
+            this = inputs[0]
+            assert all(inp is this for inp in inputs if isinstance(inp, DataFrame))
+
+            for idx in this._internal.column_index:
+                arguments = []
+                for inp in inputs:
+                    arguments.append(inp[idx] if isinstance(inp, DataFrame) else inp)
+                # both binary and unary.
+                applied.append(ufunc(*arguments, **kwargs))
+
+            sdf = this._sdf.select(
+                this._internal.index_scols + [c._scol for c in applied])
+            internal = this._internal.copy(sdf=sdf,
+                                           column_index=[c._internal.column_index[0]
+                                                         for c in applied],
+                                           column_scols=[scol_for(sdf, c._internal.data_columns[0])
+                                                         for c in applied])
+            return DataFrame(internal)
+
     if sys.version_info >= (3, 7):
         def __class_getitem__(cls, params):
             # This is a workaround to support variadic generic in DataFrame in Python 3.7.
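The new method takes two paths. When the inputs are two distinct DataFrames, the ufunc is routed through align_diff_frames (a full outer alignment with fillna=True), which requires the 'compute.ops_on_diff_frames' option because it triggers a join under the hood. Otherwise every DataFrame input must be the same anchor frame, and the ufunc is applied column by column. A hedged sketch of the diff-frames branch from the caller's side, assuming a Koalas build with this commit:

    import numpy as np
    import pandas as pd
    import databricks.koalas as ks

    kdf1 = ks.from_pandas(pd.DataFrame({'a': [1, 2], 'b': [3, 4]}))
    kdf2 = ks.from_pandas(pd.DataFrame({'a': [10, 20], 'b': [30, 40]}))

    # Operating on two different DataFrames must be enabled explicitly.
    ks.set_option('compute.ops_on_diff_frames', True)
    try:
        # Dispatches to DataFrame.__array_ufunc__, which aligns the two
        # frames via align_diff_frames(..., fillna=True, how="full").
        print(np.add(kdf1, kdf2).sort_index())
    finally:
        ks.reset_option('compute.ops_on_diff_frames')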

databricks/koalas/tests/test_numpy_compat.py (+80 −29)
@@ -23,6 +23,29 @@


 class NumPyCompatTest(ReusedSQLTestCase, SQLTestUtils):
+    blacklist = [
+        # Koalas does not currently support
+        "conj",
+        "conjugate",
+        "isnat",
+        "matmul",
+        "frexp",
+
+        # Values are close enough but tests failed.
+        "arccos",
+        "exp",
+        "expm1",
+        "log",  # flaky
+        "log10",  # flaky
+        "log1p",  # flaky
+        "modf",
+        "floor_divide",  # flaky
+
+        # Results seem inconsistent in a different version of, I (Hyukjin) suspect, PyArrow.
+        # From PyArrow 0.15, seems it returns the correct results via PySpark. Probably we
+        # can enable it later when Koalas switches to PyArrow 0.15 completely.
+        "left_shift",
+    ]

     @property
     def pdf(self):
@@ -49,12 +72,17 @@ def test_np_add_index(self):
         p_index = self.pdf.index
         self.assert_eq(np.add(k_index, k_index), np.add(p_index, p_index))

-    def test_np_unsupported(self):
+    def test_np_unsupported_series(self):
         kdf = self.kdf
         with self.assertRaisesRegex(NotImplementedError, "Koalas.*not.*support.*sqrt.*"):
             np.sqrt(kdf.a, kdf.b)

-    def test_np_spark_compat(self):
+    def test_np_unsupported_frame(self):
+        kdf = self.kdf
+        with self.assertRaisesRegex(NotImplementedError, "Koalas.*not.*support.*sqrt.*"):
+            np.sqrt(kdf, kdf)
+
+    def test_np_spark_compat_series(self):
         # Use randomly generated dataFrame
         pdf = pd.DataFrame(
             np.random.randint(-100, 100, size=(np.random.randint(100), 2)), columns=['a', 'b'])
@@ -63,33 +91,9 @@ def test_np_spark_compat(self):
         kdf = ks.from_pandas(pdf)
         kdf2 = ks.from_pandas(pdf2)

-        blacklist = [
-            # Koalas does not currently support
-            "conj",
-            "conjugate",
-            "isnat",
-            "matmul",
-            "frexp",
-
-            # Values are close enough but tests failed.
-            "arccos",
-            "exp",
-            "expm1",
-            "log",  # flaky
-            "log10",  # flaky
-            "log1p",  # flaky
-            "modf",
-            "floor_divide",  # flaky
-
-            # Results seem inconsistent in a different version of, I (Hyukjin) suspect, PyArrow.
-            # From PyArrow 0.15, seems it returns the correct results via PySpark. Probably we
-            # can enable it later when Koalas switches to PyArrow 0.15 completely.
-            "left_shift",
-        ]
-
         for np_name, spark_func in unary_np_spark_mappings.items():
             np_func = getattr(np, np_name)
-            if np_name not in blacklist:
+            if np_name not in self.blacklist:
                 try:
                     # unary ufunc
                     self.assert_eq(np_func(pdf.a), np_func(kdf.a), almost=True)
@@ -98,7 +102,7 @@ def test_np_spark_compat(self):

         for np_name, spark_func in binary_np_spark_mappings.items():
             np_func = getattr(np, np_name)
-            if np_name not in blacklist:
+            if np_name not in self.blacklist:
                 try:
                     # binary ufunc
                     self.assert_eq(
@@ -113,7 +117,7 @@ def test_np_spark_compat(self):
             set_option('compute.ops_on_diff_frames', True)
             for np_name, spark_func in list(binary_np_spark_mappings.items())[:5]:
                 np_func = getattr(np, np_name)
-                if np_name not in blacklist:
+                if np_name not in self.blacklist:
                     try:
                         # binary ufunc
                         self.assert_eq(
@@ -123,3 +127,50 @@ def test_np_spark_compat(self):
                         raise AssertionError("Test in '%s' function was failed." % np_name) from e
                     finally:
                         reset_option('compute.ops_on_diff_frames')
+
+    def test_np_spark_compat_frame(self):
+        # Use randomly generated dataFrame
+        pdf = pd.DataFrame(
+            np.random.randint(-100, 100, size=(np.random.randint(100), 2)), columns=['a', 'b'])
+        pdf2 = pd.DataFrame(
+            np.random.randint(-100, 100, size=(len(pdf), len(pdf.columns))), columns=['a', 'b'])
+        kdf = ks.from_pandas(pdf)
+        kdf2 = ks.from_pandas(pdf2)
+
+        for np_name, spark_func in unary_np_spark_mappings.items():
+            np_func = getattr(np, np_name)
+            if np_name not in self.blacklist:
+                try:
+                    # unary ufunc
+                    self.assert_eq(np_func(pdf), np_func(kdf), almost=True)
+                except Exception as e:
+                    raise AssertionError("Test in '%s' function was failed." % np_name) from e
+
+        for np_name, spark_func in binary_np_spark_mappings.items():
+            np_func = getattr(np, np_name)
+            if np_name not in self.blacklist:
+                try:
+                    # binary ufunc
+                    self.assert_eq(
+                        np_func(pdf, pdf), np_func(kdf, kdf), almost=True)
+                    self.assert_eq(
+                        np_func(pdf, 1), np_func(kdf, 1), almost=True)
+                except Exception as e:
+                    raise AssertionError("Test in '%s' function was failed." % np_name) from e
+
+        # Test only top 5 for now. 'compute.ops_on_diff_frames' option increases too much time.
+        try:
+            set_option('compute.ops_on_diff_frames', True)
+            for np_name, spark_func in list(binary_np_spark_mappings.items())[:5]:
+                np_func = getattr(np, np_name)
+                if np_name not in self.blacklist:
+                    try:
+                        # binary ufunc
+                        self.assert_eq(
+                            np_func(pdf, pdf2).sort_index(),
+                            np_func(kdf, kdf2).sort_index(), almost=True)

+                    except Exception as e:
+                        raise AssertionError("Test in '%s' function was failed." % np_name) from e
+        finally:
+            reset_option('compute.ops_on_diff_frames')
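The new test_np_spark_compat_frame mirrors the Series variant: it sweeps unary_np_spark_mappings and binary_np_spark_mappings from databricks.koalas.numpy_compat and compares each non-blacklisted ufunc's result on a Koalas DataFrame against pandas. A minimal standalone version of the same check for a single ufunc might look like this (np.sin is an arbitrary non-blacklisted choice):

    import numpy as np
    import pandas as pd
    import databricks.koalas as ks

    pdf = pd.DataFrame(np.random.randint(-100, 100, size=(50, 2)), columns=['a', 'b'])
    kdf = ks.from_pandas(pdf)

    # Apply the ufunc on both sides and compare, tolerating float rounding
    # differences (the test suite passes almost=True for the same reason).
    expected = np.sin(pdf).sort_index()
    actual = np.sin(kdf).to_pandas().sort_index()
    pd.testing.assert_frame_equal(actual, expected, check_exact=False)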
