
Commit 8b6204c

resolve conflicts

2 parents: c356081 + 2bd7adc

22 files changed: +972 -190 lines

.travis.yml (+1 -1)

@@ -37,7 +37,7 @@ matrix:
         - JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk-amd64
         - SPARK_VERSION=2.4.4
         - PANDAS_VERSION=0.25.3
-        - PYARROW_VERSION=0.15.1
+        - PYARROW_VERSION=0.14.1

 before_install:
   - ./dev/download_travis_dependencies.sh

databricks/koalas/base.py (+229 -19)

@@ -19,7 +19,7 @@
 """

 from functools import wraps
-from typing import Union
+from typing import Union, Callable, Any

 import numpy as np
 import pandas as pd
@@ -30,10 +30,32 @@
 from pyspark.sql.functions import monotonically_increasing_id

 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
-from databricks.koalas.frame import DataFrame
-from databricks.koalas.internal import _InternalFrame
+from databricks.koalas import numpy_compat
+from databricks.koalas.internal import _InternalFrame, SPARK_INDEX_NAME_FORMAT
 from databricks.koalas.typedef import pandas_wraps, spark_type_to_pandas_dtype
 from databricks.koalas.utils import align_diff_series, scol_for, validate_axis
+from databricks.koalas.frame import DataFrame
+
+
+def booleanize_null(left_scol, scol, f):
+    """
+    Booleanize Null in Spark Column
+    """
+    comp_ops = [getattr(spark.Column, '__{}__'.format(comp_op))
+                for comp_op in ['eq', 'ne', 'lt', 'le', 'ge', 'gt']]
+
+    if f in comp_ops:
+        # if `f` is "!=", fill null with True otherwise False
+        filler = f == spark.Column.__ne__
+        scol = F.when(scol.isNull(), filler).otherwise(scol)
+
+    elif f == spark.Column.__or__:
+        scol = F.when(left_scol.isNull() | scol.isNull(), False).otherwise(scol)
+
+    elif f == spark.Column.__and__:
+        scol = F.when(scol.isNull(), False).otherwise(scol)
+
+    return scol


 def _column_op(f):
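The new booleanize_null helper is what gives Koalas comparison results pandas-style semantics instead of Spark SQL's three-valued logic: a NULL comparison result is filled with False, or with True for `!=`. A minimal usage sketch, not part of the commit, assuming a running Spark session and the databricks.koalas package:

    import numpy as np
    import databricks.koalas as ks

    kser = ks.Series([1.0, 2.0, np.nan])
    (kser == 1.0).to_pandas()   # the NaN row comes back False instead of SQL NULL
    (kser != 1.0).to_pandas()   # the NaN row comes back True, matching pandas
    (kser < 2.0).to_pandas()    # other comparisons also fill NULL with False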
@@ -57,27 +79,14 @@ def wrapper(self, *args):
             # Same DataFrame anchors
             args = [arg._scol if isinstance(arg, IndexOpsMixin) else arg for arg in args]
             scol = f(self._scol, *args)
-
-            # check if `f` is a comparison operator
-            comp_ops = ['eq', 'ne', 'lt', 'le', 'ge', 'gt']
-            is_comp_op = any(f == getattr(spark.Column, '__{}__'.format(comp_op))
-                             for comp_op in comp_ops)
-
-            if is_comp_op:
-                filler = f == spark.Column.__ne__
-                scol = F.when(scol.isNull(), filler).otherwise(scol)
-
-            elif f == spark.Column.__or__:
-                scol = F.when(self._scol.isNull() | scol.isNull(), False).otherwise(scol)
-
-            elif f == spark.Column.__and__:
-                scol = F.when(scol.isNull(), False).otherwise(scol)
+            scol = booleanize_null(self._scol, scol, f)

             return self._with_new_scol(scol)
         else:
             # Different DataFrame anchors
             def apply_func(this_column, *that_columns):
-                return f(this_column, *that_columns)
+                scol = f(this_column, *that_columns)
+                return booleanize_null(this_column, scol, f)

             return align_diff_series(apply_func, self, *args, how="full")

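Because apply_func now also routes its result through booleanize_null, the same null handling applies when the operands are anchored to different DataFrames. A hedged sketch, assuming the `compute.ops_on_diff_frames` option exists and is enabled in the Koalas version in use:

    import numpy as np
    import databricks.koalas as ks

    ks.set_option('compute.ops_on_diff_frames', True)  # allow cross-frame operations
    kdf1 = ks.DataFrame({'a': [1.0, 2.0, np.nan]})
    kdf2 = ks.DataFrame({'a': [1.0, np.nan, 3.0]})
    (kdf1.a == kdf2.a).sort_index()   # rows involving NaN compare as False, not NULL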
@@ -216,6 +225,23 @@ def __rfloordiv__(self, other):
     __rand__ = _column_op(spark.Column.__rand__)
     __ror__ = _column_op(spark.Column.__ror__)

+    # NDArray Compat
+    def __array_ufunc__(self, ufunc: Callable, method: str, *inputs: Any, **kwargs: Any):
+        # Try dunder methods first.
+        result = numpy_compat.maybe_dispatch_ufunc_to_dunder_op(
+            self, ufunc, method, *inputs, **kwargs)
+
+        # After that, we try with PySpark APIs.
+        if result is NotImplemented:
+            result = numpy_compat.maybe_dispatch_ufunc_to_spark_func(
+                self, ufunc, method, *inputs, **kwargs)
+
+        if result is not NotImplemented:
+            return result
+        else:
+            # TODO: support more APIs?
+            raise NotImplementedError("Koalas objects currently do not support %s." % ufunc)
+
     @property
     def dtype(self):
         """Return the dtype object of the underlying data.
@@ -763,3 +789,187 @@ def _shift(self, periods, fill_value, part_cols=()):
         lag_col = F.lag(col, periods).over(window)
         col = F.when(lag_col.isNull() | F.isnan(lag_col), fill_value).otherwise(lag_col)
         return self._with_new_scol(col).rename(self.name)
+
+    # TODO: Update Documentation for Bins Parameter when its supported
+    def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
+        """
+        Return a Series containing counts of unique values.
+        The resulting object will be in descending order so that the
+        first element is the most frequently-occurring element.
+        Excludes NA values by default.
+
+        Parameters
+        ----------
+        normalize : boolean, default False
+            If True then the object returned will contain the relative
+            frequencies of the unique values.
+        sort : boolean, default True
+            Sort by values.
+        ascending : boolean, default False
+            Sort in ascending order.
+        bins : Not Yet Supported
+        dropna : boolean, default True
+            Don't include counts of NaN.
+
+        Returns
+        -------
+        counts : Series
+
+        See Also
+        --------
+        Series.count: Number of non-NA elements in a Series.
+
+        Examples
+        --------
+        For Series
+
+        >>> df = ks.DataFrame({'x':[0, 0, 1, 1, 1, np.nan]})
+        >>> df.x.value_counts()  # doctest: +NORMALIZE_WHITESPACE
+        1.0    3
+        0.0    2
+        Name: x, dtype: int64
+
+        With `normalize` set to `True`, returns the relative frequency by
+        dividing all values by the sum of values.
+
+        >>> df.x.value_counts(normalize=True)  # doctest: +NORMALIZE_WHITESPACE
+        1.0    0.6
+        0.0    0.4
+        Name: x, dtype: float64
+
+        **dropna**
+        With `dropna` set to `False` we can also see NaN index values.
+
+        >>> df.x.value_counts(dropna=False)  # doctest: +NORMALIZE_WHITESPACE
+        1.0    3
+        0.0    2
+        NaN    1
+        Name: x, dtype: int64
+
+        For Index
+
+        >>> from databricks.koalas.indexes import Index
+        >>> idx = Index([3, 1, 2, 3, 4, np.nan])
+        >>> idx
+        Float64Index([3.0, 1.0, 2.0, 3.0, 4.0, nan], dtype='float64')
+
+        >>> idx.value_counts().sort_index()
+        1.0    1
+        2.0    1
+        3.0    2
+        4.0    1
+        Name: count, dtype: int64
+
+        **sort**
+
+        With `sort` set to `False`, the result wouldn't be sorted by number of count.
+
+        >>> idx.value_counts(sort=True).sort_index()
+        1.0    1
+        2.0    1
+        3.0    2
+        4.0    1
+        Name: count, dtype: int64
+
+        **normalize**
+
+        With `normalize` set to `True`, returns the relative frequency by
+        dividing all values by the sum of values.
+
+        >>> idx.value_counts(normalize=True).sort_index()
+        1.0    0.2
+        2.0    0.2
+        3.0    0.4
+        4.0    0.2
+        Name: count, dtype: float64
+
+        **dropna**
+
+        With `dropna` set to `False` we can also see NaN index values.
+
+        >>> idx.value_counts(dropna=False).sort_index()  # doctest: +SKIP
+        1.0    1
+        2.0    1
+        3.0    2
+        4.0    1
+        NaN    1
+        Name: count, dtype: int64
+
+        For MultiIndex.
+
+        >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
+        ...                       ['speed', 'weight', 'length']],
+        ...                      [[0, 0, 0, 1, 1, 1, 2, 2, 2],
+        ...                       [1, 1, 1, 1, 1, 2, 1, 2, 2]])
+        >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3], index=midx)
+        >>> s.index  # doctest: +SKIP
+        MultiIndex([(  'lama', 'weight'),
+                    (  'lama', 'weight'),
+                    (  'lama', 'weight'),
+                    (   'cow', 'weight'),
+                    (   'cow', 'weight'),
+                    (   'cow', 'length'),
+                    ('falcon', 'weight'),
+                    ('falcon', 'length'),
+                    ('falcon', 'length')],
+                   )
+
+        >>> s.index.value_counts().sort_index()
+        (cow, length)       1
+        (cow, weight)       2
+        (falcon, length)    2
+        (falcon, weight)    1
+        (lama, weight)      3
+        Name: count, dtype: int64
+
+        >>> s.index.value_counts(normalize=True).sort_index()
+        (cow, length)       0.111111
+        (cow, weight)       0.222222
+        (falcon, length)    0.222222
+        (falcon, weight)    0.111111
+        (lama, weight)      0.333333
+        Name: count, dtype: float64
+
+        If Index has name, keep the name up.
+
+        >>> idx = Index([0, 0, 0, 1, 1, 2, 3], name='koalas')
+        >>> idx.value_counts().sort_index()
+        0    3
+        1    2
+        2    1
+        3    1
+        Name: koalas, dtype: int64
+        """
+        from databricks.koalas.series import Series, _col
+        if bins is not None:
+            raise NotImplementedError("value_counts currently does not support bins")
+
+        if dropna:
+            sdf_dropna = self._internal._sdf.dropna()
+        else:
+            sdf_dropna = self._internal._sdf
+        index_name = SPARK_INDEX_NAME_FORMAT(0)
+        sdf = sdf_dropna.groupby(self._scol.alias(index_name)).count()
+        if sort:
+            if ascending:
+                sdf = sdf.orderBy(F.col('count'))
+            else:
+                sdf = sdf.orderBy(F.col('count').desc())
+
+        if normalize:
+            sum = sdf_dropna.count()
+            sdf = sdf.withColumn('count', F.col('count') / F.lit(sum))
+
+        column_index = self._internal.column_index
+        if (column_index[0] is None) or (None in column_index[0]):
+            internal = _InternalFrame(sdf=sdf,
+                                      index_map=[(index_name, None)],
+                                      column_scols=[scol_for(sdf, 'count')])
+        else:
+            internal = _InternalFrame(sdf=sdf,
+                                      index_map=[(index_name, None)],
+                                      column_index=column_index,
+                                      column_scols=[scol_for(sdf, 'count')],
+                                      column_index_names=self._internal.column_index_names)
+
+        return _col(DataFrame(internal))
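For reference, the query this method assembles is essentially a group-by count over the underlying Spark DataFrame, ordered by count and optionally divided by the total. A standalone PySpark sketch of the same idea (illustrative only; the column name 'x' and the sample data are hypothetical):

    from pyspark.sql import SparkSession, functions as F

    spark = SparkSession.builder.getOrCreate()
    sdf = spark.createDataFrame([(0.0,), (0.0,), (1.0,), (1.0,), (1.0,), (None,)], 'x double')

    counts = sdf.dropna().groupby('x').count().orderBy(F.col('count').desc())  # sort=True, ascending=False
    total = sdf.dropna().count()
    normalized = counts.withColumn('count', F.col('count') / F.lit(total))     # the normalize=True case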

databricks/koalas/frame.py (+56)

@@ -1100,6 +1100,62 @@ def iteritems(self) -> Iterable:
         cols = list(self.columns)
         return list((col_name, self[col_name]) for col_name in cols)

+    def iterrows(self):
+        """
+        Iterate over DataFrame rows as (index, Series) pairs.
+
+        Yields
+        ------
+        index : label or tuple of label
+            The index of the row. A tuple for a `MultiIndex`.
+        data : pandas.Series
+            The data of the row as a Series.
+
+        it : generator
+            A generator that iterates over the rows of the frame.
+
+        Notes
+        -----
+
+        1. Because ``iterrows`` returns a Series for each row,
+           it does **not** preserve dtypes across the rows (dtypes are
+           preserved across columns for DataFrames). For example,
+
+           >>> df = ks.DataFrame([[1, 1.5]], columns=['int', 'float'])
+           >>> row = next(df.iterrows())[1]
+           >>> row
+           int      1.0
+           float    1.5
+           Name: 0, dtype: float64
+           >>> print(row['int'].dtype)
+           float64
+           >>> print(df['int'].dtype)
+           int64
+
+           To preserve dtypes while iterating over the rows, it is better
+           to use :meth:`itertuples` which returns namedtuples of the values
+           and which is generally faster than ``iterrows``.
+
+        2. You should **never modify** something you are iterating over.
+           This is not guaranteed to work in all cases. Depending on the
+           data types, the iterator returns a copy and not a view, and writing
+           to it will have no effect.
+        """
+
+        columns = self.columns
+        internal_index_columns = self._internal.index_columns
+        internal_data_columns = self._internal.data_columns
+
+        def extract_kv_from_spark_row(row):
+            k = row[internal_index_columns[0]] if len(internal_index_columns) == 1 else tuple(
+                row[c] for c in internal_index_columns)
+            v = [row[c] for c in internal_data_columns]
+            return k, v
+
+        for k, v in map(extract_kv_from_spark_row, self._sdf.toLocalIterator()):
+            s = pd.Series(v, index=columns, name=k)
+            yield k, s
+
     def items(self) -> Iterable:
         """This is an alias of ``iteritems``."""
         return self.iteritems()
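A small usage sketch for the new method (values are illustrative; each row is materialized locally as a pandas Series via toLocalIterator):

    import databricks.koalas as ks

    kdf = ks.DataFrame({'int': [1, 2], 'float': [1.5, 2.5]})
    for idx, row in kdf.iterrows():
        print(idx, row['int'], row['float'])   # row is a pandas.Series named after the index value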
