Skip to content

Commit

Permalink
Implement DataFrame.itertuples (#1960)
Browse files Browse the repository at this point in the history
ref #1929

```
        >>> df = ks.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
        ...                   index=['dog', 'hawk'])
        >>> df
              num_legs  num_wings
        dog          4          0
        hawk         2          2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Koalas(Index='dog', num_legs=4, num_wings=0)
        Koalas(Index='hawk', num_legs=2, num_wings=2)
```
  • Loading branch information
xinrong-meng authored Dec 10, 2020
1 parent 2c23b2a commit 02133a8
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 2 deletions.
99 changes: 98 additions & 1 deletion databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"""
A wrapper class for Spark DataFrame to behave similar to pandas DataFrame.
"""
from collections import OrderedDict, defaultdict
from collections import OrderedDict, defaultdict, namedtuple
from collections.abc import Mapping
from distutils.version import LooseVersion
import re
Expand Down Expand Up @@ -1436,6 +1436,103 @@ def extract_kv_from_spark_row(row):
s = pd.Series(v, index=columns, name=k)
yield k, s

def itertuples(self, index: bool = True, name: Optional[str] = "Koalas") -> Iterator:
    """
    Iterate over DataFrame rows as namedtuples.

    Parameters
    ----------
    index : bool, default True
        If True, return the index as the first element of the tuple.
    name : str or None, default "Koalas"
        The name of the returned namedtuples or None to return regular
        tuples.

    Returns
    -------
    iterator
        An object to iterate over namedtuples for each row in the
        DataFrame with the first field possibly being the index and
        following fields being the column values.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
        pairs.
    DataFrame.items : Iterate over (column name, Series) pairs.

    Notes
    -----
    The column names will be renamed to positional names if they are
    invalid Python identifiers, repeated, or start with an underscore.
    On python versions < 3.7 regular tuples are returned for DataFrames
    with a large number of columns (>254).

    Examples
    --------
    >>> df = ks.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
    ...                   index=['dog', 'hawk'])
    >>> df
          num_legs  num_wings
    dog          4          0
    hawk         2          2
    >>> for row in df.itertuples():
    ...     print(row)
    ...
    Koalas(Index='dog', num_legs=4, num_wings=0)
    Koalas(Index='hawk', num_legs=2, num_wings=2)

    By setting the `index` parameter to False we can remove the index
    as the first element of the tuple:

    >>> for row in df.itertuples(index=False):
    ...     print(row)
    ...
    Koalas(num_legs=4, num_wings=0)
    Koalas(num_legs=2, num_wings=2)

    With the `name` parameter set we set a custom name for the yielded
    namedtuples:

    >>> for row in df.itertuples(name='Animal'):
    ...     print(row)
    ...
    Animal(Index='dog', num_legs=4, num_wings=0)
    Animal(Index='hawk', num_legs=2, num_wings=2)
    """
    # Field names of the namedtuple: the column labels, optionally preceded by "Index".
    field_names = list(self.columns)
    if index:
        field_names = ["Index"] + field_names

    internal = self._internal
    index_names = internal.index_spark_column_names
    data_names = internal.data_spark_column_names

    def row_to_values(row):
        # A single index column yields a scalar key; several yield a tuple key.
        # The key is looked up even when it is not emitted, as the lookup may raise.
        if len(index_names) == 1:
            key = row[index_names[0]]
        else:
            key = tuple(row[col] for col in index_names)
        values = [row[col] for col in data_names]
        return ([key] if index else []) + values

    # Before Python 3.7 a namedtuple could not be built with 255 or more fields.
    namedtuples_supported = sys.version_info >= (3, 7) or len(self.columns) + index < 255

    if name is None or not namedtuples_supported:
        make_row = tuple
    else:
        # rename=True replaces invalid or duplicated field names with positional ones.
        make_row = namedtuple(name, field_names, rename=True)._make  # type: ignore

    for spark_row in self._internal.resolved_copy.spark_frame.toLocalIterator():
        yield make_row(row_to_values(spark_row))

def items(self) -> Iterator:
    """
    Iterate over (column name, Series) pairs.

    This is an alias of ``iteritems``.
    """
    pairs = self.iteritems()
    return pairs
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@ class _MissingPandasLikeDataFrame(object):
infer_objects = _unsupported_function("infer_objects")
insert = _unsupported_function("insert")
interpolate = _unsupported_function("interpolate")
itertuples = _unsupported_function("itertuples")
last = _unsupported_function("last")
lookup = _unsupported_function("lookup")
mode = _unsupported_function("mode")
Expand Down
38 changes: 38 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,44 @@ def test_dataframe_multiindex_names_level(self):
self.assert_eq(kdf[("X", "A")].to_pandas().columns.names, pdf[("X", "A")].columns.names)
self.assert_eq(kdf[("X", "A", "Z")], pdf[("X", "A", "Z")])

def test_itertuples(self):
    """Koalas ``itertuples`` must yield the same tuples as pandas ``itertuples``."""

    def check_same_tuples(pobj, kobj, **kwargs):
        # Walk both iterators in lockstep and compare every yielded tuple.
        for expected, actual in zip(pobj.itertuples(**kwargs), kobj.itertuples(**kwargs)):
            self.assert_eq(expected, actual)

    # Plain frame with a single-level string index.
    pdf = pd.DataFrame({"num_legs": [4, 2], "num_wings": [0, 2]}, index=["dog", "hawk"])
    kdf = ks.from_pandas(pdf)
    check_same_tuples(pdf, kdf, index=False, name="Animal")
    check_same_tuples(pdf, kdf, name=None)

    # Multi-level row index.
    pdf.index = pd.MultiIndex.from_arrays(
        [[1, 2], ["black", "brown"]], names=("count", "color")
    )
    kdf = ks.from_pandas(pdf)
    check_same_tuples(pdf, kdf, name="Animal")

    # Multi-level column labels on top of the multi-level row index.
    pdf.columns = pd.MultiIndex.from_arrays(
        [["CA", "WA"], ["age", "children"]], names=("origin", "info")
    )
    kdf = ks.from_pandas(pdf)
    check_same_tuples(pdf, kdf, name="Animal")

    # Frame built from a flat list, iterated after an arithmetic operation.
    pdf = pd.DataFrame([1, 2, 3])
    kdf = ks.from_pandas(pdf)
    check_same_tuples(pdf + 1, kdf + 1, name="num")

    # DataFrames with a large number of columns (>254)
    pdf = pd.DataFrame(np.random.random((1, 255)))
    kdf = ks.from_pandas(pdf)
    check_same_tuples(pdf, kdf, name="num")

def test_iterrows(self):
pdf = pd.DataFrame(
{
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ Indexing, iteration
DataFrame.items
DataFrame.iteritems
DataFrame.iterrows
DataFrame.itertuples
DataFrame.keys
DataFrame.pop
DataFrame.tail
Expand Down

0 comments on commit 02133a8

Please sign in to comment.