Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement DataFrame.insert #1983

Merged
merged 25 commits into from
Jan 20, 2021
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 83 additions & 1 deletion databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
spark_type_to_pandas_dtype,
DataFrameType,
SeriesType,
Scalar,
)
from databricks.koalas.plot import KoalasPlotAccessor

Expand Down Expand Up @@ -3698,7 +3699,88 @@ def notnull(self) -> "DataFrame":

notna = notnull

# TODO: add frep and axis parameter
xinrong-meng marked this conversation as resolved.
Show resolved Hide resolved
def insert(
    self,
    loc: int,
    column,
    value: Union[Scalar, "Series", Iterable],
    allow_duplicates: bool = False,
) -> None:
    """
    Insert column into DataFrame at specified location.

    The DataFrame is modified in place and nothing is returned.
    Raises a ValueError if `column` is already contained in the DataFrame.

    Parameters
    ----------
    loc : int
        Insertion index. Must satisfy 0 <= loc <= len(columns).
    column : str, number, or hashable object
        Label of the inserted column. A tuple label must have exactly one
        element per column level.
    value : int, Series, or array-like
        Values of the inserted column.
    allow_duplicates : bool, optional
        Only ``False`` (the default) is accepted; any other value is rejected.

    Raises
    ------
    TypeError
        If `loc` is not an int.
    AssertionError
        If `loc` is outside ``[0, len(columns)]`` or `allow_duplicates` is
        not ``False``.
    ValueError
        If `column` is not a scalar or a tuple of scalars, if a tuple
        `column` does not match the number of column levels, or if `column`
        already exists.

    Examples
    --------
    >>> kdf = ks.DataFrame([1, 2, 3])
    >>> kdf.sort_index()
       0
    0  1
    1  2
    2  3
    >>> kdf.insert(0, 'x', 4)
    >>> kdf.sort_index()
       x  0
    0  4  1
    1  4  2
    2  4  3

    >>> from databricks.koalas.config import set_option, reset_option
    >>> set_option("compute.ops_on_diff_frames", True)

    >>> kdf.insert(1, 'y', [5, 6, 7])
    >>> kdf.sort_index()
       x  y  0
    0  4  5  1
    1  4  6  2
    2  4  7  3

    >>> kdf.insert(2, 'z', ks.Series([8, 9, 10]))
    >>> kdf.sort_index()
       x  y   z  0
    0  4  5   8  1
    1  4  6   9  2
    2  4  7  10  3

    >>> reset_option("compute.ops_on_diff_frames")
    """
    if not isinstance(loc, int):
        raise TypeError("loc must be int")

    # Raised explicitly (rather than via ``assert``) so the checks survive
    # ``python -O``; the exception type is kept as AssertionError because
    # existing callers and tests rely on it.
    num_columns = len(self.columns)
    if not 0 <= loc <= num_columns:
        raise AssertionError(
            "loc must satisfy 0 <= loc <= %d, got %d" % (num_columns, loc)
        )
    if allow_duplicates is not False:
        raise AssertionError("allow_duplicates other than False is not supported")

    if not is_name_like_value(column):
        raise ValueError(
            '"column" should be a scalar value or tuple that contains scalar values'
        )

    if is_name_like_tuple(column):
        # ``nlevels`` is available on flat and multi-level column indexes
        # alike, whereas ``levels`` only exists on a MultiIndex.
        if len(column) != self.columns.nlevels:
            # To be consistent with pandas
            raise ValueError('"column" must have length equal to number of column levels.')

    if column in self.columns:
        # Wrap in a 1-tuple so that a tuple label is formatted as a single
        # value instead of being unpacked as several format arguments.
        raise ValueError("cannot insert %s, already exists" % (column,))

    # Work on a copy: assign the new column (the reordering below assumes the
    # assignment places it last), move it to position `loc`, then publish the
    # result back into this frame.
    kdf = self.copy()
    kdf[column] = value
    columns = kdf.columns[:-1].insert(loc, kdf.columns[-1])
    kdf = kdf[columns]
    self._update_internal_frame(kdf._internal)

# TODO: add freq and axis parameter
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shall we revert this change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

May I ask why?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking kdf and self should share the same anchor.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean this line # TODO: add frep and axis parqameter

Copy link
Collaborator

@ueshin ueshin Jan 15, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

btw

I was thinking kdf and self should share the same anchor.

kdf could have a different anchor after assigning the value if the value is not the same anchor.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. May I get some help understanding the requires_same_anchor parameter of _update_internal_frame?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's False, the existing linked Series will see the updates even when the anchor changes; otherwise the link will be disconnected.
See #1592 for more detail.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the reference!

No matter what requires_same_anchor is, the test case doesn't seem to pass https://github.com/databricks/koalas/pull/1983/files#diff-028bf26a42786beb47e6707fe34867dc720d3279ae4c942226abc3eb40f26eeaR104.
Would you give me a clue?

def shift(self, periods=1, fill_value=None) -> "DataFrame":
"""
Shift DataFrame by desired number of periods.
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ class _MissingPandasLikeDataFrame(object):
ewm = _unsupported_function("ewm")
first = _unsupported_function("first")
infer_objects = _unsupported_function("infer_objects")
insert = _unsupported_function("insert")
interpolate = _unsupported_function("interpolate")
last = _unsupported_function("last")
lookup = _unsupported_function("lookup")
Expand Down
57 changes: 57 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,63 @@ def test_dataframe(self):
index_cols = pdf.columns[column_mask]
self.assert_eq(kdf[index_cols], pdf[index_cols])

def test_insert(self):
    """Compare ``DataFrame.insert`` against pandas for flat and multi-level columns."""

    def insert_and_compare(k_frame, p_frame, position, label, k_value, p_value):
        # Apply the same insertion to both frames and check they still agree.
        k_frame.insert(position, label, k_value)
        p_frame.insert(position, label, p_value)
        self.assert_eq(k_frame.sort_index(), p_frame.sort_index(), almost=True)

    #
    # Basic DataFrame
    #
    pdf = pd.DataFrame([1, 2, 3])
    kdf = ks.from_pandas(pdf)

    insert_and_compare(kdf, pdf, 1, "b", 10, 10)
    insert_and_compare(kdf, pdf, 2, "c", 0.1, 0.1)
    insert_and_compare(kdf, pdf, 3, "d", kdf.b + 1, pdf.b + 1)

    # Invalid calls: each one must be rejected with the pinned exception type.
    kser = ks.Series([4, 5, 6])
    with self.assertRaises(ValueError):
        kdf.insert(0, "y", kser)
    with self.assertRaisesRegex(ValueError, "cannot insert b, already exists"):
        kdf.insert(1, "b", 10)
    with self.assertRaisesRegex(
        ValueError,
        '"column" should be a scalar value or tuple that contains scalar values',
    ):
        kdf.insert(0, list("abc"), kser)
    with self.assertRaises(ValueError):
        kdf.insert(0, "e", [7, 8, 9, 10])
    with self.assertRaises(ValueError):
        kdf.insert(0, "f", ks.Series([7, 8]))
    with self.assertRaises(AssertionError):
        kdf.insert(100, "y", kser)
    with self.assertRaises(AssertionError):
        kdf.insert(1, "y", kser, allow_duplicates=True)

    #
    # DataFrame with MultiIndex as columns
    #
    pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
    kdf = ks.from_pandas(pdf)

    insert_and_compare(kdf, pdf, 1, "b", 10, 10)
    insert_and_compare(kdf, pdf, 2, "c", 0.1, 0.1)
    insert_and_compare(kdf, pdf, 3, "d", kdf.b + 1, pdf.b + 1)

    with self.assertRaisesRegex(ValueError, "cannot insert d, already exists"):
        kdf.insert(4, "d", 11)
    with self.assertRaisesRegex(
        ValueError,
        '"column" must have length equal to number of column levels.',
    ):
        kdf.insert(4, ("e",), 11)

def test_inplace(self):
pdf, kdf = self.df_pair

Expand Down
42 changes: 42 additions & 0 deletions databricks/koalas/tests/test_ops_on_diff_frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,48 @@ def test_combine_first(self):
kser1.combine_first(kser2).sort_index(), pser1.combine_first(pser2).sort_index()
)

def test_insert(self):
    """Check ``DataFrame.insert`` with a Series built separately from the target frame.

    Each scenario performs the same insertion on a pandas frame and on its
    Koalas counterpart and asserts that the sorted results are equal.
    """
    #
    # Basic DataFrame
    #
    pdf = pd.DataFrame([1, 2, 3])
    kdf = ks.from_pandas(pdf)

    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    kdf.insert(1, "y", kser)
    pdf.insert(1, "y", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())

    #
    # DataFrame with Index different from inserting Series'
    #
    pdf = pd.DataFrame([1, 2, 3], index=[10, 20, 30])
    kdf = ks.from_pandas(pdf)

    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    kdf.insert(1, "y", kser)
    pdf.insert(1, "y", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())

    #
    # DataFrame with Multi-index columns
    #
    # A two-level frame used to be built here and was overwritten before
    # any use; only the three-level frame that is actually tested remains.
    pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
    kdf = ks.from_pandas(pdf)

    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    # A scalar label and a full three-element tuple label are both exercised.
    kdf.insert(0, "a", kser)
    pdf.insert(0, "a", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())
    kdf.insert(0, ("b", "c", ""), kser)
    pdf.insert(0, ("b", "c", ""), pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())
xinrong-meng marked this conversation as resolved.
Show resolved Hide resolved

def test_compare(self):
if LooseVersion(pd.__version__) >= LooseVersion("1.1"):
pser1 = pd.Series(["b", "c", np.nan, "g", np.nan])
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ Combining / joining / merging
DataFrame.merge
DataFrame.join
DataFrame.update
DataFrame.insert

Time series-related
-------------------
Expand Down