Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement DataFrame.insert #1983

Merged
merged 25 commits into from
Jan 20, 2021
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 82 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
spark_type_to_pandas_dtype,
DataFrameType,
SeriesType,
Scalar,
)
from databricks.koalas.plot import KoalasPlotAccessor

Expand Down Expand Up @@ -3698,6 +3699,87 @@ def notnull(self) -> "DataFrame":

notna = notnull

def insert(
    self,
    loc: int,
    column,
    value: Union[Scalar, "Series", Iterable],
    allow_duplicates: bool = False,
) -> None:
    """
    Insert column into DataFrame at specified location.

    The DataFrame is modified in place. Raises a ValueError if `column` is
    already contained in the DataFrame.

    Parameters
    ----------
    loc : int
        Insertion index. Must verify 0 <= loc <= len(columns).
    column : str, number, or hashable object
        Label of the inserted column. When given as a tuple, its length must
        equal the number of column levels.
    value : int, Series, or array-like
    allow_duplicates : bool, optional
        Only ``False`` is accepted; any other value fails an assertion.

    Raises
    ------
    TypeError
        If `loc` is not an int.
    AssertionError
        If `loc` is outside ``0 <= loc <= len(columns)`` or
        `allow_duplicates` is not ``False``.
    ValueError
        If `column` is not a valid label, is a tuple whose length differs
        from the number of column levels, or already exists.

    Examples
    --------
    >>> kdf = ks.DataFrame([1, 2, 3])
    >>> kdf.sort_index()
       0
    0  1
    1  2
    2  3
    >>> kdf.insert(0, 'x', 4)
    >>> kdf.sort_index()
       x  0
    0  4  1
    1  4  2
    2  4  3

    >>> from databricks.koalas.config import set_option, reset_option
    >>> set_option("compute.ops_on_diff_frames", True)

    >>> kdf.insert(1, 'y', [5, 6, 7])
    >>> kdf.sort_index()
       x  y  0
    0  4  5  1
    1  4  6  2
    2  4  7  3

    >>> kdf.insert(2, 'z', ks.Series([8, 9, 10]))
    >>> kdf.sort_index()
       x  y   z  0
    0  4  5   8  1
    1  4  6   9  2
    2  4  7  10  3

    >>> reset_option("compute.ops_on_diff_frames")
    """
    if not isinstance(loc, int):
        raise TypeError("loc must be int")

    # Kept as assertions on purpose: the accompanying tests expect
    # AssertionError for an out-of-range `loc` and for allow_duplicates=True.
    assert 0 <= loc <= len(self.columns)
    assert allow_duplicates is False

    if not is_name_like_value(column):
        raise ValueError(
            '"column" should be a scalar value or tuple that contains scalar values'
        )

    if is_name_like_tuple(column) and len(column) != len(self.columns.levels):
        # To be consistent with pandas
        raise ValueError('"column" must have length equal to number of column levels.')

    if column in self.columns:
        # Wrap the label in a 1-tuple: a bare tuple label on the right-hand
        # side of `%` would be unpacked as several format arguments and raise
        # TypeError instead of this ValueError.
        raise ValueError("cannot insert %s, already exists" % (column,))

    # Work on a copy: append the new column at the end, then move it to `loc`
    # by reordering the column index, and finally swap the result into `self`.
    kdf = self.copy()
    kdf[column] = value
    columns = kdf.columns[:-1].insert(loc, kdf.columns[-1])
    kdf = kdf[columns]
    self._update_internal_frame(kdf._internal, requires_same_anchor=False)

# TODO: add freq and axis parameter
xinrong-meng marked this conversation as resolved.
Show resolved Hide resolved
def shift(self, periods=1, fill_value=None) -> "DataFrame":
"""
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ class _MissingPandasLikeDataFrame(object):
ewm = _unsupported_function("ewm")
first = _unsupported_function("first")
infer_objects = _unsupported_function("infer_objects")
insert = _unsupported_function("insert")
interpolate = _unsupported_function("interpolate")
last = _unsupported_function("last")
lookup = _unsupported_function("lookup")
Expand Down
62 changes: 62 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,68 @@ def test_dataframe(self):
index_cols = pdf.columns[column_mask]
self.assert_eq(kdf[index_cols], pdf[index_cols])

def test_insert(self):
    """Compare DataFrame.insert with pandas for flat and MultiIndex columns."""
    #
    # Basic DataFrame
    #
    pdf = pd.DataFrame([1, 2, 3])
    kdf = ks.from_pandas(pdf)

    kdf.insert(1, "b", 10)
    pdf.insert(1, "b", 10)
    self.assert_eq(kdf.sort_index(), pdf.sort_index(), almost=True)

    # The inserted column must stay usable (read and reassigned) afterwards.
    kdf.b = kdf.b + 1
    pdf.b = pdf.b + 1
    self.assert_eq(kdf.sort_index(), pdf.sort_index(), almost=True)

    kdf.insert(2, "c", 0.1)
    pdf.insert(2, "c", 0.1)
    self.assert_eq(kdf.sort_index(), pdf.sort_index(), almost=True)
    kdf.insert(3, "d", kdf.b + 1)
    pdf.insert(3, "d", pdf.b + 1)
    self.assert_eq(kdf.sort_index(), pdf.sort_index(), almost=True)

    # Error cases: a Series from a different frame, a duplicate label, an
    # invalid label, mismatched lengths, an out-of-range loc and
    # allow_duplicates=True.
    kser = ks.Series([4, 5, 6])
    self.assertRaises(ValueError, lambda: kdf.insert(0, "y", kser))
    self.assertRaisesRegex(
        ValueError, "cannot insert b, already exists", lambda: kdf.insert(1, "b", 10)
    )
    self.assertRaisesRegex(
        ValueError,
        '"column" should be a scalar value or tuple that contains scalar values',
        lambda: kdf.insert(0, list("abc"), kser),
    )
    self.assertRaises(ValueError, lambda: kdf.insert(0, "e", [7, 8, 9, 10]))
    self.assertRaises(ValueError, lambda: kdf.insert(0, "f", ks.Series([7, 8])))
    self.assertRaises(AssertionError, lambda: kdf.insert(100, "y", kser))
    self.assertRaises(AssertionError, lambda: kdf.insert(1, "y", kser, allow_duplicates=True))

    #
    # DataFrame with MultiIndex as columns
    #
    pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
    kdf = ks.from_pandas(pdf)

    kdf.insert(1, "b", 10)
    pdf.insert(1, "b", 10)
    self.assert_eq(kdf.sort_index(), pdf.sort_index(), almost=True)
    kdf.insert(2, "c", 0.1)
    pdf.insert(2, "c", 0.1)
    self.assert_eq(kdf.sort_index(), pdf.sort_index(), almost=True)
    kdf.insert(3, "d", kdf.b + 1)
    pdf.insert(3, "d", pdf.b + 1)
    self.assert_eq(kdf.sort_index(), pdf.sort_index(), almost=True)

    self.assertRaisesRegex(
        ValueError, "cannot insert d, already exists", lambda: kdf.insert(4, "d", 11)
    )
    # A tuple label must have one entry per column level (three here).
    self.assertRaisesRegex(
        ValueError,
        '"column" must have length equal to number of column levels.',
        lambda: kdf.insert(4, ("e",), 11),
    )

def test_inplace(self):
pdf, kdf = self.df_pair

Expand Down
42 changes: 42 additions & 0 deletions databricks/koalas/tests/test_ops_on_diff_frames.py
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,48 @@ def test_combine_first(self):
kser1.combine_first(kser2).sort_index(), pser1.combine_first(pser2).sort_index()
)

def test_insert(self):
    """Compare DataFrame.insert with pandas when the value is a separate Series."""
    #
    # Basic DataFrame
    #
    pdf = pd.DataFrame([1, 2, 3])
    kdf = ks.from_pandas(pdf)

    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    kdf.insert(1, "y", kser)
    pdf.insert(1, "y", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())

    #
    # DataFrame with Index different from inserting Series'
    #
    pdf = pd.DataFrame([1, 2, 3], index=[10, 20, 30])
    kdf = ks.from_pandas(pdf)

    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    kdf.insert(1, "y", kser)
    pdf.insert(1, "y", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())

    #
    # DataFrame with Multi-index columns
    #
    # Only the three-level frame is exercised; an unused two-level fixture
    # that was overwritten before any use has been dropped.
    pser = pd.Series([4, 5, 6])
    kser = ks.from_pandas(pser)
    pdf = pd.DataFrame({("x", "a", "b"): [1, 2, 3]})
    kdf = ks.from_pandas(pdf)
    # Scalar label on multi-level columns.
    kdf.insert(0, "a", kser)
    pdf.insert(0, "a", pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())
    # Full-length tuple label (one entry per column level).
    kdf.insert(0, ("b", "c", ""), kser)
    pdf.insert(0, ("b", "c", ""), pser)
    self.assert_eq(kdf.sort_index(), pdf.sort_index())
xinrong-meng marked this conversation as resolved.
Show resolved Hide resolved

def test_compare(self):
if LooseVersion(pd.__version__) >= LooseVersion("1.1"):
pser1 = pd.Series(["b", "c", np.nan, "g", np.nan])
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ Combining / joining / merging
DataFrame.merge
DataFrame.join
DataFrame.update
DataFrame.insert

Time series-related
-------------------
Expand Down