Skip to content

Commit

Permalink
Fix Dataframe.melt function & Add doctest case for melt function (#987)
Browse files Browse the repository at this point in the history
There was no test that exercised a `str` or `tuple` value for the `value_vars` parameter of the `DataFrame.melt` function.
I updated the doctest to add such a case.
<img width="424" alt="Screen Shot 2019-10-31 at 5 36 37 PM" src="https://user-images.githubusercontent.com/29699834/67931364-0034b100-fc05-11e9-80e6-f607a66884ba.png">

I also think there is a bug, as shown in the image below.
<img width="991" alt="Screen Shot 2019-10-31 at 5 46 57 PM" src="https://user-images.githubusercontent.com/29699834/67932015-71c12f00-fc06-11e9-8021-1507dee32a69.png">

I changed some of the code.
fwani authored and ueshin committed Nov 8, 2019
1 parent 42758e6 commit c4deb88
Showing 2 changed files with 66 additions and 15 deletions.
72 changes: 57 additions & 15 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
@@ -6791,6 +6791,12 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
4 c B 5
5 c C 6
>>> df.melt(value_vars='A')
variable value
0 A a
1 A b
2 A c
>>> ks.melt(df, id_vars=['A', 'B'])
A B variable value
0 a 1 C 2
@@ -6812,28 +6818,64 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
1 b B 3
2 c B 5
"""
column_index = self._internal.column_index

if id_vars is None:
id_vars = []
elif isinstance(id_vars, str):
id_vars = [(id_vars,)]
elif isinstance(id_vars, tuple):
if self._internal.column_index_level == 1:
id_vars = [idv if isinstance(idv, tuple) else (idv,) for idv in id_vars]
else:
raise ValueError('id_vars must be a list of tuples when columns are a MultiIndex')
else:
id_vars = [idv if isinstance(idv, tuple) else (idv,) for idv in id_vars]

column_index = self._internal.column_index
if isinstance(id_vars, str):
id_vars = [(id_vars,)]
elif isinstance(id_vars, tuple):
if self._internal.column_index_level == 1:
id_vars = [idv if isinstance(idv, tuple) else (idv,)
for idv in id_vars]
else:
raise ValueError('id_vars must be a list of tuples'
' when columns are a MultiIndex')
else:
id_vars = [idv if isinstance(idv, tuple) else (idv,)
for idv in id_vars]

non_existence_col = [idv for idv in id_vars if idv not in column_index]
if len(non_existence_col) != 0:
raveled_column_index = np.ravel(column_index)
missing = [nec for nec in np.ravel(non_existence_col)
if nec not in raveled_column_index]
if len(missing) != 0:
raise KeyError("The following 'id_vars' are not present"
" in the DataFrame: {}".format(missing))
else:
raise KeyError("None of {} are in the {}"
.format(non_existence_col, column_index))

if value_vars is None:
value_vars = []
elif isinstance(value_vars, str):
value_vars = [(value_vars,)]
elif isinstance(value_vars, tuple):
value_vars = [value_vars]
else:
value_vars = [valv if isinstance(valv, tuple) else (valv,) for valv in value_vars]
if isinstance(value_vars, str):
value_vars = [(value_vars,)]
elif isinstance(value_vars, tuple):
if self._internal.column_index_level == 1:
value_vars = [valv if isinstance(valv, tuple) else (valv,)
for valv in value_vars]
else:
raise ValueError('value_vars must be a list of tuples'
' when columns are a MultiIndex')
else:
value_vars = [valv if isinstance(valv, tuple) else (valv,)
for valv in value_vars]

non_existence_col = [valv for valv in value_vars if valv not in column_index]
if len(non_existence_col) != 0:
raveled_column_index = np.ravel(column_index)
missing = [nec for nec in np.ravel(non_existence_col)
if nec not in raveled_column_index]
if len(missing) != 0:
raise KeyError("The following 'value_vars' are not present"
" in the DataFrame: {}".format(missing))
else:
raise KeyError("None of {} are in the {}"
.format(non_existence_col, column_index))

if len(value_vars) == 0:
value_vars = column_index

9 changes: 9 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -1822,6 +1822,12 @@ def test_melt(self):
pdf.melt(id_vars=['A'], value_vars=['B'],
var_name='myVarname', value_name='myValname')
.sort_values(['myVarname', 'myValname']))
self.assert_eq(kdf.melt(value_vars=('A', 'B')).sort_values(['variable', 'value'])
.reset_index(drop=True),
pdf.melt(value_vars=('A', 'B')).sort_values(['variable', 'value']))

self.assertRaises(KeyError, lambda: kdf.melt(id_vars='Z'))
self.assertRaises(KeyError, lambda: kdf.melt(value_vars='Z'))

# multi-index columns
columns = pd.MultiIndex.from_tuples([('X', 'A'), ('X', 'B'), ('Y', 'C')])
@@ -1855,6 +1861,9 @@ def test_melt(self):
pdf.melt().sort_values(['v0', 'v1', 'value']))

self.assertRaises(ValueError, lambda: kdf.melt(id_vars=('X', 'A')))
self.assertRaises(ValueError, lambda: kdf.melt(value_vars=('X', 'A')))
self.assertRaises(KeyError, lambda: kdf.melt(id_vars=[('Y', 'A')]))
self.assertRaises(KeyError, lambda: kdf.melt(value_vars=[('Y', 'A')]))

def test_all(self):
pdf = pd.DataFrame({

0 comments on commit c4deb88

Please sign in to comment.