Use name_like_string instead of str directly. (#942)
ueshin authored Oct 23, 2019
1 parent cd51cf2 commit 0df6453
Showing 7 changed files with 41 additions and 30 deletions.
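
For context: Koalas flattens a column label (a string, or a tuple for multi-level columns) into a single Spark-friendly string, and calling str() on a tuple keeps the repr quotes around each element. A quick comparison — a sketch assuming name_like_string is imported from databricks.koalas.utils as added below:

>>> str(('a', 'b', 'c'))                # old pattern: str() directly
"('a', 'b', 'c')"
>>> name_like_string(('a', 'b', 'c'))   # new helper: quote-free name
'(a, b, c)'
>>> name_like_string('abc')             # plain string labels pass through
'abc'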
12 changes: 6 additions & 6 deletions databricks/koalas/frame.py
@@ -51,7 +51,7 @@
 from databricks.koalas.internal import _InternalFrame, IndexMap, SPARK_INDEX_NAME_FORMAT
 from databricks.koalas.missing.frame import _MissingPandasLikeDataFrame
 from databricks.koalas.ml import corr
-from databricks.koalas.utils import column_index_level, scol_for
+from databricks.koalas.utils import column_index_level, name_like_string, scol_for
 from databricks.koalas.typedef import _infer_return_type, as_spark_type, as_python_type
 from databricks.koalas.plot import KoalasFramePlotMethods
 from databricks.koalas.config import get_option
@@ -419,7 +419,7 @@ def _reduce_for_stat_function(self, sfun, name, axis=None, numeric_only=False):
 assert num_args == 2
 # Pass in both the column and its data type if sfun accepts two args
 col_sdf = sfun(col_sdf, col_type)
-exprs.append(col_sdf.alias(str(idx) if len(idx) > 1 else idx[0]))
+exprs.append(col_sdf.alias(name_like_string(idx)))

 sdf = self._sdf.select(*exprs)
 pdf = sdf.toPandas()
@@ -3257,7 +3257,7 @@ def to_spark(self, index_col: Optional[Union[str, List[str]]] = None):
 # TODO: this code is similar with _InternalFrame.spark_df. Might have to deduplicate.
 for i, (column, idx) in enumerate(data_columns_column_index):
 scol = self._internal.scol_for(idx)
-name = str(i) if idx is None else str(idx) if len(idx) > 1 else idx[0]
+name = str(i) if idx is None else name_like_string(idx)
 data_column_names.append(name)
 if column != name:
 scol = scol.alias(name)
@@ -4553,7 +4553,7 @@ def columns(self, columns):
 "Length mismatch: Expected axis has %d elements, new values have %d elements"
 % (len(old_names), len(column_index)))
 column_index_names = columns.names
-data_columns = [str(idx) if len(idx) > 1 else idx[0] for idx in column_index]
+data_columns = [name_like_string(idx) for idx in column_index]
 sdf = self._sdf.select(
 self._internal.index_scols +
 [self._internal.scol_for(idx).alias(name)
@@ -4573,7 +4573,7 @@ def columns(self, columns):
 column_index_names = columns.names
 else:
 column_index_names = None
-data_columns = [str(idx) if len(idx) > 1 else idx[0] for idx in column_index]
+data_columns = [name_like_string(idx) for idx in column_index]
 sdf = self._sdf.select(
 self._internal.index_scols +
 [self._internal.scol_for(idx).alias(name)
@@ -6815,7 +6815,7 @@ def melt(self, id_vars=None, value_vars=None, var_name=None,
 [self._internal.scol_for(idx).alias(value_name)])
 ) for idx in column_index if idx in value_vars]))

-columns = ([self._internal.scol_for(idx).alias(str(idx) if len(idx) > 1 else idx[0])
+columns = ([self._internal.scol_for(idx).alias(name_like_string(idx))
 for idx in id_vars] +
 [F.col("pairs.%s" % name)
 for name in var_name[:self._internal.column_index_level]] +
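
Spark column names must be plain strings, which is why every alias for a tuple label in frame.py now goes through name_like_string. A minimal sketch of the aliasing pattern, assuming a live SparkSession and the helper imported from databricks.koalas.utils:

>>> from pyspark.sql import SparkSession, functions as F
>>> from databricks.koalas.utils import name_like_string
>>> spark = SparkSession.builder.getOrCreate()
>>> idx = ('A', '0')
>>> spark.range(1).select(F.lit(1).alias(name_like_string(idx))).columns
['(A, 0)']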
9 changes: 5 additions & 4 deletions databricks/koalas/internal.py
@@ -33,7 +33,8 @@
 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
 from databricks.koalas.config import get_option
 from databricks.koalas.typedef import infer_pd_series_spark_type, spark_type_to_pandas_dtype
-from databricks.koalas.utils import column_index_level, default_session, lazy_property, scol_for
+from databricks.koalas.utils import (column_index_level, default_session, lazy_property,
+                                     name_like_string, scol_for)


 # A function to turn given numbers to Spark columns that represent Koalas index.
@@ -616,7 +617,7 @@ def spark_internal_df(self) -> spark.DataFrame:
 for i, (column, idx) in enumerate(zip(self._data_columns, self.column_index)):
 if column not in index_columns:
 scol = self.scol_for(idx)
-name = str(i) if idx is None else str(idx) if len(idx) > 1 else idx[0]
+name = str(i) if idx is None else name_like_string(idx)
 if column != name:
 scol = scol.alias(name)
 data_columns.append(scol)
@@ -628,7 +629,7 @@ def spark_df(self) -> spark.DataFrame:
 data_columns = []
 for i, (column, idx) in enumerate(zip(self._data_columns, self.column_index)):
 scol = self.scol_for(idx)
-name = str(i) if idx is None else str(idx) if len(idx) > 1 else idx[0]
+name = str(i) if idx is None else name_like_string(idx)
 if column != name:
 scol = scol.alias(name)
 data_columns.append(scol)
@@ -651,7 +652,7 @@ def pandas_df(self):
 pdf = pdf.set_index(index_field, drop=drop, append=append)
 append = True
 pdf = pdf[[col if col in index_columns
-else str(i) if idx is None else str(idx) if len(idx) > 1 else idx[0]
+else str(i) if idx is None else name_like_string(idx)
 for i, (col, idx) in enumerate(zip(self.data_columns, self.column_index))]]

 if self.column_index_level > 1:
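
Note the `str(i) if idx is None else name_like_string(idx)` pattern above: an internal column may carry no label at all, in which case its position names it. In isolation (a sketch, with name_like_string in scope):

>>> [str(i) if idx is None else name_like_string(idx)
...  for i, idx in enumerate([('A', '0'), None, ('B',)])]
['(A, 0)', '1', 'B']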
4 changes: 2 additions & 2 deletions databricks/koalas/namespace.py
@@ -33,7 +33,7 @@

 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.
 from databricks.koalas.base import IndexOpsMixin
-from databricks.koalas.utils import default_session
+from databricks.koalas.utils import default_session, name_like_string
 from databricks.koalas.frame import DataFrame, _reduce_spark_multi
 from databricks.koalas.internal import _InternalFrame, IndexMap
 from databricks.koalas.typedef import pandas_wraps
@@ -1277,7 +1277,7 @@ def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None,
 prefix = [str(idx) if len(idx) > 1 else idx[0] for idx in column_index]

 column_index_set = set(column_index)
-remaining_columns = [kdf[idx].rename(str(idx) if len(idx) > 1 else idx[0])
+remaining_columns = [kdf[idx].rename(name_like_string(idx))
 for idx in kdf._internal.column_index
 if idx not in column_index_set]

6 changes: 3 additions & 3 deletions databricks/koalas/series.py
@@ -43,7 +43,7 @@
 from databricks.koalas.missing.series import _MissingPandasLikeSeries
 from databricks.koalas.plot import KoalasSeriesPlotMethods
 from databricks.koalas.utils import (validate_arguments_and_invoke_function, scol_for,
-                                     tuple_like_strings)
+                                     name_like_string)
 from databricks.koalas.datetimes import DatetimeMethods
 from databricks.koalas.strings import StringMethods

@@ -3665,8 +3665,8 @@ def __getitem__(self, key):
 if length == 1:
 return pdf[self.name].iloc[0]

-key_string = tuple_like_strings(key) if len(key) > 1 else key[0]
-sdf = sdf.withColumn(SPARK_INDEX_NAME_FORMAT(0), F.lit(str(key_string)))
+key_string = name_like_string(key)
+sdf = sdf.withColumn(SPARK_INDEX_NAME_FORMAT(0), F.lit(key_string))
 internal = _InternalFrame(sdf=sdf, index_map=[(SPARK_INDEX_NAME_FORMAT(0), None)])
 return _col(DataFrame(internal))

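
The str() wrapper around the F.lit argument is dropped because name_like_string always returns a str, whereas the old `tuple_like_strings(key) if len(key) > 1 else key[0]` could fall through to a raw, possibly non-string element. Illustrative, based on the helper's implementation:

>>> name_like_string((1, 2))   # elements are str()-ed before joining
'(1, 2)'
>>> name_like_string((1,))     # a single-element tuple unwraps to a string
'1'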
7 changes: 2 additions & 5 deletions databricks/koalas/testing/utils.py
@@ -29,6 +29,7 @@
 from databricks.koalas.frame import DataFrame
 from databricks.koalas.indexes import Index
 from databricks.koalas.series import Series
+from databricks.koalas.utils import name_like_string


 class PySparkTestCase(unittest.TestCase):
@@ -201,11 +202,7 @@ def assertPandasAlmostEqual(self, left, right):
 "\n\nRight:\n%s\n%s" % (right, right.dtypes))
 self.assertEqual(left.shape, right.shape, msg=msg)
 for lcol, rcol in zip(left.columns, right.columns):
-if isinstance(lcol, tuple) and isinstance(rcol, tuple):
-for l, r in zip(lcol, rcol):
-self.assertEqual(str(l), str(r), msg=msg)
-else:
-self.assertEqual(str(lcol), str(rcol), msg=msg)
+self.assertEqual(name_like_string(lcol), name_like_string(rcol), msg=msg)
 for lnull, rnull in zip(left[lcol].isnull(), right[rcol].isnull()):
 self.assertEqual(lnull, rnull, msg=msg)
 for lval, rval in zip(left[lcol].dropna(), right[rcol].dropna()):
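
This collapses the old tuple-by-tuple branch into one assertion: name_like_string maps tuple labels and plain labels onto the same string space, so both sides compare directly. For instance (illustrative — a string that already looks flattened passes through unchanged):

>>> name_like_string(('A', '0')) == name_like_string('(A, 0)')
True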
10 changes: 5 additions & 5 deletions databricks/koalas/tests/test_dataframe.py
@@ -350,7 +350,7 @@ def test_rename_columns(self):
 kdf.columns = [1, 2, 3, 4]

 # Multi-index columns
-pdf = pd.DataFrame({('A', '0'): [1, 2, 2, 3], ('B', 1): [1, 2, 3, 4]})
+pdf = pd.DataFrame({('A', '0'): [1, 2, 2, 3], ('B', '1'): [1, 2, 3, 4]})
 kdf = ks.from_pandas(pdf)

 columns = pdf.columns
@@ -368,16 +368,16 @@ def test_rename_columns(self):
 kdf.columns = columns
 self.assert_eq(kdf.columns, columns)
 self.assert_eq(kdf, pdf)
-self.assert_eq(kdf._internal.data_columns, ["('A', '0')", "('B', 1)"])
-self.assert_eq(kdf._internal.spark_df.columns, ["('A', '0')", "('B', 1)"])
+self.assert_eq(kdf._internal.data_columns, ["(A, 0)", "(B, 1)"])
+self.assert_eq(kdf._internal.spark_df.columns, ["(A, 0)", "(B, 1)"])

 columns.names = ['lvl_1', 'lvl_2']

 kdf.columns = columns
 self.assert_eq(kdf.columns.names, ['lvl_1', 'lvl_2'])
 self.assert_eq(kdf, pdf)
-self.assert_eq(kdf._internal.data_columns, ["('A', '0')", "('B', 1)"])
-self.assert_eq(kdf._internal.spark_df.columns, ["('A', '0')", "('B', 1)"])
+self.assert_eq(kdf._internal.data_columns, ["(A, 0)", "(B, 1)"])
+self.assert_eq(kdf._internal.spark_df.columns, ["(A, 0)", "(B, 1)"])

 def test_rename_dataframe(self):
 kdf1 = ks.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
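
The expected names lose their repr quotes because name_like_string joins the str() of each element rather than calling str() on the whole tuple. The same flattening applied to the test's columns, as a sketch with the helper in scope:

>>> import pandas as pd
>>> from databricks.koalas.utils import name_like_string
>>> columns = pd.MultiIndex.from_tuples([('A', '0'), ('B', '1')])
>>> [name_like_string(idx) for idx in columns]
['(A, 0)', '(B, 1)']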
23 changes: 18 additions & 5 deletions databricks/koalas/utils.py
@@ -25,6 +25,7 @@
 from pyspark.sql import functions as F
 from pyspark.sql.types import FloatType
 import pandas as pd
+from pandas.api.types import is_list_like

 from databricks import koalas as ks  # For running doctests and reference resolution in PyCharm.

@@ -360,14 +361,26 @@ def column_index_level(column_index: List[Tuple[str, ...]]) -> int:
     return list(levels)[0]


-def tuple_like_strings(items):
+def name_like_string(name: Union[str, Tuple]) -> str:
     """
-    Return the tuple-like strings from items
+    Return the name-like strings from str or tuple of str
     Examples
     --------
-    >>> items = ('a', 'b', 'c')
-    >>> tuple_like_strings(items)
+    >>> name = 'abc'
+    >>> name_like_string(name)
+    'abc'
+
+    >>> name = ('abc',)
+    >>> name_like_string(name)
+    'abc'
+
+    >>> name = ('a', 'b', 'c')
+    >>> name_like_string(name)
     '(a, b, c)'
     """
-    return '(%s)' % ', '.join(items)
+    if is_list_like(name):
+        name = tuple([str(n) for n in name])
+    else:
+        name = (str(name),)
+    return ('(%s)' % ', '.join(name)) if len(name) > 1 else name[0]
