Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Index.map functionality #2136

Closed
wants to merge 33 commits into from
Closed
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
5ed61f2
Initial Index.map impl
awdavidson Apr 1, 2021
e2ac4f0
Initial Index.map impl
awdavidson Apr 1, 2021
ae9c3f3
Reformat
awdavidson Apr 1, 2021
1a55284
Add pd.Series compatibility
awdavidson Apr 2, 2021
13706cc
Avoid collects
awdavidson Apr 2, 2021
f48da2e
Update impl
awdavidson Apr 4, 2021
1f794f7
Clean up impl and add docs
awdavidson Apr 5, 2021
18e37c0
reformat
awdavidson Apr 5, 2021
3499aa0
reformat
awdavidson Apr 5, 2021
a949400
reformat
awdavidson Apr 5, 2021
af72e24
Reformat
awdavidson Apr 6, 2021
97e03d3
Reformat
awdavidson Apr 6, 2021
7c1c678
Fix comment
awdavidson Apr 6, 2021
ced7d97
Remove unused import
awdavidson Apr 6, 2021
694650a
Update
awdavidson Apr 6, 2021
f845001
Merge branch 'master' of github.com:databricks/koalas into feature/im…
awdavidson Apr 7, 2021
136bd4c
Add categorical mapping
awdavidson Apr 10, 2021
06b250a
Reformat
awdavidson Apr 11, 2021
0c74b95
Remove print statement
awdavidson Apr 11, 2021
0bd6b3e
Remove unused import
awdavidson Apr 11, 2021
e52a4ca
Final tweaks
awdavidson Apr 11, 2021
289b573
Remove unused import
awdavidson Apr 11, 2021
f0697ee
minor cast tweaks
awdavidson Apr 11, 2021
8d206d9
Reformat
awdavidson Apr 11, 2021
ea8fc7f
Fix docstring
awdavidson Apr 12, 2021
7f78833
Fix docstring
awdavidson Apr 12, 2021
a6cd83e
Fix docstring
awdavidson Apr 12, 2021
b57224d
Fix docstring
awdavidson Apr 12, 2021
fe338c8
Fix docstring
awdavidson Apr 12, 2021
491a57a
reformat
awdavidson Apr 12, 2021
3568d0a
reformat
awdavidson Apr 12, 2021
e7957bd
fix docstring
awdavidson Apr 12, 2021
751e77d
Fix docstring
awdavidson Apr 12, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion databricks/koalas/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#

from functools import partial
from typing import Any, List, Optional, Tuple, Union
from typing import Any, Callable, List, Optional, Tuple, Union
import warnings

import pandas as pd
Expand Down Expand Up @@ -507,6 +507,29 @@ def to_numpy(self, dtype=None, copy=False) -> np.ndarray:
result = result.copy()
return result

def map(
    self,
    mapper: Union[dict, Callable[[Any], Any], pd.Series],
    return_type: ks.typedef.Dtype = str,
    na_action: Any = None,
) -> "Index":
    """
    Map values using an input mapping or function.

    Parameters
    ----------
    mapper : dict, function, or pd.Series
        Mapping correspondence. Keys (or the pd.Series index) are matched
        against the current index values; unmatched values fall back to the
        original value cast to ``return_type``.  A ks.Series mapper is not
        yet supported and raises ``NotImplementedError``.
    return_type : Dtype, default str
        Data type of the mapped values.  This is required because the
        transformation is executed by a pandas UDF, whose Spark return type
        must be declared up front.
        NOTE(review): the default ``str`` is not strictly a ``Dtype``
        (``Dtype = Union[np.dtype, ExtensionDtype]``) — kept for
        backward compatibility; confirm before tightening the annotation.
    na_action : {None, 'ignore'}, default None
        Only ``None`` is currently supported; any other value raises
        ``NotImplementedError``.

    Returns
    -------
    Index
        Index with values transformed by ``mapper``.

    Examples
    --------
    >>> kidx = ks.Index([1, 2, 3])
    >>> kidx.map({1: "one", 2: "two", 3: "three"})
    Index(['one', 'two', 'three'], dtype='object')
    """
    # Imported locally to avoid a circular import between indexes.base
    # and indexes.extension.
    from databricks.koalas.indexes.extension import MapExtension

    return MapExtension(index=self, na_action=na_action).map(mapper, return_type)

@property
def values(self) -> np.ndarray:
"""
Expand Down
144 changes: 144 additions & 0 deletions databricks/koalas/indexes/extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#
# Copyright (C) 2019 Databricks, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Any, Callable, Union

import pandas as pd

from pyspark.sql.functions import pandas_udf, PandasUDFType

import databricks.koalas as ks
from databricks.koalas.indexes.base import Index
from databricks.koalas.internal import SPARK_DEFAULT_INDEX_NAME
from databricks.koalas.typedef.typehints import Dtype, as_spark_type


# TODO: Implement na_action functionality similar to pandas.
# NB: Passing return_type into the class causes serialization errors;
# instead it is passed at method level.
class MapExtension:
    """
    Executes ``Index.map`` by translating the given mapper into a pandas UDF,
    so the transformation runs on executors without collecting to the driver.

    Parameters
    ----------
    index : Index
        The Koalas Index whose values are to be mapped.
    na_action : Any
        Must be ``None``; anything else raises ``NotImplementedError``.
    """

    def __init__(self, index, na_action: Any):
        self._index = index
        if na_action is not None:
            raise NotImplementedError("Currently do not support na_action functionality")
        else:
            self._na_action = na_action

    def map(
        self, mapper: Union[dict, Callable[[Any], Any], pd.Series], return_type: Dtype
    ) -> Index:
        """
        Single callable/entry point to map Index values.

        Dispatches on the mapper's type to the matching private helper.

        Parameters
        ----------
        mapper: dict, function or pd.Series
        return_type: Dtype

        Returns
        -------
        ks.Index
        """
        if isinstance(mapper, dict):
            idx = self._map_dict(mapper, return_type)
        elif isinstance(mapper, pd.Series):
            idx = self._map_series(mapper, return_type)
        elif isinstance(mapper, ks.Series):
            raise NotImplementedError("Currently do not support input of ks.Series in Index.map")
        else:
            idx = self._map_lambda(mapper, return_type)
        return idx

    def _map_dict(self, mapper: dict, return_type: Dtype) -> Index:
        """
        Map an Index when the mapper argument is a dict.

        Parameters
        ----------
        mapper: dict
            Key-value pairs that are used to instruct mapping from index value to new value
        return_type: Dtype
            Data type of returned value

        Returns
        -------
        ks.Index

        .. note:: Default return value for missing elements is the index's original value

        """

        @pandas_udf(as_spark_type(return_type), PandasUDFType.SCALAR)
        def pyspark_mapper(col):
            # Missing keys fall back to the original value cast to return_type.
            return col.apply(lambda i: mapper.get(i, return_type(i)))

        return self._index._with_new_scol(pyspark_mapper(SPARK_DEFAULT_INDEX_NAME))

    def _map_series(self, mapper: pd.Series, return_type: Dtype) -> Index:
        """
        Map an Index when the mapper argument is a pandas.Series.

        Parameters
        ----------
        mapper: pandas.Series
            Series of (index, value) that is used to instruct mapping from index value to new value
        return_type: Dtype
            Data type of returned value

        Returns
        -------
        ks.Index

        .. note:: Default return value for missing elements is the index's original value

        """
        # TODO: clean up, maybe move somewhere else
        def _get_or_else(i):
            try:
                return mapper.loc[i]
            except KeyError:
                # Label absent from the mapper: keep the original value,
                # cast to the declared return type.
                return return_type(i)

        @pandas_udf(as_spark_type(return_type), PandasUDFType.SCALAR)
        def pyspark_mapper(col):
            return col.apply(_get_or_else)

        return self._index._with_new_scol(pyspark_mapper(SPARK_DEFAULT_INDEX_NAME))

    def _map_lambda(self, mapper: Callable[[Any], Any], return_type: Dtype) -> Index:
        """
        Map an Index when the mapper argument is a generic callable.

        Parameters
        ----------
        mapper: Callable[[Any], Any]
            Generic lambda function that is applied to index
        return_type: Dtype
            Data type of returned value

        Returns
        -------
        ks.Index

        """

        @pandas_udf(as_spark_type(return_type), PandasUDFType.SCALAR)
        def pyspark_mapper(col):
            return col.apply(mapper)

        return self._index._with_new_scol(pyspark_mapper(SPARK_DEFAULT_INDEX_NAME))
3 changes: 0 additions & 3 deletions databricks/koalas/missing/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ class MissingPandasLikeIndex(object):
is_ = _unsupported_function("is_")
is_lexsorted_for_tuple = _unsupported_function("is_lexsorted_for_tuple")
join = _unsupported_function("join")
map = _unsupported_function("map")
putmask = _unsupported_function("putmask")
ravel = _unsupported_function("ravel")
reindex = _unsupported_function("reindex")
Expand Down Expand Up @@ -132,7 +131,6 @@ class MissingPandasLikeCategoricalIndex(MissingPandasLikeIndex):
set_categories = _unsupported_function("set_categories", cls="CategoricalIndex")
as_ordered = _unsupported_function("as_ordered", cls="CategoricalIndex")
as_unordered = _unsupported_function("as_unordered", cls="CategoricalIndex")
map = _unsupported_function("map", cls="CategoricalIndex")


class MissingPandasLikeMultiIndex(object):
Expand Down Expand Up @@ -161,7 +159,6 @@ class MissingPandasLikeMultiIndex(object):
is_lexsorted = _unsupported_function("is_lexsorted")
is_lexsorted_for_tuple = _unsupported_function("is_lexsorted_for_tuple")
join = _unsupported_function("join")
map = _unsupported_function("map")
putmask = _unsupported_function("putmask")
ravel = _unsupported_function("ravel")
reindex = _unsupported_function("reindex")
Expand Down
41 changes: 41 additions & 0 deletions databricks/koalas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
import pyspark

import databricks.koalas as ks
Expand Down Expand Up @@ -65,6 +66,46 @@ def test_index_basic(self):
self.assert_eq(kdf.index, pdf.index)
self.assert_eq(type(kdf.index).__name__, type(pdf.index).__name__)

def test_map(self):
    """Exercise Index.map with pd.Series, dict, and callable mappers.

    Missing keys/labels are expected to fall back to the original index
    value (stringified by the default return_type=str).
    """
    kser = ks.Series([1, 2, 3], index=[1, 2, 3])

    # A ks.Series mapper is explicitly unsupported and must raise.
    with self.assertRaisesRegex(
        NotImplementedError, "Currently do not support input of ks.Series in Index.map"
    ):
        kser.index.map(ks.Series(["one", "two", "three"], index=[1, 2, 3]))

    # Apply series
    self.assert_eq(
        kser.index.map(pd.Series(["one", "2", "three"], index=[1, 2, 3])),
        ks.Index(["one", "2", "three"]),
    )
    # Partial mapper: index value 3 has no label, so it stays as "3".
    self.assert_eq(
        kser.index.map(pd.Series(["one", "2"], index=[1, 2])), ks.Index(["one", "2", "3"]),
    )

    # Apply dict
    self.assert_eq(
        kser.index.map({1: "one", 2: "two", 3: "three"}), ks.Index(["one", "two", "three"])
    )
    self.assert_eq(kser.index.map({1: "one", 2: "two"}), ks.Index(["one", "two", "3"]))
    # Non-default return_type: unmatched value 3 is cast via int().
    self.assert_eq(kser.index.map({1: 10, 2: 20}, return_type=int), ks.Index([10, 20, 3]))

    # Apply lambda
    # NOTE: the lambda parameter shadows the builtin `id` (harmless here).
    self.assert_eq(kser.index.map(lambda id: id + 1, return_type=int), ks.Index([2, 3, 4]))
    self.assert_eq(
        kser.index.map(lambda id: id + 1.1, return_type=float), ks.Index([2.1, 3.1, 4.1])
    )
    self.assert_eq(
        kser.index.map(lambda id: "{id} + 1".format(id=id), str),
        ks.Index(["1 + 1", "2 + 1", "3 + 1"]),
    )

    # Datetime index: shifting every value by one day via DateOffset.
    kser = ks.Series([1, 2, 3, 4], index=pd.date_range("2018-04-09", periods=4, freq="2D"))
    self.assert_eq(
        kser.index.map(lambda id: id + DateOffset(days=1), return_type=datetime),
        ks.Series([1, 2, 3, 4], index=pd.date_range("2018-04-10", periods=4, freq="2D")).index,
    )
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you also add tests with CategoricalIndex?
You can put the tests in test_categorical.py.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ueshin I am looking at adding tests to CategoricalIndex but have some unexpected behaviour. I wonder whether you can help explain.

The current implementation for Index.map leverages _with_new_scol to avoid collect anything to the driver. When you have a CategoricalIndex such as ks.CategoricalIndex(["a", "b", "c"]) the returned spark_frame is

+-----------------+-----------------+
|__index_level_0__|__natural_order__|
+-----------------+-----------------+
|                0|                0|
|                1|       8589934592|
|                2|      17179869184|
+-----------------+-----------------+

I was expecting

+-----------------+-----------------+
|__index_level_0__|__natural_order__|
+-----------------+-----------------+
|                a|                0|
|                b|       8589934592|
|                c|      17179869184|
+-----------------+-----------------+

Is my expectation incorrect?

This seems to be caused by https://github.com/databricks/koalas/blob/master/databricks/koalas/frame.py#L510

If you have a pdf = pd.DataFrame(index=pd.CategoricalIndex(["a", "b", "c"])) InternalFrame.from_pandas(pdf).spark_frame returns

+-----------------+-----------------+
|__index_level_0__|__natural_order__|
+-----------------+-----------------+
|                0|                0|
|                1|       8589934592|
|                2|      17179869184|
+-----------------+-----------------+

Where as if you have pdf = pd.DataFrame(index=pd.Index(["a", "b", "c"]) InternalFrame.from_pandas(pdf).spark_frame returns

+-----------------+-----------------+
|__index_level_0__|__natural_order__|
+-----------------+-----------------+
|                a|                0|
|                b|       8589934592|
|                c|      17179869184|
+-----------------+-----------------+

Apologies if this is a silly question. Thanks in advance

Copy link
Collaborator

@ueshin ueshin Apr 8, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The behavior is expected: so far Koalas manages only a Categorical's codes in Spark, while its categories live in the metadata (index_dtypes or data_dtypes).


def test_index_from_series(self):
pser = pd.Series([1, 2, 3], name="a", index=[10, 20, 30])
kser = ks.from_pandas(pser)
Expand Down
89 changes: 89 additions & 0 deletions databricks/koalas/tests/indexes/test_extension.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#
# Copyright (C) 2019 Databricks, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from datetime import datetime

import pandas as pd
from pandas.tseries.offsets import DateOffset

import databricks.koalas as ks
from databricks.koalas.indexes.extension import MapExtension
from databricks.koalas.testing.utils import ReusedSQLTestCase, TestUtils


class MapExtensionTest(ReusedSQLTestCase, TestUtils):
    """Unit tests for MapExtension's dispatch and per-mapper-type helpers."""

    @property
    def kidx(self):
        # Fresh small integer Index for each test.
        return ks.Index([1, 2, 3])

    def test_na_action(self):
        # Any non-None na_action is rejected at construction time.
        with self.assertRaisesRegex(
            NotImplementedError, "Currently do not support na_action functionality"
        ):
            MapExtension(self.kidx, "ignore")

    def test_map_dict(self):
        self.assert_eq(
            MapExtension(self.kidx, None)._map_dict({1: "one", 2: "two", 3: "three"}, str),
            ks.Index(["one", "two", "three"]),
        )
        # Missing key 3 falls back to the original value cast to str.
        self.assert_eq(
            MapExtension(self.kidx, None)._map_dict({1: "one", 2: "two"}, str),
            ks.Index(["one", "two", "3"]),
        )
        # Missing key with int return_type keeps 3 as an int.
        self.assert_eq(
            MapExtension(self.kidx, None)._map_dict({1: 10, 2: 20}, int), ks.Index([10, 20, 3])
        )

    def test_map_series(self):
        self.assert_eq(
            MapExtension(self.kidx, None)._map_series(
                pd.Series(["one", "2", "three"], index=[1, 2, 3]), str
            ),
            ks.Index(["one", "2", "three"]),
        )
        # Label 3 absent from the mapper Series -> original value as "3".
        self.assert_eq(
            MapExtension(self.kidx, None)._map_series(pd.Series(["one", "2"], index=[1, 2]), str),
            ks.Index(["one", "2", "3"]),
        )

    def test_map_lambda(self):
        # NOTE: the lambda parameter shadows the builtin `id` (harmless here).
        self.assert_eq(
            MapExtension(self.kidx, None)._map_lambda(lambda id: id + 1, int), ks.Index([2, 3, 4])
        )
        self.assert_eq(
            MapExtension(self.kidx, None)._map_lambda(lambda id: id + 1.1, float),
            ks.Index([2.1, 3.1, 4.1]),
        )
        self.assert_eq(
            MapExtension(self.kidx, None)._map_lambda(lambda id: "{id} + 1".format(id=id), str),
            ks.Index(["1 + 1", "2 + 1", "3 + 1"]),
        )
        # Datetime index shifted by one day via DateOffset.
        kser = ks.Series([1, 2, 3, 4], index=pd.date_range("2018-04-09", periods=4, freq="2D"))
        self.assert_eq(
            MapExtension(kser.index, None)._map_lambda(
                lambda id: id + DateOffset(days=1), datetime
            ),
            ks.Series([1, 2, 3, 4], index=pd.date_range("2018-04-10", periods=4, freq="2D")).index,
        )

    def test_map(self):
        # The public dispatcher rejects ks.Series mappers.
        with self.assertRaisesRegex(
            NotImplementedError, "Currently do not support input of ks.Series in Index.map"
        ):
            MapExtension(self.kidx, None).map(
                ks.Series(["one", "two", "three"], index=[1, 2, 3]), str
            )