Implement DataFrame.at_time() (#2116)

Implement DataFrame.at_time()
databricks · Mar 31, 2021 · b413231 · b413231
1 parent 6fae0cb
commit b413231
Show file tree

Hide file tree

Showing 4 changed files with 133 additions and 3 deletions.
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -3015,7 +3015,7 @@ def between_time(
         include_start: bool = True,
         include_end: bool = True,
         axis: Union[int, str] = 0,
-    ) -> Union["Series", "DataFrame"]:
+    ) -> "DataFrame":
         """
         Select values between particular times of the day (e.g., 9:00-9:30 AM).
 
@@ -3037,7 +3037,7 @@ def between_time(
 
         Returns
         -------
-        Series or DataFrame
+        DataFrame
             Data from the original object filtered to the specified dates range.
 
         Raises
@@ -3107,6 +3107,89 @@ def pandas_between_time(pdf) -> ks.DataFrame[return_types]:  # type: ignore
             )
         )
 
+    # TODO: implement axis=1
+    def at_time(
+        self, time: Union[datetime.time, str], asof: bool = False, axis: Union[int, str] = 0
+    ) -> "DataFrame":
+        """
+        Select values at particular time of day (e.g., 9:30AM).
+
+        Parameters
+        ----------
+        time : datetime.time or str
+            Time of day to select rows at.
+        asof : bool, default False
+            Not supported; a truthy value raises ``NotImplementedError``.
+        axis : {0 or 'index', 1 or 'columns'}, default 0
+            Only the index axis (0) is currently supported.
+
+        Returns
+        -------
+        DataFrame
+
+        Raises
+        ------
+        NotImplementedError
+            If `asof` is set, or if `axis` does not resolve to 0.
+        TypeError
+            If the index is not a :class:`DatetimeIndex`
+
+        See Also
+        --------
+        between_time : Select values between particular times of the day.
+        DatetimeIndex.indexer_at_time : Get just the index locations for
+            values at particular time of the day.
+
+        Examples
+        --------
+        >>> idx = pd.date_range('2018-04-09', periods=4, freq='12H')
+        >>> kdf = ks.DataFrame({'A': [1, 2, 3, 4]}, index=idx)
+        >>> kdf
+                             A
+        2018-04-09 00:00:00  1
+        2018-04-09 12:00:00  2
+        2018-04-10 00:00:00  3
+        2018-04-10 12:00:00  4
+
+        >>> kdf.at_time('12:00')
+                             A
+        2018-04-09 12:00:00  2
+        2018-04-10 12:00:00  4
+        """
+        # Imported locally rather than at module level.
+        # NOTE(review): presumably to avoid a circular import with the indexes module -- confirm.
+        from databricks.koalas.indexes import DatetimeIndex
+
+        # Reject unsupported arguments up front, before any work is done on the data.
+        if asof:
+            raise NotImplementedError("'asof' argument is not supported")
+
+        # NOTE(review): validate_axis presumably normalizes 'index'/'columns' to 0/1 -- confirm.
+        axis = validate_axis(axis)
+
+        if axis != 0:
+            raise NotImplementedError("at_time currently only works for axis=0")
+
+        if not isinstance(self.index, DatetimeIndex):
+            raise TypeError("Index must be DatetimeIndex")
+
+        # Work on a copy so that renaming the index does not touch `self`.
+        kdf = self.copy()
+        # Give the index a temporary name; reset_index() below turns the index into a column
+        # carrying this name.
+        # NOTE(review): verify_temp_column_name presumably guarantees the name does not clash
+        # with an existing column label -- confirm.
+        kdf.index.name = verify_temp_column_name(kdf, "__index_name__")
+        # Return schema of the batch function: the index dtype first (it becomes the first
+        # column after reset_index()), followed by the dtypes of the data columns.
+        return_types = [kdf.index.dtype] + list(kdf.dtypes)
+
+        # The two branches differ only in whether `axis` is forwarded to pandas.
+        # NOTE(review): pandas documents the `axis` parameter of at_time as new in 0.24,
+        # so it is omitted for older versions -- confirm against the supported pandas range.
+        if LooseVersion(pd.__version__) < LooseVersion("0.24"):
+
+            def pandas_at_time(pdf) -> ks.DataFrame[return_types]:  # type: ignore
+                return pdf.at_time(time, asof).reset_index()
+
+        else:
+
+            def pandas_at_time(pdf) -> ks.DataFrame[return_types]:  # type: ignore
+                return pdf.at_time(time, asof, axis).reset_index()
+
+        # apply_batch will remove the index of the Koalas DataFrame and attach a default index,
+        # which will never be used. So use "distributed" index as a dummy to avoid overhead.
+        with option_context("compute.default_index_type", "distributed"):
+            kdf = kdf.koalas.apply_batch(pandas_at_time)
+
+        # Rebuild the result on top of `self._internal` (not the temp-named copy): the first
+        # output column is the former index, the remaining columns are the data columns.
+        # NOTE(review): this presumably restores the original index name and column labels
+        # from `self._internal` -- confirm.
+        return DataFrame(
+            self._internal.copy(
+                spark_frame=kdf._internal.spark_frame,
+                index_spark_columns=kdf._internal.data_spark_columns[:1],
+                data_spark_columns=kdf._internal.data_spark_columns[1:],
+            )
+        )
+
     def where(self, cond, other=np.nan) -> "DataFrame":
         """
         Replace values where the condition is False.

diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -37,7 +37,6 @@ class _MissingPandasLikeDataFrame(object):
     # Functions
     asfreq = _unsupported_function("asfreq")
     asof = _unsupported_function("asof")
-    at_time = _unsupported_function("at_time")
     boxplot = _unsupported_function("boxplot")
     combine = _unsupported_function("combine")
     combine_first = _unsupported_function("combine_first")

diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -5487,3 +5487,50 @@ def test_between_time(self):
         kdf = ks.DataFrame({"A": [1, 2, 3, 4]})
         with self.assertRaisesRegex(TypeError, "Index must be DatetimeIndex"):
             kdf.between_time("0:15", "0:45")
+
+    def test_at_time(self):
+        # Compare Koalas DataFrame.at_time against pandas for several index-name /
+        # column-label combinations, then check the error paths.
+        #
+        # With freq="1D20min" starting at 2018-04-09 00:00, the four timestamps fall at
+        # 00:00, 00:20, 00:40 and 01:00 on consecutive days, so "0:20" and "0:40" each
+        # match exactly one row.
+        idx = pd.date_range("2018-04-09", periods=4, freq="1D20min")
+        pdf = pd.DataFrame({"A": [1, 2, 3, 4]}, index=idx)
+        kdf = ks.from_pandas(pdf)
+        # NOTE(review): the result of this call is discarded; it looks redundant with the
+        # assertion right below -- confirm whether it is an intentional smoke check.
+        kdf.at_time("0:20")
+        # Results are sorted by index on both sides before comparison.
+        self.assert_eq(
+            pdf.at_time("0:20").sort_index(), kdf.at_time("0:20").sort_index(),
+        )
+
+        # Index name is 'ts'
+        pdf.index.name = "ts"
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(
+            pdf.at_time("0:20").sort_index(), kdf.at_time("0:20").sort_index(),
+        )
+
+        # Index name is 'ts', column label is 'index'
+        pdf.columns = pd.Index(["index"])
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(
+            pdf.at_time("0:40").sort_index(), kdf.at_time("0:40").sort_index(),
+        )
+
+        # Both index name and column label are 'index'
+        pdf.index.name = "index"
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(
+            pdf.at_time("0:40").sort_index(), kdf.at_time("0:40").sort_index(),
+        )
+
+        # Index name is 'index', column label is ('X', 'A')
+        pdf.columns = pd.MultiIndex.from_arrays([["X"], ["A"]])
+        kdf = ks.from_pandas(pdf)
+        self.assert_eq(
+            pdf.at_time("0:40").sort_index(), kdf.at_time("0:40").sort_index(),
+        )
+
+        # Unsupported arguments must raise NotImplementedError with the documented messages.
+        with self.assertRaisesRegex(NotImplementedError, "'asof' argument is not supported"):
+            kdf.at_time("0:15", asof=True)
+
+        with self.assertRaisesRegex(NotImplementedError, "at_time currently only works for axis=0"):
+            kdf.at_time("0:15", axis=1)
+
+        # A frame built without an explicit index has no DatetimeIndex, so at_time must
+        # raise TypeError.
+        kdf = ks.DataFrame({"A": [1, 2, 3, 4]})
+        with self.assertRaisesRegex(TypeError, "Index must be DatetimeIndex"):
+            kdf.at_time("0:15")
diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -164,6 +164,7 @@ Reindexing / Selection / Label manipulation
    DataFrame.add_prefix
    DataFrame.add_suffix
    DataFrame.align
+   DataFrame.at_time
    DataFrame.between_time
    DataFrame.drop
    DataFrame.droplevel