diff --git a/docs/cudf/source/api_docs/series.rst b/docs/cudf/source/api_docs/series.rst
index d7015c9348d..2fd5f71b738 100644
--- a/docs/cudf/source/api_docs/series.rst
+++ b/docs/cudf/source/api_docs/series.rst
@@ -117,6 +117,7 @@ Computations / descriptive stats
    Series.all
    Series.any
    Series.autocorr
+   Series.between
    Series.clip
    Series.corr
    Series.count
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index ce3f2f7b619..3b80132ab4b 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1510,6 +1510,106 @@ def fillna(
             value=value, method=method, axis=axis, inplace=inplace, limit=limit
         )
 
+    @_cudf_nvtx_annotate
+    def between(self, left, right, inclusive="both") -> Series:
+        """
+        Return boolean Series equivalent to left <= series <= right.
+
+        This function returns a boolean vector containing `True` wherever the
+        corresponding Series element is between the boundary values `left` and
+        `right`. Null elements are not treated as `False`: they propagate to
+        the result as `<NA>` (see Examples).
+
+        Parameters
+        ----------
+        left : scalar or list-like
+            Left boundary.
+        right : scalar or list-like
+            Right boundary.
+        inclusive : {"both", "neither", "left", "right"}
+            Include boundaries. Whether to set each bound as closed or open.
+
+        Returns
+        -------
+        Series
+            Series representing whether each element is between left and
+            right, with each bound closed or open according to `inclusive`.
+
+        Raises
+        ------
+        ValueError
+            If `inclusive` is not one of ``"both"``, ``"neither"``,
+            ``"left"`` or ``"right"``.
+
+        See Also
+        --------
+        Series.gt : Greater than of series and other.
+        Series.lt : Less than of series and other.
+
+        Notes
+        -----
+        With the default ``inclusive="both"`` this function is equivalent to
+        ``(left <= ser) & (ser <= right)``.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> s = cudf.Series([2, 0, 4, 8, None])
+
+        Boundary values are included by default:
+
+        >>> s.between(1, 4)
+        0     True
+        1    False
+        2     True
+        3    False
+        4     <NA>
+        dtype: bool
+
+        With `inclusive` set to ``"neither"`` boundary values are excluded:
+
+        >>> s.between(1, 4, inclusive="neither")
+        0     True
+        1    False
+        2    False
+        3    False
+        4     <NA>
+        dtype: bool
+
+        `left` and `right` can be any scalar value:
+
+        >>> s = cudf.Series(['Alice', 'Bob', 'Carol', 'Eve'])
+        >>> s.between('Anna', 'Daniel')
+        0    False
+        1     True
+        2     True
+        3    False
+        dtype: bool
+        """
+        # Reject an unknown `inclusive` before converting list-like bounds
+        # to columns, so bad input fails without doing that work.
+        if inclusive not in ("both", "neither", "left", "right"):
+            raise ValueError(
+                "Inclusive has to be either string of 'both', "
+                "'left', 'right', or 'neither'."
+            )
+
+        left_operand = left if is_scalar(left) else as_column(left)
+        right_operand = right if is_scalar(right) else as_column(right)
+
+        # "both"/"left" close the lower bound; "both"/"right" close the
+        # upper bound. Any other valid value leaves that bound open.
+        if inclusive in ("both", "left"):
+            lmask = self._column >= left_operand
+        else:
+            lmask = self._column > left_operand
+        if inclusive in ("both", "right"):
+            rmask = self._column <= right_operand
+        else:
+            rmask = self._column < right_operand
+
+        return self._from_data({self.name: lmask & rmask}, self._index)
+
     @_cudf_nvtx_annotate
     def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs):
         if bool_only not in (None, True):
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index c11ab16ccec..4849acd6c0f 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1813,3 +1813,64 @@ def test_series_digitize_invalid_bins():
         ValueError, match="`bins` cannot contain null entries."
     ):
         _ = s.digitize(bins)
+
+
+@pytest.mark.parametrize(
+    "data,left,right",
+    [
+        ([0, 1, 2, 3, 4, 5, 10], 0, 5),
+        ([0, 1, 2, 3, 4, 5, 10], 10, 1),
+        ([0, 1, 2, 3, 4, 5], [0, 10, 11] * 2, [1, 2, 5] * 2),
+        (["a", "few", "set", "of", "strings", "xyz", "abc"], "banana", "few"),
+        (["a", "few", "set", "of", "strings", "xyz", "abc"], "phone", "hello"),
+        (
+            ["a", "few", "set", "of", "strings", "xyz", "abc"],
+            ["a", "hello", "rapids", "ai", "world", "chars", "strs"],
+            ["yes", "no", "hi", "bye", "test", "pass", "fail"],
+        ),
+        ([0, 1, 2, np.nan, 4, np.nan, 10], 10, 1),
+    ],
+)
+@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"])
+def test_series_between(data, left, right, inclusive):
+    ps = pd.Series(data)
+    # Keep NaN as NaN (not null) so the cudf result can be compared
+    # directly against the plain pandas result.
+    gs = cudf.from_pandas(ps, nan_as_null=False)
+
+    expected = ps.between(left, right, inclusive=inclusive)
+    actual = gs.between(left, right, inclusive=inclusive)
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "data,left,right",
+    [
+        ([0, 1, 2, None, 4, 5, 10], 0, 5),
+        ([0, 1, 2, 3, None, 5, 10], 10, 1),
+        ([None, 1, 2, 3, 4, None], [0, 10, 11] * 2, [1, 2, 5] * 2),
+        (
+            ["a", "few", "set", None, "strings", "xyz", "abc"],
+            ["a", "hello", "rapids", "ai", "world", "chars", "strs"],
+            ["yes", "no", "hi", "bye", "test", "pass", "fail"],
+        ),
+    ],
+)
+@pytest.mark.parametrize("inclusive", ["both", "neither", "left", "right"])
+def test_series_between_with_null(data, left, right, inclusive):
+    gs = cudf.Series(data)
+    # Compare through pandas nullable dtypes so null entries are checked
+    # as <NA> on both sides of the comparison.
+    ps = gs.to_pandas(nullable=True)
+
+    expected = ps.between(left, right, inclusive=inclusive)
+    actual = gs.between(left, right, inclusive=inclusive)
+
+    assert_eq(expected, actual.to_pandas(nullable=True))
+
+
+def test_series_between_invalid_inclusive():
+    gs = cudf.Series([0, 1, 2])
+    with pytest.raises(ValueError, match="Inclusive has to be either"):
+        gs.between(0, 1, inclusive="invalid")