diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index f7ef0f0db12..e9c8b71fbe3 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -4225,7 +4225,7 @@ def dropna( name toy born 0 Alfred Batmobile 1940-04-25 """ - if axis == 0: + if axis in [0, "index"]: result = self._drop_na_rows(how=how, subset=subset, thresh=thresh) if ignore_index: result.index = RangeIndex(len(result)) diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 9a1051ec158..b0419f844fd 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -5157,6 +5157,7 @@ def pytest_unconfigure(config): "tests/frame/methods/test_dropna.py::TestDataFrameMissingData::test_dropna", "tests/frame/methods/test_dropna.py::TestDataFrameMissingData::test_dropna_multiple_axes", "tests/frame/methods/test_dropna.py::TestDataFrameMissingData::test_no_nans_in_frame[axis=0]", + "tests/frame/methods/test_dropna.py::TestDataFrameMissingData::test_no_nans_in_frame[axis='index']", "tests/frame/methods/test_dtypes.py::TestDataFrameDataTypes::test_dtypes_timedeltas", "tests/frame/methods/test_equals.py::TestEquals::test_equals_different_blocks", "tests/frame/methods/test_explode.py::test_duplicate_index[input_dict0-input_index0-expected_dict0-expected_index0]", @@ -13144,7 +13145,6 @@ def pytest_unconfigure(config): "tests/series/methods/test_update.py::TestUpdate::test_update_dtypes[other9-int64-expected9-FutureWarning]", "tests/series/methods/test_value_counts.py::TestSeriesValueCounts::test_value_counts_categorical_with_nan", "tests/series/test_api.py::TestSeriesMisc::test_attrs", - "tests/series/test_api.py::TestSeriesMisc::test_axis_alias", "tests/series/test_api.py::TestSeriesMisc::test_index_tab_completion[index0]", "tests/series/test_api.py::TestSeriesMisc::test_index_tab_completion[index10]", 
"tests/series/test_api.py::TestSeriesMisc::test_index_tab_completion[index11]", diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py index 4c662808b9c..2987f94d1c9 100644 --- a/python/cudf/cudf/testing/_utils.py +++ b/python/cudf/cudf/testing/_utils.py @@ -1,7 +1,6 @@ # Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations -import itertools import string from collections import abc from contextlib import contextmanager @@ -358,12 +357,6 @@ def assert_asserters_equal( cudf_asserter(cudf_left, cudf_right, *args, **kwargs) -parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize( - "left_dtype,right_dtype", - list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)), -) - - @contextmanager def expect_warning_if(condition, warning=FutureWarning, *args, **kwargs): """Catch a warning using pytest.warns if the expect_warning is True. diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py index 772abdfba32..b9c21a67c43 100644 --- a/python/cudf/cudf/tests/conftest.py +++ b/python/cudf/cudf/tests/conftest.py @@ -1,6 +1,7 @@ # Copyright (c) 2019-2025, NVIDIA CORPORATION. 
import itertools +import math import operator import os import pathlib @@ -210,6 +211,44 @@ def set_decomp_env_vars(monkeypatch, request): operator.gt, operator.ge, ] +bitwise_ops = [ + operator.and_, + operator.or_, + operator.xor, +] +unary_ops = [ + math.acos, + math.acosh, + math.asin, + math.asinh, + math.atan, + math.atanh, + math.ceil, + math.cos, + math.degrees, + math.erf, + math.erfc, + math.exp, + math.expm1, + math.fabs, + math.floor, + math.gamma, + math.lgamma, + math.log, + math.log10, + math.log1p, + math.log2, + math.radians, + math.sin, + math.sinh, + math.sqrt, + math.tan, + math.tanh, + operator.pos, + operator.neg, + operator.not_, + operator.invert, +] @pytest.fixture(params=arithmetic_ops) @@ -238,6 +277,16 @@ def comparison_op_method(comparison_op): return comparison_op.__name__ +@pytest.fixture(params=bitwise_ops) +def bitwise_op(request): + return request.param + + +@pytest.fixture(params=unary_ops) +def unary_op(request): + return request.param + + @pytest.fixture(params=arithmetic_ops + comparison_ops) def binary_op(request): return request.param @@ -576,3 +625,9 @@ def categorical_ordered(request): def interval_closed(request): """Param for `closed` argument for interval types""" return request.param + + +@pytest.fixture(params=["all", "any"]) +def dropna_how(request): + """Param for `how` argument""" + return request.param diff --git a/python/cudf/cudf/tests/dataframe/methods/test_apply.py b/python/cudf/cudf/tests/dataframe/methods/test_apply.py new file mode 100644 index 00000000000..b49f8d9dc25 --- /dev/null +++ b/python/cudf/cudf/tests/dataframe/methods/test_apply.py @@ -0,0 +1,772 @@ +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
+import decimal +import math +import operator + +import numpy as np +import pytest +from numba import cuda +from numba.core.typing import signature as nb_signature +from numba.core.typing.templates import AbstractTemplate +from numba.cuda.cudadecl import registry as cuda_decl_registry +from numba.cuda.cudaimpl import lower as cuda_lower + +import cudf +from cudf.core.missing import NA +from cudf.core.udf._ops import ( + comparison_ops, +) +from cudf.core.udf.strings_lowering import ( + cast_string_view_to_managed_udf_string, +) +from cudf.core.udf.strings_typing import ( + StringView, + managed_udf_string, + string_view, +) +from cudf.testing import assert_eq + + +def sv_to_managed_udf_str(sv): + """ + Cast a string_view object to a managed_udf_string object + + This placeholder function never runs in python + It exists only for numba to have something to replace + with the typing and lowering code below + + This is similar conceptually to needing a translation + engine to emit an expression in target language "B" when + there is no equivalent in the source language "A" to + translate from. 
This function effectively defines the + expression in language "A" and the associated typing + and lowering describe the translation process, despite + the expression having no meaning in language "A" + """ + pass + + +@cuda_decl_registry.register_global(sv_to_managed_udf_str) +class StringViewToUDFStringDecl(AbstractTemplate): + def generic(args, kws): + if isinstance(args[0], StringView) and len(args) == 1: + return nb_signature(managed_udf_string, string_view) + + +@cuda_lower(sv_to_managed_udf_str, string_view) +def sv_to_udf_str_testing_lowering(context, builder, sig, args): + return cast_string_view_to_managed_udf_string( + context, builder, sig.args[0], sig.return_type, args[0] + ) + + +def run_masked_udf_test(func, data, args=(), nullable=True, **kwargs): + gdf = data + pdf = data.to_pandas(nullable=nullable) + + expect = pdf.apply(func, args=args, axis=1) + obtain = gdf.apply(func, args=args, axis=1) + assert_eq(expect, obtain, **kwargs) + + +@pytest.fixture +def str_udf_data(): + return cudf.DataFrame( + { + "str_col": [ + "abc", + "ABC", + "AbC", + "123", + "123aBc", + "123@.!", + "", + "rapids ai", + "gpu", + "True", + "False", + "1.234", + ".123a", + "0.013", + "1.0", + "01", + "20010101", + "cudf", + "cuda", + "gpu", + "This Is A Title", + "This is Not a Title", + "Neither is This a Title", + "NoT a TiTlE", + "123 Title Works", + ] + } + ) + + +@pytest.fixture(params=["a", "2", "gpu", "", " "]) +def substr(request): + return request.param + + +def test_string_udf_len(str_udf_data): + def func(row): + return len(row["str_col"]) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_startswith(str_udf_data, substr): + def func(row): + return row["str_col"].startswith(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_endswith(str_udf_data, substr): + def func(row): + return row["str_col"].endswith(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def 
test_string_udf_find(str_udf_data, substr): + def func(row): + return row["str_col"].find(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_rfind(str_udf_data, substr): + def func(row): + return row["str_col"].rfind(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_contains(str_udf_data, substr): + def func(row): + return substr in row["str_col"] + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("other", ["cudf", "123", "", " "]) +@pytest.mark.parametrize("cmpop", comparison_ops) +def test_string_udf_cmpops(str_udf_data, other, cmpop): + def func(row): + return cmpop(row["str_col"], other) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isalnum(str_udf_data): + def func(row): + return row["str_col"].isalnum() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isalpha(str_udf_data): + def func(row): + return row["str_col"].isalpha() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isdigit(str_udf_data): + def func(row): + return row["str_col"].isdigit() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isdecimal(str_udf_data): + def func(row): + return row["str_col"].isdecimal() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isupper(str_udf_data): + def func(row): + return row["str_col"].isupper() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_islower(str_udf_data): + def func(row): + return row["str_col"].islower() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_isspace(str_udf_data): + def func(row): + return row["str_col"].isspace() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_istitle(str_udf_data): + def func(row): + 
return row["str_col"].istitle() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_count(str_udf_data, substr): + def func(row): + return row["str_col"].count(substr) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.xfail(reason="Identity function not supported.") +def test_string_udf_return_string(str_udf_data): + def func(row): + return row["str_col"] + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_strip(str_udf_data, strip_char): + def func(row): + return row["str_col"].strip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_lstrip(str_udf_data, strip_char): + def func(row): + return row["str_col"].lstrip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_rstrip(str_udf_data, strip_char): + def func(row): + return row["str_col"].rstrip(strip_char) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_upper(str_udf_data): + def func(row): + return row["str_col"].upper() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_string_udf_lower(str_udf_data): + def func(row): + return row["str_col"].lower() + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("concat_char", ["1", "a", "12", " ", "", ".", "@"]) +def test_string_udf_concat(str_udf_data, concat_char): + def func(row): + return row["str_col"] + concat_char + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +@pytest.mark.parametrize("to_replace", ["a", "1", "", "@"]) +@pytest.mark.parametrize("replacement", ["a", "1", "", "@"]) +def 
test_string_udf_replace(str_udf_data, to_replace, replacement): + def func(row): + return row["str_col"].replace(to_replace, replacement) + + run_masked_udf_test(func, str_udf_data, check_dtype=False) + + +def test_arith_masked_vs_masked(arithmetic_op): + # This test should test all the typing + # and lowering for arithmetic ops between + # two columns + def func(row): + x = row["a"] + y = row["b"] + return arithmetic_op(x, y) + + gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_bitwise_masked_vs_masked(bitwise_op): + # This test should test all the typing + # and lowering for bitwise ops between + # two columns + def func(row): + x = row["a"] + y = row["b"] + return bitwise_op(x, y) + + gdf = cudf.DataFrame( + { + "a": [1, 0, 1, 0, 0b1011, 42, None], + "b": [1, 1, 0, 0, 0b1100, -42, 5], + } + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("op", [operator.add, operator.sub]) +def test_arith_masked_vs_masked_datelike( + op, datetime_types_as_str, temporal_types_as_str +): + # Datetime version of the above + # does not test all dtype combinations for now + if temporal_types_as_str.startswith("datetime") and op is operator.add: + # don't try adding datetimes to datetimes. + pytest.skip("Adding datetime to datetime is not valid") + + def func(row): + x = row["a"] + y = row["b"] + return op(x, y) + + gdf = cudf.DataFrame( + { + "a": ["2011-01-01", cudf.NA, "2011-03-01", cudf.NA], + "b": [4, 5, cudf.NA, cudf.NA], + } + ) + gdf["a"] = gdf["a"].astype(datetime_types_as_str) + gdf["b"] = gdf["b"].astype(temporal_types_as_str) + + pdf = gdf.to_pandas() + expect = op(pdf["a"], pdf["b"]) + obtain = gdf.apply(func, axis=1) + assert_eq(expect, obtain, check_dtype=False) + # TODO: After the following pandas issue is + # fixed, uncomment the following line and delete + # through `to_pandas()` statement. 
+ # https://github.com/pandas-dev/pandas/issues/52411 + + # run_masked_udf_test(func, gdf, nullable=False, check_dtype=False) + + +def test_compare_masked_vs_masked(comparison_op): + # this test should test all the + # typing and lowering for comparisons + # between columns + + def func(row): + x = row["a"] + y = row["b"] + return comparison_op(x, y) + + # we should get: + # [?, ?, <NA>, <NA>, <NA>] + gdf = cudf.DataFrame( + {"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]} + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("constant", [1, 1.5, True, False]) +def test_arith_masked_vs_constant(arithmetic_op, constant): + if constant is False and arithmetic_op in { + operator.mod, + operator.pow, + operator.truediv, + operator.floordiv, + operator.imod, + operator.ipow, + operator.itruediv, + operator.ifloordiv, + }: + # The following test cases yield undefined behavior: + # - truediv(x, False) because it's dividing by zero + # - floordiv(x, False) because it's dividing by zero + # - mod(x, False) because it's mod by zero + # - pow(x, False) because we have an NA in the series and pandas + # insists that (NA**0 == 1) where we do not + pytest.skip( + f"{constant=} yields undefined behavior for {arithmetic_op=}" + ) + + def func(row): + x = row["data"] + return arithmetic_op(x, constant) + + gdf = cudf.DataFrame({"data": [1, 2, cudf.NA]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("constant", [1, 1.5, True, False]) +@pytest.mark.parametrize("data", [[2, 3, cudf.NA], [1, cudf.NA, 1]]) +def test_arith_masked_vs_constant_reflected( + request, arithmetic_op, constant, data +): + def func(row): + x = row["data"] + return arithmetic_op(constant, x) + + # Just a single column -> result will be all NA + gdf = cudf.DataFrame({"data": data}) + + # cudf differs from pandas for 1**NA + request.applymarker( + pytest.mark.xfail( + condition=( + constant == 1 + and arithmetic_op in {operator.pow, operator.ipow} + ), 
+ reason="https://github.com/rapidsai/cudf/issues/7478", + ) + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("data", [[1, cudf.NA, 3], [2, 3, cudf.NA]]) +def test_arith_masked_vs_null(request, arithmetic_op, data): + def func(row): + x = row["data"] + return arithmetic_op(x, NA) + + gdf = cudf.DataFrame({"data": data}) + + # In pandas, 1**NA == 1. + request.applymarker( + pytest.mark.xfail( + condition=( + (gdf["data"] == 1).any() + and arithmetic_op in {operator.pow, operator.ipow} + ), + reason="https://github.com/rapidsai/cudf/issues/7478", + ) + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_arith_masked_vs_null_reflected(arithmetic_op): + def func(row): + x = row["data"] + return arithmetic_op(NA, x) + + gdf = cudf.DataFrame({"data": [1, None, 3]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_unary_masked(unary_op): + # This test should test all the typing + # and lowering for unary ops + + def func(row): + x = row["a"] + return unary_op(x) if x is not NA else NA + + if "log" in unary_op.__name__: + gdf = cudf.DataFrame({"a": [0.1, 1.0, None, 3.5, 1e8]}) + elif unary_op.__name__ in {"asin", "acos"}: + gdf = cudf.DataFrame({"a": [0.0, 0.5, None, 1.0]}) + elif unary_op.__name__ in {"atanh"}: + gdf = cudf.DataFrame({"a": [0.0, -0.5, None, 0.8]}) + elif unary_op.__name__ in {"acosh", "sqrt", "lgamma"}: + gdf = cudf.DataFrame({"a": [1.0, 2.0, None, 11.0]}) + elif unary_op.__name__ in {"gamma"}: + gdf = cudf.DataFrame({"a": [0.1, 2, None, 4]}) + elif unary_op.__name__ in {"invert"}: + gdf = cudf.DataFrame({"a": [-100, 128, None, 0]}, dtype="int64") + else: + gdf = cudf.DataFrame({"a": [-125.60, 395.2, 0.0, None]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_masked_is_null_conditional(): + def func(row): + x = row["a"] + y = row["b"] + if x is NA: + return y + else: + return x + y + + gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) + 
run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_apply_contains(): + def func(row): + x = row["a"] + return x in [1, 2] + + gdf = cudf.DataFrame({"a": [1, 3]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) +def test_apply_mixed_dtypes(numeric_types_as_str, numeric_types_as_str2, op): + """ + Test that operations can be performed between columns + of different dtypes and return a column with the correct + values and nulls + """ + + # First perform the op on two dummy data on host, if numpy can + # safely type cast, we should expect it to work in udf too. + try: + op( + np.dtype(numeric_types_as_str).type(0), + np.dtype(numeric_types_as_str2).type(42), + ) + except TypeError: + pytest.skip("Operation is unsupported for corresponding dtype.") + + def func(row): + x = row["a"] + y = row["b"] + return op(x, y) + + gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) + gdf["a"] = gdf["a"].astype(numeric_types_as_str) + gdf["b"] = gdf["b"].astype(numeric_types_as_str2) + + run_masked_udf_test(func, gdf, check_dtype=False) + + +@pytest.mark.parametrize("val", [5, 5.5]) +def test_apply_return_literal(val): + """ + Test unification codepath for scalars and MaskedType + makes sure that numba knows how to cast a scalar value + to a MaskedType + """ + + def func(row): + x = row["a"] + y = row["b"] + if x is not NA and x < 2: + return val + else: + return x + y + + gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) + + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_apply_return_null(): + """ + Tests casting / unification of Masked and NA + """ + + def func(row): + x = row["a"] + if x is NA: + return NA + else: + return x + + gdf = cudf.DataFrame({"a": [1, None, 3]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_apply_return_either_null_or_literal(): + def func(row): + x = row["a"] + if x > 5: + 
return 2 + else: + return NA + + gdf = cudf.DataFrame({"a": [1, 3, 6]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_apply_return_literal_only(): + def func(x): + return 5 + + gdf = cudf.DataFrame({"a": [1, None, 3]}) + run_masked_udf_test(func, gdf, check_dtype=False) + + +def test_apply_everything(): + def func(row): + w = row["a"] + x = row["b"] + y = row["c"] + z = row["d"] + if x is NA: + return w + y - z + elif ((z > y) is not NA) and z > y: + return x + elif ((x + y) is not NA) and x + y == 0: + return z / x + elif x + y is NA: + return 2.5 + elif w > 100: + return ( + math.sin(x) + + math.sqrt(y) + - (-z) + + math.lgamma(x) * math.fabs(-0.8) / math.radians(3.14) + ) + else: + return y > 2 + + gdf = cudf.DataFrame( + { + "a": [1, 3, 6, 0, None, 5, None, 101], + "b": [3.0, 2.5, None, 5.0, 1.0, 5.0, 11.0, 1.0], + "c": [2, 3, 6, 0, None, 5, None, 6], + "d": [4, None, 6, 0, None, 5, None, 7.5], + } + ) + run_masked_udf_test(func, gdf, check_dtype=False) + + +### + + +### + + +def test_masked_udf_lambda_support(binary_op): + func = lambda row: binary_op(row["a"], row["b"]) # noqa: E731 + + data = cudf.DataFrame( + {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} + ) + + run_masked_udf_test(func, data, check_dtype=False) + + +def test_masked_udf_nested_function_support(binary_op): + """ + Nested functions need to be explicitly jitted by the user + for numba to recognize them. Unfortunately the object + representing the jitted function can not itself be used in + pandas udfs. 
+ """ + + def inner(x, y): + return binary_op(x, y) + + def outer(row): + x = row["a"] + y = row["b"] + return inner(x, y) + + gdf = cudf.DataFrame( + {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} + ) + + with pytest.raises(ValueError): + gdf.apply(outer, axis=1) + + pdf = gdf.to_pandas(nullable=True) + inner_gpu = cuda.jit(device=True)(inner) + + def outer_gpu(row): + x = row["a"] + y = row["b"] + return inner_gpu(x, y) + + got = gdf.apply(outer_gpu, axis=1) + expect = pdf.apply(outer, axis=1) + assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, + {"a": [1, 2, 3], "c": [4, 5, 6], "b": [7, 8, 9]}, + {"a": [1, 2, 3], "b": [4, 5, 6], "c": ["a", "b", "c"]}, + ], +) +def test_masked_udf_subset_selection(data): + def func(row): + return row["a"] + row["b"] + + data = cudf.DataFrame(data) + run_masked_udf_test(func, data) + + +@pytest.mark.parametrize( + "unsupported_col", + [ + lambda: cudf.Series( + [ + decimal.Decimal("1.0"), + decimal.Decimal("2.0"), + decimal.Decimal("3.0"), + ], + dtype=cudf.Decimal64Dtype(2, 1), + ), + lambda: cudf.Series([1, 2, 3], dtype="category"), + lambda: cudf.interval_range(start=0, end=3), + lambda: [[1, 2], [3, 4], [5, 6]], + lambda: [{"a": 1}, {"a": 2}, {"a": 3}], + ], +) +def test_masked_udf_unsupported_dtype(unsupported_col): + data = cudf.DataFrame({"unsupported_col": unsupported_col()}) + + def func(row): + return row["unsupported_col"] + + # check that we fail when an unsupported type is used within a function + with pytest.raises(ValueError): + data.apply(func, axis=1) + + # also check that a DF containing unsupported dtypes can still run a + # function that does NOT involve any of the unsupported dtype columns + data["supported_col"] = 1 + + def other_func(row): + return row["supported_col"] + + expect = cudf.Series(np.ones(len(data))) + got = data.apply(other_func, axis=1) + + assert_eq(expect, got, check_dtype=False) + + +# 
tests for `DataFrame.apply(f, args=(x,y,z))` +# testing the whole space of possibilities is intractable +# these test the most rudimentary guaranteed functionality +@pytest.mark.parametrize( + "data", + [ + {"a": [1, cudf.NA, 3]}, + {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, + {"a": [True, False, cudf.NA]}, + ], +) +def test_masked_udf_scalar_args_binops(data, binary_op): + data = cudf.DataFrame(data) + + def func(row, c): + return binary_op(row["a"], c) + + run_masked_udf_test(func, data, args=(1,), check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, cudf.NA, 3]}, + {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, + {"a": [True, False, cudf.NA]}, + ], +) +def test_masked_udf_scalar_args_binops_multiple(data, binary_op): + data = cudf.DataFrame(data) + + def func(row, c, k): + x = binary_op(row["a"], c) + y = binary_op(x, k) + return y + + run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) diff --git a/python/cudf/cudf/tests/dataframe/methods/test_dropna.py b/python/cudf/cudf/tests/dataframe/methods/test_dropna.py index ec27503a0ef..f3c2f5a236a 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_dropna.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_dropna.py @@ -1,8 +1,9 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
- import numpy as np import pandas as pd +import pyarrow as pa +import pytest import cudf from cudf.testing import assert_eq @@ -32,3 +33,157 @@ def test_datetime_dataframe(): assert_eq(ps.dropna(), gs.dropna()) assert_eq(ps.isnull(), gs.isnull()) + + +@pytest.mark.parametrize( + "data", + [ + {"a": [1, 2, None]}, + {"a": [1, 2, None], "b": [3, 4, 5]}, + {"a": [1, 2, None], "b": [3, 4, None]}, + {"a": [None, 1, 2], "b": [1, 2, None]}, + {"a": [None, 1, None], "b": [None, 2, None]}, + {"a": [None, None, 1], "b": [1, 2, None]}, + {"a": ["d", "e", "f"], "b": ["a", None, "c"]}, + ], +) +def test_dropna_dataframe(data, dropna_how, axis, inplace): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + + expected = pdf.dropna(axis=axis, how=dropna_how, inplace=inplace) + actual = gdf.dropna(axis=axis, how=dropna_how, inplace=inplace) + + if inplace: + expected = pdf + actual = gdf + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "data", + [ + { + "a": pa.array([None, None, None], type=pa.float64()), + "b": [1, 2, None], + }, + { + "a": pa.array([np.nan, np.nan, np.nan]), + "b": [1, 2, None], + }, + {"a": pa.array([None, None, None], type=pa.string())}, + ], +) +def test_dropna_with_all_nulls(dropna_how, data, axis): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + assert_eq( + pdf.dropna(axis=axis, how=dropna_how), + gdf.dropna(axis=axis, how=dropna_how), + check_dtype=False, + ) + + +@pytest.mark.parametrize( + "data,subset", + [ + ({"a": [1, None], "b": [1, 2]}, ["a"]), + ({"a": [1, None], "b": [1, 2]}, ["b"]), + ({"a": [1, None], "b": [1, 2]}, []), + ({"a": [1, 2], "b": [1, 2]}, ["b"]), + ({"a": [1, 2, None], "b": [1, None, 2]}, ["a"]), + ({"a": [1, 2, None], "b": [1, None, 2]}, ["b"]), + ({"a": [1, 2, None], "b": [1, None, 2]}, ["a", "b"]), + ], +) +def test_dropna_subset_rows(data, subset): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + + assert_eq(pdf.dropna(subset=subset), gdf.dropna(subset=subset)) + + 
+@pytest.mark.parametrize( + "data, subset", + [ + ({"a": [1, None], "b": [1, 2]}, [0]), + ({"a": [1, None], "b": [1, 2]}, [1]), + ({"a": [1, None], "b": [1, 2]}, []), + ({"a": [1, 2], "b": [1, 2]}, [0]), + ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [0]), + ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [1]), + ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [0, 1]), + ], +) +def test_dropna_subset_cols(data, subset): + pdf = pd.DataFrame(data) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.dropna(axis=1, subset=subset), gdf.dropna(axis=1, subset=subset) + ) + + +# TODO: can't test with subset=[] below since Pandas +# returns empty DF when both subset=[] and thresh are specified. +@pytest.mark.parametrize("thresh", [0, 1, 2]) +@pytest.mark.parametrize("subset", [None, ["a"], ["b"], ["a", "b"]]) +def test_dropna_thresh(thresh, subset): + pdf = pd.DataFrame({"a": [1, 2, None, None], "b": [1, 2, 3, None]}) + gdf = cudf.from_pandas(pdf) + + assert_eq( + pdf.dropna(axis=0, thresh=thresh, subset=subset), + gdf.dropna(axis=0, thresh=thresh, subset=subset), + ) + + +@pytest.mark.parametrize("thresh", [0, 1, 2]) +@pytest.mark.parametrize("subset", [None, [0], [1], [0, 1]]) +def test_dropna_thresh_cols(thresh, subset, inplace): + pdf = pd.DataFrame( + {"a": [1, 2], "b": [3, 4], "c": [5, None], "d": [np.nan, np.nan]} + ) + gdf = cudf.from_pandas(pdf) + + expected = pdf.dropna( + axis=1, thresh=thresh, subset=subset, inplace=inplace + ) + actual = gdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) + + if inplace: + expected = pdf + actual = gdf + + assert_eq( + expected, + actual, + ) + + +@pytest.mark.parametrize( + "data", + [ + { + "key": [1, 2, 10], + "val": pa.array([np.nan, 3.0, 1.0]), + "abc": [np.nan, None, 1], + }, + { + "key": [None, 2, 1], + "val": pa.array([3.0, None, 0.1]), + "abc": [None, 1, None], + }, + ], +) +def test_dropna_dataframe_np_nan(data, axis): + gdf = cudf.DataFrame(data) + pd_data = { + key: value.to_pandas() if 
isinstance(value, cudf.Series) else value + for key, value in data.items() + } + pdf = pd.DataFrame(pd_data) + + assert_eq(pdf.dropna(axis=axis), gdf.dropna(axis=axis), check_dtype=False) diff --git a/python/cudf/cudf/tests/indexes/index/methods/test_dropna.py b/python/cudf/cudf/tests/indexes/index/methods/test_dropna.py new file mode 100644 index 00000000000..b572e9e156d --- /dev/null +++ b/python/cudf/cudf/tests/indexes/index/methods/test_dropna.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data, dtype", + [ + ([1, float("nan"), 2], "float64"), + (["x", None, "y"], "str"), + (["x", None, "y"], "category"), + (["2020-01-20", pd.NaT, "2020-03-15"], "datetime64[ns]"), + (["1s", pd.NaT, "3d"], "timedelta64[ns]"), + ], +) +def test_dropna_index(data, dtype): + pi = pd.Index(data, dtype=dtype) + gi = cudf.from_pandas(pi) + + expect = pi.dropna() + got = gi.dropna() + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/indexes/index/test_attributes.py b/python/cudf/cudf/tests/indexes/index/test_attributes.py index 2e80dfb272e..ee4a1654a10 100644 --- a/python/cudf/cudf/tests/indexes/index/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/index/test_attributes.py @@ -1,10 +1,45 @@ # Copyright (c) 2025, NVIDIA CORPORATION. 
+import datetime import numpy as np import pandas as pd import pytest -from cudf import Index +import cudf + + +@pytest.mark.parametrize( + "values, item, expected", + [ + [[1, 2, 3], 2, True], + [[1, 2, 3], 4, False], + [[1, 2, 3], "a", False], + [["a", "b", "c"], "a", True], + [["a", "b", "c"], "ab", False], + [["a", "b", "c"], 6, False], + [pd.Categorical(["a", "b", "c"]), "a", True], + [pd.Categorical(["a", "b", "c"]), "ab", False], + [pd.Categorical(["a", "b", "c"]), 6, False], + [pd.date_range("20010101", periods=5, freq="D"), 20000101, False], + [ + pd.date_range("20010101", periods=5, freq="D"), + datetime.datetime(2000, 1, 1), + False, + ], + [ + pd.date_range("20010101", periods=5, freq="D"), + datetime.datetime(2001, 1, 1), + True, + ], + ], +) +@pytest.mark.parametrize( + "box", + [cudf.Index, lambda x: cudf.Series(index=x)], + ids=["index", "series"], +) +def test_contains(values, item, expected, box): + assert (item in box(values)) is expected @pytest.mark.parametrize( @@ -26,7 +61,7 @@ ], ) def test_index_is_unique_monotonic(testlist): - index = Index(testlist) + index = cudf.Index(testlist) index_pd = pd.Index(testlist) assert index.is_unique == index_pd.is_unique diff --git a/python/cudf/cudf/tests/indexes/multiindex/methods/test_dropna.py b/python/cudf/cudf/tests/indexes/multiindex/methods/test_dropna.py new file mode 100644 index 00000000000..049dc847e18 --- /dev/null +++ b/python/cudf/cudf/tests/indexes/multiindex/methods/test_dropna.py @@ -0,0 +1,41 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. 
+ +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +def test_dropna_multiindex(dropna_how): + pi = pd.MultiIndex.from_arrays([[1, None, 2], [None, None, 2]]) + gi = cudf.from_pandas(pi) + + expect = pi.dropna(dropna_how) + got = gi.dropna(dropna_how) + assert_eq(expect, got) + + +@pytest.mark.parametrize( + "data", + [ + [ + [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-02-01")], + [pd.NaT, pd.NaT, pd.Timestamp("2020-03-01")], + ], + [ + [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-02-01")], + [np.nan, np.nan, 1.0], + ], + [[1.0, np.nan, 2.0], [np.nan, np.nan, 1.0]], + ], +) +def test_dropna_multiindex_2(data, dropna_how): + pi = pd.MultiIndex.from_arrays(data) + gi = cudf.from_pandas(pi) + + expect = pi.dropna(dropna_how) + got = gi.dropna(dropna_how) + + assert_eq(expect, got) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py b/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py index a9de5d39622..60ee8b432e6 100644 --- a/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py +++ b/python/cudf/cudf/tests/indexes/rangeindex/test_attributes.py @@ -3,14 +3,20 @@ import pandas as pd import pytest -from cudf.core.index import RangeIndex +import cudf + + +def test_rangeindex_contains(): + ridx = cudf.RangeIndex(start=0, stop=10, name="Index") + assert 9 in ridx + assert 10 not in ridx @pytest.mark.parametrize( "start, stop, step", [(10, 20, 1), (0, -10, -1), (5, 5, 1)] ) def test_range_index_is_unique_monotonic(start, stop, step): - index = RangeIndex(start=start, stop=stop, step=step) + index = cudf.RangeIndex(start=start, stop=stop, step=step) index_pd = pd.RangeIndex(start=start, stop=stop, step=step) assert index.is_unique == index_pd.is_unique diff --git a/python/cudf/cudf/tests/series/methods/test_apply.py b/python/cudf/cudf/tests/series/methods/test_apply.py new file mode 100644 index 00000000000..8bdb46e02a2 --- /dev/null +++ 
b/python/cudf/cudf/tests/series/methods/test_apply.py @@ -0,0 +1,281 @@ +# Copyright (c) 2021-2025, NVIDIA CORPORATION. +import operator + +import numpy as np +import pytest + +import cudf +from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION +from cudf.core.udf.utils import precompiled +from cudf.testing import assert_eq + + +def run_masked_udf_series(func, data, args=(), **kwargs): + gsr = data + psr = data.to_pandas(nullable=True) + + expect = psr.apply(func, args=args) + obtain = gsr.apply(func, args=args) + assert_eq(expect, obtain, **kwargs) + + +@pytest.mark.parametrize( + "data", + [ + np.array( + [0, 1, -1, 0, np.iinfo("int64").min, np.iinfo("int64").max], + dtype="int64", + ), + np.array([0, 0, 1, np.iinfo("uint64").max], dtype="uint64"), + np.array( + [ + 0, + 0.0, + -1.0, + 1.5, + -1.5, + np.finfo("float64").min, + np.finfo("float64").max, + np.nan, + np.inf, + -np.inf, + ], + dtype="float64", + ), + [False, True, False, cudf.NA], + ], +) +def test_masked_udf_abs(data): + data = cudf.Series(data) + data[0] = cudf.NA + + def func(x): + return abs(x) + + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.parametrize( + "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] +) +@pytest.mark.parametrize("operator", [float, int, bool]) +def test_masked_udf_casting(operator, data): + data = cudf.Series(data) + + def func(x): + return operator(x) + + run_masked_udf_series(func, data, check_dtype=False) + + +def test_masked_udf_caching(): + # Make sure similar functions that differ + # by simple things like constants actually + # recompile + + data = cudf.Series([1, 2, 3]) + + expect = data**2 + got = data.apply(lambda x: x**2) + assert_eq(expect, got, check_dtype=False) + + # update the constant value being used and make sure + # it does not result in a cache hit + + expect = data**3 + got = data.apply(lambda x: x**3) + assert_eq(expect, got, check_dtype=False) + + # make sure we get a hit when reapplying + def 
f(x): + return x + 1 + + precompiled.clear() + assert precompiled.currsize == 0 + data.apply(f) + + assert precompiled.currsize == 1 + data.apply(f) + + assert precompiled.currsize == 1 + + # validate that changing the type of a scalar arg + # results in a miss + precompiled.clear() + + def f(x, c): + return x + c + + data.apply(f, args=(1,)) + assert precompiled.currsize == 1 + + data.apply(f, args=(1.5,)) + assert precompiled.currsize == 2 + + +@pytest.mark.parametrize( + "data", + [ + [1, cudf.NA, 3], + [0.5, 2.0, cudf.NA, cudf.NA, 5.0], + [True, False, cudf.NA], + ], +) +def test_masked_udf_scalar_args_binops_multiple_series( + request, data, binary_op +): + data = cudf.Series(data) + request.applymarker( + pytest.mark.xfail( + binary_op + in [ + operator.eq, + operator.ne, + operator.lt, + operator.le, + operator.gt, + operator.ge, + ] + and PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION + and data.dtype.kind != "b", + reason="https://github.com/pandas-dev/pandas/issues/57390", + ) + ) + + def func(data, c, k): + x = binary_op(data, c) + y = binary_op(x, k) + return y + + run_masked_udf_series(func, data, args=(1, 2), check_dtype=False) + + +@pytest.mark.parametrize( + "data", + [ + [1, cudf.NA, 3], + [0.5, 2.0, cudf.NA, cudf.NA, 5.0], + [True, False, cudf.NA], + ], +) +def test_mask_udf_scalar_args_binops_series(data): + data = cudf.Series(data) + + def func(x, c): + return x + c + + run_masked_udf_series(func, data, args=(1,), check_dtype=False) + + +@pytest.mark.parametrize( + "data,name", + [([1, 2, 3], None), ([1, cudf.NA, 3], None), ([1, 2, 3], "test_name")], +) +def test_series_apply_basic(data, name): + data = cudf.Series(data, name=name) + + def func(x): + return x + 1 + + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.xfail( + PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/57390", +) +def test_series_apply_null_conditional(): + def func(x): + if x is cudf.NA: + 
return 42 + else: + return x - 1 + + data = cudf.Series([1, cudf.NA, 3]) + + run_masked_udf_series(func, data) + + +def test_series_arith_masked_vs_masked(arithmetic_op): + def func(x): + return arithmetic_op(x, x) + + data = cudf.Series([1, cudf.NA, 3]) + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.xfail( + PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/57390", +) +def test_series_compare_masked_vs_masked(comparison_op): + """ + In the series case, only one other MaskedType to compare with + - itself + """ + + def func(x): + return comparison_op(x, x) + + data = cudf.Series([1, cudf.NA, 3]) + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) +def test_series_arith_masked_vs_constant(request, arithmetic_op, constant): + def func(x): + return arithmetic_op(x, constant) + + # Just a single column -> result will be all NA + data = cudf.Series([1, 2, cudf.NA]) + # in pandas, 1**NA == 1. In cudf, 1**NA == NA. + request.applymarker( + pytest.mark.xfail( + condition=( + constant is cudf.NA + and arithmetic_op in {operator.pow, operator.ipow} + ), + reason="https://github.com/rapidsai/cudf/issues/7478", + ) + ) + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) +def test_series_arith_masked_vs_constant_reflected( + request, arithmetic_op, constant +): + def func(x): + return arithmetic_op(constant, x) + + # Just a single column -> result will be all NA + data = cudf.Series([1, 2, cudf.NA]) + # Using in {1} since bool(NA == 1) raises a TypeError since NA is + # neither truthy nor falsy + # in pandas, 1**NA == 1. In cudf, 1**NA == NA. 
+ request.applymarker( + pytest.mark.xfail( + condition=( + constant in {1} + and arithmetic_op in {operator.pow, operator.ipow} + ), + reason="https://github.com/rapidsai/cudf/issues/7478", + ) + ) + run_masked_udf_series(func, data, check_dtype=False) + + +@pytest.mark.xfail( + PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, + reason="https://github.com/pandas-dev/pandas/issues/57390", +) +def test_series_masked_is_null_conditional(): + def func(x): + if x is cudf.NA: + return 42 + else: + return x + + data = cudf.Series([1, cudf.NA, 3, cudf.NA]) + + run_masked_udf_series(func, data, check_dtype=False) diff --git a/python/cudf/cudf/tests/series/methods/test_dropna.py b/python/cudf/cudf/tests/series/methods/test_dropna.py new file mode 100644 index 00000000000..dafcbf5bbfe --- /dev/null +++ b/python/cudf/cudf/tests/series/methods/test_dropna.py @@ -0,0 +1,81 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. + +import numpy as np +import pandas as pd +import pytest + +import cudf +from cudf.testing import assert_eq + + +@pytest.mark.parametrize( + "data", + [ + [], + [1.0, 2, None, 4], + ["one", "two", "three", "four"], + pd.Series(["a", "b", "c", "d"], dtype="category"), + pd.Series(pd.date_range("2010-01-01", "2010-01-04")), + ], +) +@pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) +def test_dropna_series(data, nulls, inplace): + psr = pd.Series(data) + rng = np.random.default_rng(seed=0) + if len(data) > 0: + if nulls == "one": + p = rng.integers(0, 4) + psr[p] = None + elif nulls == "some": + p1, p2 = rng.integers(0, 4, (2,)) + psr[p1] = None + psr[p2] = None + elif nulls == "all": + psr[:] = None + + gsr = cudf.from_pandas(psr) + + check_dtype = True + if gsr.null_count == len(gsr): + check_dtype = False + + expected = psr.dropna() + actual = gsr.dropna() + + if inplace: + expected = psr + actual = gsr + + assert_eq(expected, actual, check_dtype=check_dtype) + + +def test_dropna_nan_as_null(): + sr = cudf.Series([1.0, 2.0, np.nan, None], 
nan_as_null=False) + assert_eq(sr.dropna(), sr[:2]) + sr = sr.nans_to_nulls() + assert_eq(sr.dropna(), sr[:2]) + + df = cudf.DataFrame( + { + "a": cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False), + "b": cudf.Series([1, 2, 3, 4]), + } + ) + + got = df.dropna() + expected = df[:2] + assert_eq(expected, got) + + df = df.nans_to_nulls() + got = df.dropna() + expected = df[:2] + assert_eq(expected, got) + + +def test_ignore_index(): + pser = pd.Series([1, 2, np.nan], index=[2, 4, 1]) + gser = cudf.from_pandas(pser) + + result = pser.dropna(ignore_index=True) + expected = gser.dropna(ignore_index=True) + assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py index 2c6bc0e8a00..7db558335fe 100644 --- a/python/cudf/cudf/tests/test_contains.py +++ b/python/cudf/cudf/tests/test_contains.py @@ -1,55 +1,14 @@ # Copyright (c) 2019-2025, NVIDIA CORPORATION. -import datetime import numpy as np -import pandas as pd import pytest import cudf from cudf import Series -from cudf.core.index import Index, RangeIndex from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES -@pytest.mark.parametrize( - "values, item, expected", - [ - [[1, 2, 3], 2, True], - [[1, 2, 3], 4, False], - [[1, 2, 3], "a", False], - [["a", "b", "c"], "a", True], - [["a", "b", "c"], "ab", False], - [["a", "b", "c"], 6, False], - [pd.Categorical(["a", "b", "c"]), "a", True], - [pd.Categorical(["a", "b", "c"]), "ab", False], - [pd.Categorical(["a", "b", "c"]), 6, False], - [pd.date_range("20010101", periods=5, freq="D"), 20000101, False], - [ - pd.date_range("20010101", periods=5, freq="D"), - datetime.datetime(2000, 1, 1), - False, - ], - [ - pd.date_range("20010101", periods=5, freq="D"), - datetime.datetime(2001, 1, 1), - True, - ], - ], -) -@pytest.mark.parametrize( - "box", [Index, lambda x: Series(index=x)], ids=["index", "series"] -) -def test_contains(values, item, expected, box): - assert (item in box(values)) 
is expected - - -def test_rangeindex_contains(): - ridx = RangeIndex(start=0, stop=10, name="Index") - assert 9 in ridx - assert 10 not in ridx - - @pytest.mark.parametrize("dtype", NUMERIC_TYPES) def test_lists_contains(dtype): dtype = cudf.dtype(dtype) diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py deleted file mode 100644 index 1f927d03e95..00000000000 --- a/python/cudf/cudf/tests/test_dropna.py +++ /dev/null @@ -1,299 +0,0 @@ -# Copyright (c) 2020-2025, NVIDIA CORPORATION. - -import numpy as np -import pandas as pd -import pyarrow as pa -import pytest - -import cudf -from cudf.testing import assert_eq - - -@pytest.mark.parametrize( - "data", - [ - [], - [1.0, 2, None, 4], - ["one", "two", "three", "four"], - pd.Series(["a", "b", "c", "d"], dtype="category"), - pd.Series(pd.date_range("2010-01-01", "2010-01-04")), - ], -) -@pytest.mark.parametrize("nulls", ["one", "some", "all", "none"]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dropna_series(data, nulls, inplace): - psr = pd.Series(data) - rng = np.random.default_rng(seed=0) - if len(data) > 0: - if nulls == "one": - p = rng.integers(0, 4) - psr[p] = None - elif nulls == "some": - p1, p2 = rng.integers(0, 4, (2,)) - psr[p1] = None - psr[p2] = None - elif nulls == "all": - psr[:] = None - - gsr = cudf.from_pandas(psr) - - check_dtype = True - if gsr.null_count == len(gsr): - check_dtype = False - - expected = psr.dropna() - actual = gsr.dropna() - - if inplace: - expected = psr - actual = gsr - - assert_eq(expected, actual, check_dtype=check_dtype) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, None]}, - {"a": [1, 2, None], "b": [3, 4, 5]}, - {"a": [1, 2, None], "b": [3, 4, None]}, - {"a": [None, 1, 2], "b": [1, 2, None]}, - {"a": [None, 1, None], "b": [None, 2, None]}, - {"a": [None, None, 1], "b": [1, 2, None]}, - {"a": ["d", "e", "f"], "b": ["a", None, "c"]}, - ], -) -@pytest.mark.parametrize("how", ["all", "any"]) 
-@pytest.mark.parametrize("axis", [0, 1]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dropna_dataframe(data, how, axis, inplace): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - expected = pdf.dropna(axis=axis, how=how, inplace=inplace) - actual = gdf.dropna(axis=axis, how=how, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq(expected, actual) - - -@pytest.mark.parametrize("how", ["all", "any"]) -@pytest.mark.parametrize( - "data", - [ - { - "a": pa.array([None, None, None], type=pa.float64()), - "b": [1, 2, None], - }, - { - "a": pa.array([np.nan, np.nan, np.nan]), - "b": [1, 2, None], - }, - {"a": pa.array([None, None, None], type=pa.string())}, - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -def test_dropna_with_all_nulls(how, data, axis): - gdf = cudf.DataFrame(data) - pdf = gdf.to_pandas() - - assert_eq( - pdf.dropna(axis=axis, how=how), - gdf.dropna(axis=axis, how=how), - check_dtype=False, - ) - - -def test_dropna_nan_as_null(): - sr = cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False) - assert_eq(sr.dropna(), sr[:2]) - sr = sr.nans_to_nulls() - assert_eq(sr.dropna(), sr[:2]) - - df = cudf.DataFrame( - { - "a": cudf.Series([1.0, 2.0, np.nan, None], nan_as_null=False), - "b": cudf.Series([1, 2, 3, 4]), - } - ) - - got = df.dropna() - expected = df[:2] - assert_eq(expected, got) - - df = df.nans_to_nulls() - got = df.dropna() - expected = df[:2] - assert_eq(expected, got) - - -@pytest.mark.parametrize( - "data,subset", - [ - ({"a": [1, None], "b": [1, 2]}, ["a"]), - ({"a": [1, None], "b": [1, 2]}, ["b"]), - ({"a": [1, None], "b": [1, 2]}, []), - ({"a": [1, 2], "b": [1, 2]}, ["b"]), - ({"a": [1, 2, None], "b": [1, None, 2]}, ["a"]), - ({"a": [1, 2, None], "b": [1, None, 2]}, ["b"]), - ({"a": [1, 2, None], "b": [1, None, 2]}, ["a", "b"]), - ], -) -def test_dropna_subset_rows(data, subset): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - assert_eq(pdf.dropna(subset=subset), 
gdf.dropna(subset=subset)) - - -@pytest.mark.parametrize( - "data, subset", - [ - ({"a": [1, None], "b": [1, 2]}, [0]), - ({"a": [1, None], "b": [1, 2]}, [1]), - ({"a": [1, None], "b": [1, 2]}, []), - ({"a": [1, 2], "b": [1, 2]}, [0]), - ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [0]), - ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [1]), - ({"a": [1, 2], "b": [None, 2], "c": [3, None]}, [0, 1]), - ], -) -def test_dropna_subset_cols(data, subset): - pdf = pd.DataFrame(data) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.dropna(axis=1, subset=subset), gdf.dropna(axis=1, subset=subset) - ) - - -# TODO: can't test with subset=[] below since Pandas -# returns empty DF when both subset=[] and thresh are specified. -@pytest.mark.parametrize("thresh", [0, 1, 2]) -@pytest.mark.parametrize("subset", [None, ["a"], ["b"], ["a", "b"]]) -def test_dropna_thresh(thresh, subset): - pdf = pd.DataFrame({"a": [1, 2, None, None], "b": [1, 2, 3, None]}) - gdf = cudf.from_pandas(pdf) - - assert_eq( - pdf.dropna(axis=0, thresh=thresh, subset=subset), - gdf.dropna(axis=0, thresh=thresh, subset=subset), - ) - - -@pytest.mark.parametrize("thresh", [0, 1, 2]) -@pytest.mark.parametrize("subset", [None, [0], [1], [0, 1]]) -@pytest.mark.parametrize("inplace", [True, False]) -def test_dropna_thresh_cols(thresh, subset, inplace): - pdf = pd.DataFrame( - {"a": [1, 2], "b": [3, 4], "c": [5, None], "d": [np.nan, np.nan]} - ) - gdf = cudf.from_pandas(pdf) - - expected = pdf.dropna( - axis=1, thresh=thresh, subset=subset, inplace=inplace - ) - actual = gdf.dropna(axis=1, thresh=thresh, subset=subset, inplace=inplace) - - if inplace: - expected = pdf - actual = gdf - - assert_eq( - expected, - actual, - ) - - -@pytest.mark.parametrize( - "data", - [ - { - "key": [1, 2, 10], - "val": pa.array([np.nan, 3.0, 1.0]), - "abc": [np.nan, None, 1], - }, - { - "key": [None, 2, 1], - "val": pa.array([3.0, None, 0.1]), - "abc": [None, 1, None], - }, - ], -) -@pytest.mark.parametrize("axis", [0, 1]) -def 
test_dropna_dataframe_np_nan(data, axis): - gdf = cudf.DataFrame(data) - pd_data = { - key: value.to_pandas() if isinstance(value, cudf.Series) else value - for key, value in data.items() - } - pdf = pd.DataFrame(pd_data) - - assert_eq(pdf.dropna(axis=axis), gdf.dropna(axis=axis), check_dtype=False) - - -@pytest.mark.parametrize( - "data, dtype", - [ - ([1, float("nan"), 2], "float64"), - (["x", None, "y"], "str"), - (["x", None, "y"], "category"), - (["2020-01-20", pd.NaT, "2020-03-15"], "datetime64[ns]"), - (["1s", pd.NaT, "3d"], "timedelta64[ns]"), - ], -) -def test_dropna_index(data, dtype): - pi = pd.Index(data, dtype=dtype) - gi = cudf.from_pandas(pi) - - expect = pi.dropna() - got = gi.dropna() - - assert_eq(expect, got) - - -@pytest.mark.parametrize("how", ["all", "any"]) -def test_dropna_multiindex(how): - pi = pd.MultiIndex.from_arrays([[1, None, 2], [None, None, 2]]) - gi = cudf.from_pandas(pi) - - expect = pi.dropna(how) - got = gi.dropna(how) - assert_eq(expect, got) - - -@pytest.mark.parametrize( - "data", - [ - [ - [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-02-01")], - [pd.NaT, pd.NaT, pd.Timestamp("2020-03-01")], - ], - [ - [pd.Timestamp("2020-01-01"), pd.NaT, pd.Timestamp("2020-02-01")], - [np.nan, np.nan, 1.0], - ], - [[1.0, np.nan, 2.0], [np.nan, np.nan, 1.0]], - ], -) -@pytest.mark.parametrize("how", ["all", "any"]) -def test_dropna_multiindex_2(data, how): - pi = pd.MultiIndex.from_arrays(data) - gi = cudf.from_pandas(pi) - - expect = pi.dropna(how) - got = gi.dropna(how) - - assert_eq(expect, got) - - -def test_ignore_index(): - pser = pd.Series([1, 2, np.nan], index=[2, 4, 1]) - gser = cudf.from_pandas(pser) - - result = pser.dropna(ignore_index=True) - expected = gser.dropna(ignore_index=True) - assert_eq(result, expected) diff --git a/python/cudf/cudf/tests/test_extension_compilation.py b/python/cudf/cudf/tests/test_extension_compilation.py index 8f08272ce66..78ffc662e11 100644 --- 
a/python/cudf/cudf/tests/test_extension_compilation.py +++ b/python/cudf/cudf/tests/test_extension_compilation.py @@ -12,7 +12,6 @@ from cudf import NA from cudf.core.udf.api import Masked from cudf.core.udf.masked_typing import MaskedType -from cudf.testing._utils import parametrize_numeric_dtypes_pairwise from cudf.utils._numba import _CUDFNumbaConfig arith_ops = ( @@ -166,20 +165,21 @@ def func(x): @pytest.mark.parametrize("op", ops) -@parametrize_numeric_dtypes_pairwise @pytest.mark.parametrize( "masked", ((False, True), (True, False), (True, True)), ids=("um", "mu", "mm"), ) -def test_compile_arith_masked_ops(op, left_dtype, right_dtype, masked): +def test_compile_arith_masked_ops( + op, numeric_types_as_str, numeric_types_as_str2, masked +): def func(x, y): return op(x, y) cc = (7, 5) - ty1 = from_dtype(np.dtype(left_dtype)) - ty2 = from_dtype(np.dtype(right_dtype)) + ty1 = from_dtype(np.dtype(numeric_types_as_str)) + ty2 = from_dtype(np.dtype(numeric_types_as_str2)) if masked[0]: ty1 = MaskedType(ty1) diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py deleted file mode 100644 index 958a3657abb..00000000000 --- a/python/cudf/cudf/tests/test_udf_masked_ops.py +++ /dev/null @@ -1,1069 +0,0 @@ -# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
-import math -import operator - -import numpy as np -import pytest -from numba import cuda -from numba.core.typing import signature as nb_signature -from numba.core.typing.templates import AbstractTemplate -from numba.cuda.cudadecl import registry as cuda_decl_registry -from numba.cuda.cudaimpl import lower as cuda_lower - -import cudf -from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION -from cudf.core.missing import NA -from cudf.core.udf._ops import ( - arith_ops, - bitwise_ops, - comparison_ops, - unary_ops, -) -from cudf.core.udf.api import Masked -from cudf.core.udf.strings_lowering import ( - cast_string_view_to_managed_udf_string, -) -from cudf.core.udf.strings_typing import ( - StringView, - managed_udf_string, - string_view, -) -from cudf.core.udf.utils import precompiled -from cudf.testing import assert_eq -from cudf.testing._utils import ( - _decimal_series, - parametrize_numeric_dtypes_pairwise, -) - - -def sv_to_managed_udf_str(sv): - """ - Cast a string_view object to a managed_udf_string object - - This placeholder function never runs in python - It exists only for numba to have something to replace - with the typing and lowering code below - - This is similar conceptually to needing a translation - engine to emit an expression in target language "B" when - there is no equivalent in the source language "A" to - translate from. 
This function effectively defines the - expression in language "A" and the associated typing - and lowering describe the translation process, despite - the expression having no meaning in language "A" - """ - pass - - -@cuda_decl_registry.register_global(sv_to_managed_udf_str) -class StringViewToUDFStringDecl(AbstractTemplate): - def generic(self, args, kws): - if isinstance(args[0], StringView) and len(args) == 1: - return nb_signature(managed_udf_string, string_view) - - -@cuda_lower(sv_to_managed_udf_str, string_view) -def sv_to_udf_str_testing_lowering(context, builder, sig, args): - return cast_string_view_to_managed_udf_string( - context, builder, sig.args[0], sig.return_type, args[0] - ) - - -@pytest.fixture(scope="module") -def str_udf_data(): - return cudf.DataFrame( - { - "str_col": [ - "abc", - "ABC", - "AbC", - "123", - "123aBc", - "123@.!", - "", - "rapids ai", - "gpu", - "True", - "False", - "1.234", - ".123a", - "0.013", - "1.0", - "01", - "20010101", - "cudf", - "cuda", - "gpu", - "This Is A Title", - "This is Not a Title", - "Neither is This a Title", - "NoT a TiTlE", - "123 Title Works", - ] - } - ) - - -@pytest.fixture(params=["a", "cu", "2", "gpu", "", " "]) -def substr(request): - return request.param - - -def run_masked_udf_test(func, data, args=(), nullable=True, **kwargs): - gdf = data - pdf = data.to_pandas(nullable=nullable) - - expect = pdf.apply(func, args=args, axis=1) - obtain = gdf.apply(func, args=args, axis=1) - assert_eq(expect, obtain, **kwargs) - - -def run_masked_string_udf_test(func, data, args=(), **kwargs): - gdf = data - pdf = data.to_pandas(nullable=True) - - def row_wrapper(row): - st = row["str_col"] - return func(st) - - expect = pdf.apply(row_wrapper, args=args, axis=1) - - func = cuda.jit(device=True)(func) - obtain = gdf.apply(row_wrapper, args=args, axis=1) - assert_eq(expect, obtain, **kwargs) - - # strings that come directly from input columns are backed by - # MaskedType(string_view) types. 
But new strings that are returned - # from functions or operators are backed by MaskedType(udf_string) - # types. We need to make sure all of our methods work on both kind - # of MaskedType. This function promotes the former to the latter - # prior to running the input function - def udf_string_wrapper(row): - masked_udf_str = Masked( - sv_to_managed_udf_str(row["str_col"].value), row["str_col"].valid - ) - return func(masked_udf_str) - - obtain = gdf.apply(udf_string_wrapper, args=args, axis=1) - assert_eq(expect, obtain, **kwargs) - - -def run_masked_udf_series(func, data, args=(), **kwargs): - gsr = data - psr = data.to_pandas(nullable=True) - - expect = psr.apply(func, args=args) - obtain = gsr.apply(func, args=args) - assert_eq(expect, obtain, **kwargs) - - -@pytest.mark.parametrize("op", arith_ops) -def test_arith_masked_vs_masked(op): - # This test should test all the typing - # and lowering for arithmetic ops between - # two columns - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", bitwise_ops) -def test_bitwise_masked_vs_masked(op): - # This test should test all the typing - # and lowering for bitwise ops between - # two columns - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame( - { - "a": [1, 0, 1, 0, 0b1011, 42, None], - "b": [1, 1, 0, 0, 0b1100, -42, 5], - } - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize( - "dtype_l", - ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"], -) -@pytest.mark.parametrize( - "dtype_r", - [ - "timedelta64[ns]", - "timedelta64[us]", - "timedelta64[ms]", - "timedelta64[s]", - "datetime64[ns]", - "datetime64[ms]", - "datetime64[us]", - "datetime64[s]", - ], -) -@pytest.mark.parametrize("op", [operator.add, operator.sub]) -def 
test_arith_masked_vs_masked_datelike(op, dtype_l, dtype_r): - # Datetime version of the above - # does not test all dtype combinations for now - if "datetime" in dtype_l and "datetime" in dtype_r and op is operator.add: - # don't try adding datetimes to datetimes. - pytest.skip("Adding datetime to datetime is not valid") - - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame( - { - "a": ["2011-01-01", cudf.NA, "2011-03-01", cudf.NA], - "b": [4, 5, cudf.NA, cudf.NA], - } - ) - gdf["a"] = gdf["a"].astype(dtype_l) - gdf["b"] = gdf["b"].astype(dtype_r) - - pdf = gdf.to_pandas() - expect = op(pdf["a"], pdf["b"]) - obtain = gdf.apply(func, axis=1) - assert_eq(expect, obtain, check_dtype=False) - # TODO: After the following pandas issue is - # fixed, uncomment the following line and delete - # through `to_pandas()` statement. - # https://github.com/pandas-dev/pandas/issues/52411 - - # run_masked_udf_test(func, gdf, nullable=False, check_dtype=False) - - -@pytest.mark.parametrize("op", comparison_ops) -def test_compare_masked_vs_masked(op): - # this test should test all the - # typing and lowering for comparisons - # between columns - - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - # we should get: - # [?, ?, , , ] - gdf = cudf.DataFrame( - {"a": [1, 0, None, 1, None], "b": [0, 1, 0, None, None]} - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, True, False]) -@pytest.mark.parametrize("data", [[1, 2, cudf.NA]]) -def test_arith_masked_vs_constant(op, constant, data): - def func(row): - x = row["data"] - return op(x, constant) - - gdf = cudf.DataFrame({"data": data}) - - if constant is False and op in { - operator.mod, - operator.pow, - operator.truediv, - operator.floordiv, - operator.imod, - operator.ipow, - operator.itruediv, - operator.ifloordiv, - }: - # The following tests cases yield undefined behavior: - # - 
truediv(x, False) because its dividing by zero - # - floordiv(x, False) because its dividing by zero - # - mod(x, False) because its mod by zero, - # - pow(x, False) because we have an NA in the series and pandas - # insists that (NA**0 == 1) where we do not - pytest.skip() - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, True, False]) -@pytest.mark.parametrize("data", [[2, 3, cudf.NA], [1, cudf.NA, 1]]) -def test_arith_masked_vs_constant_reflected(request, op, constant, data): - def func(row): - x = row["data"] - return op(constant, x) - - # Just a single column -> result will be all NA - gdf = cudf.DataFrame({"data": data}) - - # cudf differs from pandas for 1**NA - request.applymarker( - pytest.mark.xfail( - condition=(constant == 1 and op in {operator.pow, operator.ipow}), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("data", [[1, cudf.NA, 3], [2, 3, cudf.NA]]) -def test_arith_masked_vs_null(request, op, data): - def func(row): - x = row["data"] - return op(x, NA) - - gdf = cudf.DataFrame({"data": data}) - - # In pandas, 1**NA == 1. 
- request.applymarker( - pytest.mark.xfail( - condition=( - (gdf["data"] == 1).any() - and op in {operator.pow, operator.ipow} - ), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -def test_arith_masked_vs_null_reflected(op): - def func(row): - x = row["data"] - return op(NA, x) - - gdf = cudf.DataFrame({"data": [1, None, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("op", unary_ops) -def test_unary_masked(op): - # This test should test all the typing - # and lowering for unary ops - - def func(row): - x = row["a"] - return op(x) if x is not NA else NA - - if "log" in op.__name__: - gdf = cudf.DataFrame({"a": [0.1, 1.0, None, 3.5, 1e8]}) - elif op.__name__ in {"asin", "acos"}: - gdf = cudf.DataFrame({"a": [0.0, 0.5, None, 1.0]}) - elif op.__name__ in {"atanh"}: - gdf = cudf.DataFrame({"a": [0.0, -0.5, None, 0.8]}) - elif op.__name__ in {"acosh", "sqrt", "lgamma"}: - gdf = cudf.DataFrame({"a": [1.0, 2.0, None, 11.0]}) - elif op.__name__ in {"gamma"}: - gdf = cudf.DataFrame({"a": [0.1, 2, None, 4]}) - elif op.__name__ in {"invert"}: - gdf = cudf.DataFrame({"a": [-100, 128, None, 0]}, dtype="int64") - else: - gdf = cudf.DataFrame({"a": [-125.60, 395.2, 0.0, None]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_masked_is_null_conditional(): - def func(row): - x = row["a"] - y = row["b"] - if x is NA: - return y - else: - return x + y - - gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_contains(): - def func(row): - x = row["a"] - return x in [1, 2] - - gdf = cudf.DataFrame({"a": [1, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -@parametrize_numeric_dtypes_pairwise -@pytest.mark.parametrize("op", [operator.add, operator.and_, operator.eq]) -def test_apply_mixed_dtypes(left_dtype, 
right_dtype, op): - """ - Test that operations can be performed between columns - of different dtypes and return a column with the correct - values and nulls - """ - - # First perform the op on two dummy data on host, if numpy can - # safely type cast, we should expect it to work in udf too. - try: - op(np.dtype(left_dtype).type(0), np.dtype(right_dtype).type(42)) - except TypeError: - pytest.skip("Operation is unsupported for corresponding dtype.") - - def func(row): - x = row["a"] - y = row["b"] - return op(x, y) - - gdf = cudf.DataFrame({"a": [1.5, None, 3, None], "b": [4, 5, None, None]}) - gdf["a"] = gdf["a"].astype(left_dtype) - gdf["b"] = gdf["b"].astype(right_dtype) - - run_masked_udf_test(func, gdf, check_dtype=False) - - -@pytest.mark.parametrize("val", [5, 5.5]) -def test_apply_return_literal(val): - """ - Test unification codepath for scalars and MaskedType - makes sure that numba knows how to cast a scalar value - to a MaskedType - """ - - def func(row): - x = row["a"] - y = row["b"] - if x is not NA and x < 2: - return val - else: - return x + y - - gdf = cudf.DataFrame({"a": [1, None, 3, None], "b": [4, 5, None, None]}) - - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_return_null(): - """ - Tests casting / unification of Masked and NA - """ - - def func(row): - x = row["a"] - if x is NA: - return NA - else: - return x - - gdf = cudf.DataFrame({"a": [1, None, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_return_either_null_or_literal(): - def func(row): - x = row["a"] - if x > 5: - return 2 - else: - return NA - - gdf = cudf.DataFrame({"a": [1, 3, 6]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_return_literal_only(): - def func(x): - return 5 - - gdf = cudf.DataFrame({"a": [1, None, 3]}) - run_masked_udf_test(func, gdf, check_dtype=False) - - -def test_apply_everything(): - def func(row): - w = row["a"] - x = row["b"] - y = row["c"] - z = row["d"] - if x is NA: - 
return w + y - z - elif ((z > y) is not NA) and z > y: - return x - elif ((x + y) is not NA) and x + y == 0: - return z / x - elif x + y is NA: - return 2.5 - elif w > 100: - return ( - math.sin(x) - + math.sqrt(y) - - (-z) - + math.lgamma(x) * math.fabs(-0.8) / math.radians(3.14) - ) - else: - return y > 2 - - gdf = cudf.DataFrame( - { - "a": [1, 3, 6, 0, None, 5, None, 101], - "b": [3.0, 2.5, None, 5.0, 1.0, 5.0, 11.0, 1.0], - "c": [2, 3, 6, 0, None, 5, None, 6], - "d": [4, None, 6, 0, None, 5, None, 7.5], - } - ) - run_masked_udf_test(func, gdf, check_dtype=False) - - -### - - -@pytest.mark.parametrize( - "data,name", - [([1, 2, 3], None), ([1, cudf.NA, 3], None), ([1, 2, 3], "test_name")], -) -def test_series_apply_basic(data, name): - data = cudf.Series(data, name=name) - - def func(x): - return x + 1 - - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/57390", -) -def test_series_apply_null_conditional(): - def func(x): - if x is NA: - return 42 - else: - return x - 1 - - data = cudf.Series([1, cudf.NA, 3]) - - run_masked_udf_series(func, data) - - -### - - -@pytest.mark.parametrize("op", arith_ops) -def test_series_arith_masked_vs_masked(op): - def func(x): - return op(x, x) - - data = cudf.Series([1, cudf.NA, 3]) - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/57390", -) -@pytest.mark.parametrize("op", comparison_ops) -def test_series_compare_masked_vs_masked(op): - """ - In the series case, only one other MaskedType to compare with - - itself - """ - - def func(x): - return op(x, x) - - data = cudf.Series([1, cudf.NA, 3]) - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) -def 
test_series_arith_masked_vs_constant(request, op, constant): - def func(x): - return op(x, constant) - - # Just a single column -> result will be all NA - data = cudf.Series([1, 2, cudf.NA]) - # in pandas, 1**NA == 1. In cudf, 1**NA == NA. - request.applymarker( - pytest.mark.xfail( - condition=( - constant is cudf.NA and op in {operator.pow, operator.ipow} - ), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops) -@pytest.mark.parametrize("constant", [1, 1.5, cudf.NA]) -def test_series_arith_masked_vs_constant_reflected(request, op, constant): - def func(x): - return op(constant, x) - - # Just a single column -> result will be all NA - data = cudf.Series([1, 2, cudf.NA]) - # Using in {1} since bool(NA == 1) raises a TypeError since NA is - # neither truthy nor falsy - # in pandas, 1**NA == 1. In cudf, 1**NA == NA. - request.applymarker( - pytest.mark.xfail( - condition=( - constant in {1} and op in {operator.pow, operator.ipow} - ), - reason="https://github.com/rapidsai/cudf/issues/7478", - ) - ) - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.xfail( - PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION, - reason="https://github.com/pandas-dev/pandas/issues/57390", -) -def test_series_masked_is_null_conditional(): - def func(x): - if x is NA: - return 42 - else: - return x - - data = cudf.Series([1, cudf.NA, 3, cudf.NA]) - - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_lambda_support(op): - func = lambda row: op(row["a"], row["b"]) # noqa: E731 - - data = cudf.DataFrame( - {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} - ) - - run_masked_udf_test(func, data, check_dtype=False) - - -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_nested_function_support(op): - """ - Nested functions need to 
be explicitly jitted by the user - for numba to recognize them. Unfortunately the object - representing the jitted function can not itself be used in - pandas udfs. - """ - - def inner(x, y): - return op(x, y) - - def outer(row): - x = row["a"] - y = row["b"] - return inner(x, y) - - gdf = cudf.DataFrame( - {"a": [1, cudf.NA, 3, cudf.NA], "b": [1, 2, cudf.NA, cudf.NA]} - ) - - with pytest.raises(ValueError): - gdf.apply(outer, axis=1) - - pdf = gdf.to_pandas(nullable=True) - inner_gpu = cuda.jit(device=True)(inner) - - def outer_gpu(row): - x = row["a"] - y = row["b"] - return inner_gpu(x, y) - - got = gdf.apply(outer_gpu, axis=1) - expect = pdf.apply(outer, axis=1) - assert_eq(expect, got, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}, - {"a": [1, 2, 3], "c": [4, 5, 6], "b": [7, 8, 9]}, - {"a": [1, 2, 3], "b": [4, 5, 6], "c": ["a", "b", "c"]}, - ], -) -def test_masked_udf_subset_selection(data): - def func(row): - return row["a"] + row["b"] - - data = cudf.DataFrame(data) - run_masked_udf_test(func, data) - - -@pytest.mark.parametrize( - "unsupported_col", - [ - _decimal_series( - ["1.0", "2.0", "3.0"], dtype=cudf.Decimal64Dtype(2, 1) - ), - cudf.Series([1, 2, 3], dtype="category"), - cudf.interval_range(start=0, end=3), - [[1, 2], [3, 4], [5, 6]], - [{"a": 1}, {"a": 2}, {"a": 3}], - ], -) -def test_masked_udf_unsupported_dtype(unsupported_col): - data = cudf.DataFrame() - data["unsupported_col"] = unsupported_col - - def func(row): - return row["unsupported_col"] - - # check that we fail when an unsupported type is used within a function - with pytest.raises(ValueError): - data.apply(func, axis=1) - - # also check that a DF containing unsupported dtypes can still run a - # function that does NOT involve any of the unsupported dtype columns - data["supported_col"] = 1 - - def other_func(row): - return row["supported_col"] - - expect = cudf.Series(np.ones(len(data))) - got = data.apply(other_func, 
axis=1) - - assert_eq(expect, got, check_dtype=False) - - -# tests for `DataFrame.apply(f, args=(x,y,z))` -# testing the whole space of possibilities is intractable -# these test the most rudimentary guaranteed functionality -@pytest.mark.parametrize( - "data", - [ - {"a": [1, cudf.NA, 3]}, - {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, - {"a": [True, False, cudf.NA]}, - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_scalar_args_binops(data, op): - data = cudf.DataFrame(data) - - def func(row, c): - return op(row["a"], c) - - run_masked_udf_test(func, data, args=(1,), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - {"a": [1, cudf.NA, 3]}, - {"a": [0.5, 2.0, cudf.NA, cudf.NA, 5.0]}, - {"a": [True, False, cudf.NA]}, - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_scalar_args_binops_multiple(data, op): - data = cudf.DataFrame(data) - - def func(row, c, k): - x = op(row["a"], c) - y = op(x, k) - return y - - run_masked_udf_test(func, data, args=(1, 2), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - [1, cudf.NA, 3], - [0.5, 2.0, cudf.NA, cudf.NA, 5.0], - [True, False, cudf.NA], - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_mask_udf_scalar_args_binops_series(data, op): - data = cudf.Series(data) - - def func(x, c): - return x + c - - run_masked_udf_series(func, data, args=(1,), check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - [1, cudf.NA, 3], - [0.5, 2.0, cudf.NA, cudf.NA, 5.0], - [True, False, cudf.NA], - ], -) -@pytest.mark.parametrize("op", arith_ops + comparison_ops) -def test_masked_udf_scalar_args_binops_multiple_series(request, data, op): - data = cudf.Series(data) - request.applymarker( - pytest.mark.xfail( - op in comparison_ops - and PANDAS_VERSION >= PANDAS_CURRENT_SUPPORTED_VERSION - and data.dtype.kind != "b", - reason="https://github.com/pandas-dev/pandas/issues/57390", - ) - ) - - def func(data, 
c, k): - x = op(data, c) - y = op(x, k) - return y - - run_masked_udf_series(func, data, args=(1, 2), check_dtype=False) - - -def test_masked_udf_caching(): - # Make sure similar functions that differ - # by simple things like constants actually - # recompile - - data = cudf.Series([1, 2, 3]) - - expect = data**2 - got = data.apply(lambda x: x**2) - assert_eq(expect, got, check_dtype=False) - - # update the constant value being used and make sure - # it does not result in a cache hit - - expect = data**3 - got = data.apply(lambda x: x**3) - assert_eq(expect, got, check_dtype=False) - - # make sure we get a hit when reapplying - def f(x): - return x + 1 - - precompiled.clear() - assert precompiled.currsize == 0 - data.apply(f) - - assert precompiled.currsize == 1 - data.apply(f) - - assert precompiled.currsize == 1 - - # validate that changing the type of a scalar arg - # results in a miss - precompiled.clear() - - def f(x, c): - return x + c - - data.apply(f, args=(1,)) - assert precompiled.currsize == 1 - - data.apply(f, args=(1.5,)) - assert precompiled.currsize == 2 - - -@pytest.mark.parametrize( - "data", [[1.0, 0.0, 1.5], [1, 0, 2], [True, False, True]] -) -@pytest.mark.parametrize("operator", [float, int, bool]) -def test_masked_udf_casting(operator, data): - data = cudf.Series(data) - - def func(x): - return operator(x) - - run_masked_udf_series(func, data, check_dtype=False) - - -@pytest.mark.parametrize( - "data", - [ - np.array( - [0, 1, -1, 0, np.iinfo("int64").min, np.iinfo("int64").max], - dtype="int64", - ), - np.array([0, 0, 1, np.iinfo("uint64").max], dtype="uint64"), - np.array( - [ - 0, - 0.0, - -1.0, - 1.5, - -1.5, - np.finfo("float64").min, - np.finfo("float64").max, - np.nan, - np.inf, - -np.inf, - ], - dtype="float64", - ), - [False, True, False, cudf.NA], - ], -) -def test_masked_udf_abs(data): - data = cudf.Series(data) - data[0] = cudf.NA - - def func(x): - return abs(x) - - run_masked_udf_series(func, data, check_dtype=False) - - -class 
TestStringUDFs: - def test_string_udf_len(self, str_udf_data): - def func(row): - return len(row["str_col"]) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_startswith(self, str_udf_data, substr): - def func(row): - return row["str_col"].startswith(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_endswith(self, str_udf_data, substr): - def func(row): - return row["str_col"].endswith(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_find(self, str_udf_data, substr): - def func(row): - return row["str_col"].find(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_rfind(self, str_udf_data, substr): - def func(row): - return row["str_col"].rfind(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_contains(self, str_udf_data, substr): - def func(row): - return substr in row["str_col"] - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("other", ["cudf", "123", "", " "]) - @pytest.mark.parametrize("cmpop", comparison_ops) - def test_string_udf_cmpops(self, str_udf_data, other, cmpop): - def func(row): - return cmpop(row["str_col"], other) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isalnum(self, str_udf_data): - def func(row): - return row["str_col"].isalnum() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isalpha(self, str_udf_data): - def func(row): - return row["str_col"].isalpha() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isdigit(self, str_udf_data): - def func(row): - return row["str_col"].isdigit() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isdecimal(self, str_udf_data): - def func(row): - return row["str_col"].isdecimal() - - 
run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isupper(self, str_udf_data): - def func(row): - return row["str_col"].isupper() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_islower(self, str_udf_data): - def func(row): - return row["str_col"].islower() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_isspace(self, str_udf_data): - def func(row): - return row["str_col"].isspace() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_istitle(self, str_udf_data): - def func(row): - return row["str_col"].istitle() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_count(self, str_udf_data, substr): - def func(row): - return row["str_col"].count(substr) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.xfail(reason="Identity function not supported.") - def test_string_udf_return_string(self, str_udf_data): - def func(row): - return row["str_col"] - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) - def test_string_udf_strip(self, str_udf_data, strip_char): - def func(row): - return row["str_col"].strip(strip_char) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) - def test_string_udf_lstrip(self, str_udf_data, strip_char): - def func(row): - return row["str_col"].lstrip(strip_char) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("strip_char", ["1", "a", "12", " ", "", ".", "@"]) - def test_string_udf_rstrip(self, str_udf_data, strip_char): - def func(row): - return row["str_col"].rstrip(strip_char) - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_upper(self, str_udf_data): - def func(row): - 
return row["str_col"].upper() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - def test_string_udf_lower(self, str_udf_data): - def func(row): - return row["str_col"].lower() - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize( - "concat_char", ["1", "a", "12", " ", "", ".", "@"] - ) - def test_string_udf_concat(self, str_udf_data, concat_char): - def func(row): - return row["str_col"] + concat_char - - run_masked_udf_test(func, str_udf_data, check_dtype=False) - - @pytest.mark.parametrize("to_replace", ["a", "1", "", "@"]) - @pytest.mark.parametrize("replacement", ["a", "1", "", "@"]) - def test_string_udf_replace(self, str_udf_data, to_replace, replacement): - def func(row): - return row["str_col"].replace(to_replace, replacement) - - run_masked_udf_test(func, str_udf_data, check_dtype=False)