Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions cpp/src/arrow/compute/kernels/codegen_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1058,6 +1058,22 @@ ArrayKernelExec GenerateTypeAgnosticPrimitive(detail::GetTypeId get_id) {
}
}

// Variable-width counterpart of GenerateTypeAgnosticPrimitive: picks one
// Generator instantiation per offset width. BINARY and STRING share the
// BinaryType instantiation, LARGE_BINARY and LARGE_STRING share the
// LargeBinaryType one. Any other type id trips a DCHECK and yields ExecFail.
template <template <typename...> class Generator>
ArrayKernelExec GenerateTypeAgnosticVarBinaryBase(detail::GetTypeId get_id) {
  const auto type_id = get_id.id;

  if (type_id == Type::BINARY || type_id == Type::STRING) {
    return Generator<BinaryType>::Exec;
  }
  if (type_id == Type::LARGE_BINARY || type_id == Type::LARGE_STRING) {
    return Generator<LargeBinaryType>::Exec;
  }

  // Not a base binary type: this is a kernel registration error.
  DCHECK(false);
  return ExecFail;
}

// Generate a kernel given a templated functor for base binary types. Generates
// a single kernel for binary/string and large binary / large string. If your
// kernel implementation needs access to the specific type at compile time,
Expand Down
63 changes: 63 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_fill_null.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ namespace {
template <typename Type, typename Enable = void>
struct FillNullFunctor {};

// Numeric inputs

template <typename Type>
struct FillNullFunctor<Type, enable_if_t<is_number_type<Type>::value>> {
using T = typename TypeTraits<Type>::CType;
Expand Down Expand Up @@ -84,6 +86,8 @@ struct FillNullFunctor<Type, enable_if_t<is_number_type<Type>::value>> {
}
};

// Boolean input

template <typename Type>
struct FillNullFunctor<Type, enable_if_t<is_boolean_type<Type>::value>> {
static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
Expand Down Expand Up @@ -131,6 +135,8 @@ struct FillNullFunctor<Type, enable_if_t<is_boolean_type<Type>::value>> {
}
};

// Null input

template <typename Type>
struct FillNullFunctor<Type, enable_if_t<is_null_type<Type>::value>> {
static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
Expand All @@ -139,6 +145,53 @@ struct FillNullFunctor<Type, enable_if_t<is_null_type<Type>::value>> {
}
};

// Binary-like input

// FillNull implementation for base binary types.
//
// batch[0] is the input array and batch[1] the fill value scalar. When the
// input has nulls and the fill value is valid, a new array is built in which
// every null slot holds the fill value; otherwise the input is passed through
// unchanged (a null fill value therefore leaves nulls in place).
template <typename Type>
struct FillNullFunctor<Type, enable_if_t<is_base_binary_type<Type>::value>> {
  using BuilderType = typename TypeTraits<Type>::BuilderType;

  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
    const ArrayData& input = *batch[0].array();
    const auto& fill_value_scalar =
        checked_cast<const BaseBinaryScalar&>(*batch[1].scalar());
    ArrayData* output = out->mutable_array();

    // Ensure the kernel is configured properly to have no validity bitmap /
    // null count 0 unless we explicitly propagate it below.
    DCHECK(output->buffers[0] == nullptr);

    const int64_t null_count = input.GetNullCount();

    if (null_count > 0 && fill_value_scalar.is_valid) {
      // Only touch the scalar's value buffer once the scalar is known to be
      // valid: a null scalar may not carry a buffer at all, so dereferencing
      // it unconditionally would be undefined behavior.
      util::string_view fill_value(*fill_value_scalar.value);

      BuilderType builder(input.type, ctx->memory_pool());
      // Reserve room for the existing value bytes plus one copy of the fill
      // value per null slot, so the UnsafeAppend calls below cannot overflow.
      KERNEL_RETURN_IF_ERROR(ctx, builder.ReserveData(input.buffers[2]->size() +
                                                      fill_value.length() * null_count));
      KERNEL_RETURN_IF_ERROR(ctx, builder.Resize(input.length));

      KERNEL_RETURN_IF_ERROR(ctx, VisitArrayDataInline<Type>(
                                      input,
                                      // Valid slot: copy the value through.
                                      [&](util::string_view s) {
                                        builder.UnsafeAppend(s);
                                        return Status::OK();
                                      },
                                      // Null slot: substitute the fill value.
                                      [&]() {
                                        builder.UnsafeAppend(fill_value);
                                        return Status::OK();
                                      }));
      std::shared_ptr<Array> string_array;
      KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_array));
      *output = *string_array->data();
      // The builder does not match the logical type, due to
      // GenerateTypeAgnosticVarBinaryBase
      output->type = input.type;
    } else {
      // Nothing to fill (no nulls, or a null fill value): reuse the input.
      *output = input;
    }
  }
};

void AddBasicFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
auto AddKernels = [&](const std::vector<std::shared_ptr<DataType>>& types) {
for (const std::shared_ptr<DataType>& ty : types) {
Expand All @@ -153,6 +206,15 @@ void AddBasicFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
AddKernels({boolean(), null()});
}

// Registers one fill_null kernel on `func` for each type returned by
// BaseBinaryTypes(). Each kernel takes (Array<T>, Scalar<T>) and outputs T;
// `kernel` is a by-value template whose signature and exec are overwritten
// for every registered type.
void AddBinaryFillNullKernels(ScalarKernel kernel, ScalarFunction* func) {
  for (const auto& value_type : BaseBinaryTypes()) {
    const InputType values_arg = InputType::Array(value_type);
    const InputType fill_arg = InputType::Scalar(value_type);

    kernel.signature = KernelSignature::Make({values_arg, fill_arg}, value_type);
    kernel.exec = GenerateTypeAgnosticVarBinaryBase<FillNullFunctor>(*value_type);
    DCHECK_OK(func->AddKernel(kernel));
  }
}

const FunctionDoc fill_null_doc{
"Replace null elements",
("`fill_value` must be a scalar of the same type as `values`.\n"
Expand All @@ -170,6 +232,7 @@ void RegisterScalarFillNull(FunctionRegistry* registry) {
auto fill_null =
std::make_shared<ScalarFunction>("fill_null", Arity::Binary(), &fill_null_doc);
AddBasicFillNullKernels(fill_null_base, fill_null.get());
AddBinaryFillNullKernels(fill_null_base, fill_null.get());
DCHECK_OK(registry->AddFunction(fill_null));
}
}
Expand Down
10 changes: 10 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_fill_null_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -154,5 +154,15 @@ TEST_F(TestFillNullKernel, FillNullTimeStamp) {
CheckFillNull(time64_type, "[2, 1, 6, null]", Datum(scalar2), "[2, 1, 6, 6]");
}

// fill_null on a large_utf8 array with a valid "arrow" fill value.
TEST_F(TestFillNullKernel, FillNullString) {
  auto large_string_type = large_utf8();
  auto fill_scalar = std::make_shared<LargeStringScalar>("arrow");

  // An input without nulls must come back unchanged.
  const char* no_nulls_json = R"(["foo", "bar"])";
  CheckFillNull(large_string_type, no_nulls_json, Datum(fill_scalar), no_nulls_json);

  // A trailing null must be replaced by the fill value.
  const char* some_nulls_json = R"(["foo", "bar", null])";
  const char* filled_json = R"(["foo", "bar", "arrow"])";
  CheckFillNull(large_string_type, some_nulls_json, Datum(fill_scalar), filled_json);
}

} // namespace compute
} // namespace arrow
2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/scalar_string.cc
Original file line number Diff line number Diff line change
Expand Up @@ -947,7 +947,7 @@ struct SplitBaseTransform {
if (input.is_valid) {
result->is_valid = true;
BuilderType builder(input.type, ctx->memory_pool());
util::string_view s = static_cast<util::string_view>(*input.value);
util::string_view s(*input.value);
KERNEL_RETURN_IF_ERROR(ctx, Split(s, &builder));
KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&result->value));
}
Expand Down
22 changes: 11 additions & 11 deletions docs/source/cpp/compute.rst
Original file line number Diff line number Diff line change
Expand Up @@ -436,17 +436,17 @@ Structural transforms

.. XXX (this category is a bit of a hodgepodge)

+--------------------------+------------+---------------------------------------+---------------------+---------+
| Function name | Arity | Input types | Output type | Notes |
+==========================+============+=======================================+=====================+=========+
| fill_null | Binary | Boolean, Null, Numeric, Temporal | Boolean | \(1) |
+--------------------------+------------+---------------------------------------+---------------------+---------+
| is_null | Unary | Any | Boolean | \(2) |
+--------------------------+------------+---------------------------------------+---------------------+---------+
| is_valid | Unary | Any | Boolean | \(2) |
+--------------------------+------------+---------------------------------------+---------------------+---------+
| list_value_length | Unary | List-like | Int32 or Int64 | \(4) |
+--------------------------+------------+---------------------------------------+---------------------+---------+
+--------------------------+------------+------------------------------------------------+---------------------+---------+
| Function name | Arity | Input types | Output type | Notes |
+==========================+============+================================================+=====================+=========+
| fill_null | Binary | Boolean, Null, Numeric, Temporal, String-like | Input type | \(1) |
+--------------------------+------------+------------------------------------------------+---------------------+---------+
| is_null | Unary | Any | Boolean | \(2) |
+--------------------------+------------+------------------------------------------------+---------------------+---------+
| is_valid | Unary | Any | Boolean | \(2) |
+--------------------------+------------+------------------------------------------------+---------------------+---------+
| list_value_length | Unary | List-like | Int32 or Int64 | \(4) |
+--------------------------+------------+------------------------------------------------+---------------------+---------+

* \(1) First input must be an array, second input a scalar of the same type.
Output is an array of the same type as the inputs, and with the same values
Expand Down
10 changes: 10 additions & 0 deletions python/pyarrow/tests/test_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,16 @@ def test_fill_null():
expected = pa.array([None, None, None, None])
assert result.equals(expected)

arr = pa.array(['a', 'bb', None])
result = arr.fill_null('ccc')
expected = pa.array(['a', 'bb', 'ccc'])
assert result.equals(expected)

arr = pa.array([b'a', b'bb', None], type=pa.large_binary())
result = arr.fill_null('ccc')
expected = pa.array([b'a', b'bb', b'ccc'], type=pa.large_binary())
assert result.equals(expected)


@pytest.mark.parametrize('arrow_type', numerical_arrow_types)
def test_fill_null_array(arrow_type):
Expand Down